From 23c832b10ca9ab2685d7d3e0990800ffc846fc92 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 12 Oct 2016 19:23:50 -0400 Subject: remove spd_release_page() no users left Signed-off-by: Al Viro --- include/linux/splice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/splice.h b/include/linux/splice.h index 00a21166e268..647243bdd9d7 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -82,7 +82,6 @@ extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, */ extern int splice_grow_spd(const struct pipe_inode_info *, struct splice_pipe_desc *); extern void splice_shrink_spd(struct splice_pipe_desc *); -extern void spd_release_page(struct splice_pipe_desc *, unsigned int); extern const struct pipe_buf_operations page_cache_pipe_buf_ops; extern const struct pipe_buf_operations default_pipe_buf_ops; -- cgit v1.2.3 From f81dc7d7d5a2528f98f26a0b9406e822d0b35011 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 31 Oct 2016 16:47:15 -0400 Subject: splice_pipe_desc: kill ->flags no users left Signed-off-by: Al Viro --- include/linux/splice.h | 1 - kernel/relay.c | 1 - kernel/trace/trace.c | 2 -- net/core/skbuff.c | 1 - 4 files changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/splice.h b/include/linux/splice.h index 647243bdd9d7..3c98dad93bf3 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -55,7 +55,6 @@ struct splice_pipe_desc { struct partial_page *partial; /* pages[] may not be contig */ int nr_pages; /* number of populated pages in map */ unsigned int nr_pages_max; /* pages[] & partial[] arrays size */ - unsigned int flags; /* splice flags */ const struct pipe_buf_operations *ops;/* ops associated with output pipe */ void (*spd_release)(struct splice_pipe_desc *, unsigned int); }; diff --git a/kernel/relay.c b/kernel/relay.c index 8f18d314a96a..9b48284eac56 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1212,7 +1212,6 @@ static ssize_t subbuf_splice_actor(struct file *in, .nr_pages = 0, .nr_pages_max = PIPE_DEF_BUFFERS, .partial = partial, - .flags = flags, .ops = &relay_pipe_buf_ops, .spd_release = relay_page_release, }; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d7449783987a..77c2d9bcb40f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5536,7 +5536,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ .nr_pages_max = PIPE_DEF_BUFFERS, - .flags = flags, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; @@ -6434,7 +6433,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .pages = pages_def, .partial = partial_def, .nr_pages_max = PIPE_DEF_BUFFERS, - .flags = flags, .ops = &buffer_pipe_buf_ops, .spd_release = buffer_spd_release, }; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5a03730fbc1a..0835ac93a4b1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1976,7 +1976,6 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, .pages = pages, .partial = partial, .nr_pages_max = MAX_SKB_FRAGS, - .flags = flags, .ops = &nosteal_pipe_buf_ops, .spd_release = sock_spd_release, }; -- cgit v1.2.3 From 3d6ea290f337cc64cf44290482e36306fc8aaa31 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 10 Dec 2016 13:17:32 -0500 Subject: splice/tee/vmsplice: validate flags Long overdue... 
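For illustration, a minimal userspace sketch of the new behavior (a hypothetical test program, not part of this patch): passing an undefined flag bit to splice(2) now fails up front with EINVAL instead of the unknown bit being silently ignored.

    #define _GNU_SOURCE
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int pfd[2];

            if (pipe(pfd))
                    return 1;
            /* 0x80 is outside SPLICE_F_ALL, so the kernel rejects it */
            if (splice(STDIN_FILENO, NULL, pfd[1], NULL, 4096, 0x80) < 0 &&
                errno == EINVAL)
                    printf("unknown splice flag rejected with EINVAL\n");
            return 0;
    }

The same mask check guards vmsplice(2) and tee(2), as the hunks below show.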
Signed-off-by: Al Viro --- fs/splice.c | 8 ++++++++ include/linux/splice.h | 2 ++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/fs/splice.c b/fs/splice.c index bf17a92e26c3..1af65632c371 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1351,6 +1351,8 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, struct fd f; long error; + if (unlikely(flags & ~SPLICE_F_ALL)) + return -EINVAL; if (unlikely(nr_segs > UIO_MAXIOV)) return -EINVAL; else if (unlikely(!nr_segs)) @@ -1401,6 +1403,9 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, if (unlikely(!len)) return 0; + if (unlikely(flags & ~SPLICE_F_ALL)) + return -EINVAL; + error = -EBADF; in = fdget(fd_in); if (in.file) { @@ -1729,6 +1734,9 @@ SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) struct fd in; int error; + if (unlikely(flags & ~SPLICE_F_ALL)) + return -EINVAL; + if (unlikely(!len)) return 0; diff --git a/include/linux/splice.h b/include/linux/splice.h index 3c98dad93bf3..db42746bdfea 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -20,6 +20,8 @@ #define SPLICE_F_MORE (0x04) /* expect more data */ #define SPLICE_F_GIFT (0x08) /* pages passed in are a gift */ +#define SPLICE_F_ALL (SPLICE_F_MOVE|SPLICE_F_NONBLOCK|SPLICE_F_MORE|SPLICE_F_GIFT) + /* * Passed to the actors */ -- cgit v1.2.3 From 2955b73def6712b693fc7ad82b34b3831faaa146 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 21 Feb 2017 09:30:00 +0000 Subject: dma-buf/reservation: Wrap ww_mutex_trylock In a similar fashion to reservation_object_lock() and reservation_object_unlock(), ww_mutex_trylock is also useful and so is worth wrapping for consistency. Signed-off-by: Chris Wilson Cc: Sumit Semwal Cc: Joonas Lahtinen Cc: Daniel Vetter [danvet: Add __must_check Joonas wants.] Reviewed-by: Joonas Lahtinen Signed-off-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/20170221093000.22802-1-chris@chris-wilson.co.uk --- include/linux/reservation.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/reservation.h b/include/linux/reservation.h index 2b5a4679daea..156cfd330b66 100644 --- a/include/linux/reservation.h +++ b/include/linux/reservation.h @@ -166,6 +166,26 @@ reservation_object_lock(struct reservation_object *obj, return ww_mutex_lock(&obj->lock, ctx); } +/** + * reservation_object_trylock - trylock the reservation object + * @obj: the reservation object + * + * Tries to lock the reservation object for exclusive access and modification. + * Note, that the lock is only against other writers, readers will run + * concurrently with a writer under RCU. The seqlock is used to notify readers + * if they overlap with a writer. + * + * Also note that since no context is provided, no deadlock protection is + * possible. + * + * Returns true if the lock was acquired, false otherwise. + */ +static inline bool __must_check +reservation_object_trylock(struct reservation_object *obj) +{ + return ww_mutex_trylock(&obj->lock); +} + /** * reservation_object_unlock - unlock the reservation object * @obj: the reservation object -- cgit v1.2.3 From 38b0b219fbe89d824213beabf03bfb00b5d2c8fa Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Fri, 24 Feb 2017 17:14:33 +0100 Subject: of: add devm_ functions for populate and depopulate Lots of calls to of_platform_populate() are not balanced by a call to of_platform_depopulate(). This creates issues when drivers are bound/unbound.
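For illustration, the leak pattern looks like this in a hypothetical driver (names invented, not from this patch); see the devm_ variant introduced below for the fix:

    static int foo_probe(struct platform_device *pdev)
    {
            /* creates child platform devices from device tree data */
            return of_platform_populate(pdev->dev.of_node, NULL, NULL,
                                        &pdev->dev);
    }

    static int foo_remove(struct platform_device *pdev)
    {
            /* missing of_platform_depopulate(&pdev->dev): the children
             * created in probe() leak across unbind/bind cycles */
            return 0;
    }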
One way to solve these issues is to add devm_of_platform_populate(), which will call of_platform_depopulate() when the device is unbound from the bus. Signed-off-by: Benjamin Gaignard Acked-by: Rob Herring Signed-off-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/1487952874-23635-2-git-send-email-benjamin.gaignard@linaro.org --- drivers/of/platform.c | 71 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/of_platform.h | 11 +++++++ 2 files changed, 82 insertions(+) (limited to 'include/linux') diff --git a/drivers/of/platform.c b/drivers/of/platform.c index b8064bc2b6eb..f5bbb500ab80 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -571,6 +571,77 @@ void of_platform_depopulate(struct device *parent) } EXPORT_SYMBOL_GPL(of_platform_depopulate); +static void devm_of_platform_populate_release(struct device *dev, void *res) +{ + of_platform_depopulate(*(struct device **)res); +} + +/** + * devm_of_platform_populate() - Populate platform_devices from device tree data + * @dev: device that requested to populate from device tree data + * + * Similar to of_platform_populate(), but will automatically call + * of_platform_depopulate() when the device is unbound from the bus. + * + * Returns 0 on success, < 0 on failure. + */ +int devm_of_platform_populate(struct device *dev) +{ + struct device **ptr; + int ret; + + if (!dev) + return -EINVAL; + + ptr = devres_alloc(devm_of_platform_populate_release, + sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return -ENOMEM; + + ret = of_platform_populate(dev->of_node, NULL, NULL, dev); + if (ret) { + devres_free(ptr); + } else { + *ptr = dev; + devres_add(dev, ptr); + } + + return ret; +} +EXPORT_SYMBOL_GPL(devm_of_platform_populate); + +static int devm_of_platform_match(struct device *dev, void *res, void *data) +{ + struct device **ptr = res; + + if (!ptr) { + WARN_ON(!ptr); + return 0; + } + + return *ptr == data; +} + +/** + * devm_of_platform_depopulate() - Remove devices populated from device tree + * @dev: device that requested to depopulate from device tree data + * + * Complementary to devm_of_platform_populate(), this function removes children + * of the given device (and, recurrently, their children) that have been + * created from their respective device tree nodes (and only those, + * leaving others - eg. manually created - unharmed).
+ */ +void devm_of_platform_depopulate(struct device *dev) +{ + int ret; + + ret = devres_release(dev, devm_of_platform_populate_release, + devm_of_platform_match, dev); + + WARN_ON(ret); +} +EXPORT_SYMBOL_GPL(devm_of_platform_depopulate); + #ifdef CONFIG_OF_DYNAMIC static int of_platform_notify(struct notifier_block *nb, unsigned long action, void *arg) diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index 956a1006aefc..dc8224ae28d5 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -76,6 +76,10 @@ extern int of_platform_default_populate(struct device_node *root, const struct of_dev_auxdata *lookup, struct device *parent); extern void of_platform_depopulate(struct device *parent); + +extern int devm_of_platform_populate(struct device *dev); + +extern void devm_of_platform_depopulate(struct device *dev); #else static inline int of_platform_populate(struct device_node *root, const struct of_device_id *matches, @@ -91,6 +95,13 @@ static inline int of_platform_default_populate(struct device_node *root, return -ENODEV; } static inline void of_platform_depopulate(struct device *parent) { } + +static inline int devm_of_platform_populate(struct device *dev) +{ + return -ENODEV; +} + +static inline void devm_of_platform_depopulate(struct device *dev) { } #endif #if defined(CONFIG_OF_DYNAMIC) && defined(CONFIG_OF_ADDRESS) -- cgit v1.2.3 From 90ec5e89e393c76e19afc845d8f88a5dc8315919 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Wed, 22 Feb 2017 19:23:37 +0530 Subject: kretprobes: Ensure probe location is at function entry kretprobes can be registered by specifying an absolute address or by specifying an offset from a symbol. However, we need to ensure this falls at function entry so as to be able to determine the return address. Validate the same during kretprobe registration. By default, there should not be any offset from a function entry, as determined through a kallsyms_lookup(). Introduce arch_function_offset_within_entry() as a way for architectures to override this. Signed-off-by: Naveen N. Rao
Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Michael Ellerman Cc: Steven Rostedt Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/f1583bc4839a3862cfc2acefcc56f9c8837fa2ba.1487770934.git.naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/kprobes.h | 1 + kernel/kprobes.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index c328e4f7dcad..177bdf6c6aeb 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -267,6 +267,7 @@ extern int arch_init_kprobes(void); extern void show_registers(struct pt_regs *regs); extern void kprobes_inc_nmissed_count(struct kprobe *p); extern bool arch_within_kprobe_blacklist(unsigned long addr); +extern bool arch_function_offset_within_entry(unsigned long offset); extern bool within_kprobe_blacklist(unsigned long addr); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 699c5bc51a92..448759d4a263 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1875,12 +1875,25 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) } NOKPROBE_SYMBOL(pre_handler_kretprobe); +bool __weak arch_function_offset_within_entry(unsigned long offset) +{ + return !offset; +} + int register_kretprobe(struct kretprobe *rp) { int ret = 0; struct kretprobe_instance *inst; int i; void *addr; + unsigned long offset; + + addr = kprobe_addr(&rp->kp); + if (!kallsyms_lookup_size_offset((unsigned long)addr, NULL, &offset)) + return -EINVAL; + + if (!arch_function_offset_within_entry(offset)) + return -EINVAL; if (kretprobe_blacklist_size) { addr = kprobe_addr(&rp->kp); -- cgit v1.2.3 From 59d0f2da35693bfbcf6ffb014213cb8e225c8928 Mon Sep 17 00:00:00 2001 From: Song Hongyan Date: Fri, 3 Mar 2017 21:44:32 +0800 Subject: iio: hid: Add temperature sensor support The environmental temperature sensor is a HID-defined sensor that measures temperature. More information can be found in: http://www.usb.org/developers/hidpage/HUTRR39b.pdf According to the IIO ABI definition, IIO_TEMP data output is in milli degrees Celsius. Add the unit conversion from degrees to milli degrees.
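Concretely, the conversion is a fixed factor of 1000 between the HID report unit (degrees Celsius) and the IIO output unit (milli degrees Celsius), expressed through the unit table entries added below. A sketch of the arithmetic (helper name invented for illustration, not a function from this patch):

    /* Illustrative only; the driver applies this via its scale table. */
    static inline s32 degrees_to_iio_millidegrees(s32 deg_celsius)
    {
            return deg_celsius * 1000;      /* e.g. 25 C -> 25000 */
    }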
Signed-off-by: Song Hongyan Acked-by: Srinivas Pandruvada Signed-off-by: Jonathan Cameron --- .../iio/common/hid-sensors/hid-sensor-attributes.c | 3 + drivers/iio/temperature/Kconfig | 14 + drivers/iio/temperature/Makefile | 1 + drivers/iio/temperature/hid-sensor-temperature.c | 311 +++++++++++++++++++++ include/linux/hid-sensor-ids.h | 4 + 5 files changed, 333 insertions(+) create mode 100644 drivers/iio/temperature/hid-sensor-temperature.c (limited to 'include/linux') diff --git a/drivers/iio/common/hid-sensors/hid-sensor-attributes.c b/drivers/iio/common/hid-sensors/hid-sensor-attributes.c index 7afdac42ed42..9338f94ce1dc 100644 --- a/drivers/iio/common/hid-sensors/hid-sensor-attributes.c +++ b/drivers/iio/common/hid-sensors/hid-sensor-attributes.c @@ -62,6 +62,9 @@ static struct { {HID_USAGE_SENSOR_TIME_TIMESTAMP, 0, 1000000000, 0}, {HID_USAGE_SENSOR_TIME_TIMESTAMP, HID_USAGE_SENSOR_UNITS_MILLISECOND, 1000000, 0}, + + {HID_USAGE_SENSOR_TEMPERATURE, 0, 1000, 0}, + {HID_USAGE_SENSOR_TEMPERATURE, HID_USAGE_SENSOR_UNITS_DEGREES, 1000, 0}, }; static int pow_10(unsigned power) diff --git a/drivers/iio/temperature/Kconfig b/drivers/iio/temperature/Kconfig index 3089e8d0a32d..5378976d6d27 100644 --- a/drivers/iio/temperature/Kconfig +++ b/drivers/iio/temperature/Kconfig @@ -19,6 +19,20 @@ config MAXIM_THERMOCOUPLE This driver can also be built as a module. If so, the module will be called maxim_thermocouple. +config HID_SENSOR_TEMP + tristate "HID Environmental temperature sensor" + depends on HID_SENSOR_HUB + select IIO_BUFFER + select IIO_TRIGGERED_BUFFER + select HID_SENSOR_IIO_COMMON + select HID_SENSOR_IIO_TRIGGER + help + Say yes here to build support for the HID SENSOR + temperature driver + + To compile this driver as a module, choose M here: the module + will be called hid-sensor-temperature. + config MLX90614 tristate "MLX90614 contact-less infrared sensor" depends on I2C diff --git a/drivers/iio/temperature/Makefile b/drivers/iio/temperature/Makefile index 4c4377480726..ad1d668de546 100644 --- a/drivers/iio/temperature/Makefile +++ b/drivers/iio/temperature/Makefile @@ -2,6 +2,7 @@ # Makefile for industrial I/O temperature drivers # +obj-$(CONFIG_HID_SENSOR_TEMP) += hid-sensor-temperature.o obj-$(CONFIG_MAXIM_THERMOCOUPLE) += maxim_thermocouple.o obj-$(CONFIG_MLX90614) += mlx90614.o obj-$(CONFIG_TMP006) += tmp006.o diff --git a/drivers/iio/temperature/hid-sensor-temperature.c b/drivers/iio/temperature/hid-sensor-temperature.c new file mode 100644 index 000000000000..c01efeca4002 --- /dev/null +++ b/drivers/iio/temperature/hid-sensor-temperature.c @@ -0,0 +1,311 @@ +/* + * HID Sensors Driver + * Copyright (c) 2017, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../common/hid-sensors/hid-sensor-trigger.h" + +struct temperature_state { + struct hid_sensor_common common_attributes; + struct hid_sensor_hub_attribute_info temperature_attr; + s32 temperature_data; + int scale_pre_decml; + int scale_post_decml; + int scale_precision; + int value_offset; +}; + +/* Channel definitions */ +static const struct iio_chan_spec temperature_channels[] = { + { + .type = IIO_TEMP, + .info_mask_separate = BIT(IIO_CHAN_INFO_RAW), + .info_mask_shared_by_type = BIT(IIO_CHAN_INFO_OFFSET) | + BIT(IIO_CHAN_INFO_SCALE) | + BIT(IIO_CHAN_INFO_SAMP_FREQ) | + BIT(IIO_CHAN_INFO_HYSTERESIS), + }, + IIO_CHAN_SOFT_TIMESTAMP(3), +}; + +/* Adjust channel real bits based on report descriptor */ +static void temperature_adjust_channel_bit_mask(struct iio_chan_spec *channels, + int channel, int size) +{ + channels[channel].scan_type.sign = 's'; + /* Real storage bits will change based on the report desc. */ + channels[channel].scan_type.realbits = size * 8; + /* Maximum size of a sample to capture is s32 */ + channels[channel].scan_type.storagebits = sizeof(s32) * 8; +} + +static int temperature_read_raw(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, + int *val, int *val2, long mask) +{ + struct temperature_state *temp_st = iio_priv(indio_dev); + + switch (mask) { + case IIO_CHAN_INFO_RAW: + if (chan->type != IIO_TEMP) + return -EINVAL; + hid_sensor_power_state( + &temp_st->common_attributes, true); + *val = sensor_hub_input_attr_get_raw_value( + temp_st->common_attributes.hsdev, + HID_USAGE_SENSOR_TEMPERATURE, + HID_USAGE_SENSOR_DATA_ENVIRONMENTAL_TEMPERATURE, + temp_st->temperature_attr.report_id, + SENSOR_HUB_SYNC); + hid_sensor_power_state( + &temp_st->common_attributes, + false); + + return IIO_VAL_INT; + + case IIO_CHAN_INFO_SCALE: + *val = temp_st->scale_pre_decml; + *val2 = temp_st->scale_post_decml; + return temp_st->scale_precision; + + case IIO_CHAN_INFO_OFFSET: + *val = temp_st->value_offset; + return IIO_VAL_INT; + + case IIO_CHAN_INFO_SAMP_FREQ: + return hid_sensor_read_samp_freq_value( + &temp_st->common_attributes, val, val2); + + case IIO_CHAN_INFO_HYSTERESIS: + return hid_sensor_read_raw_hyst_value( + &temp_st->common_attributes, val, val2); + default: + return -EINVAL; + } +} + +static int temperature_write_raw(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, + int val, int val2, long mask) +{ + struct temperature_state *temp_st = iio_priv(indio_dev); + + switch (mask) { + case IIO_CHAN_INFO_SAMP_FREQ: + return hid_sensor_write_samp_freq_value( + &temp_st->common_attributes, val, val2); + case IIO_CHAN_INFO_HYSTERESIS: + return hid_sensor_write_raw_hyst_value( + &temp_st->common_attributes, val, val2); + default: + return -EINVAL; + } +} + +static const struct iio_info temperature_info = { + .driver_module = THIS_MODULE, + .read_raw = &temperature_read_raw, + .write_raw = &temperature_write_raw, +}; + +/* Callback handler to send event after all samples are received and captured */ +static int temperature_proc_event(struct hid_sensor_hub_device *hsdev, + unsigned int usage_id, void *pdev) +{ + struct iio_dev *indio_dev = platform_get_drvdata(pdev); + struct temperature_state *temp_st = iio_priv(indio_dev); + + if (atomic_read(&temp_st->common_attributes.data_ready)) + iio_push_to_buffers_with_timestamp(indio_dev, + &temp_st->temperature_data, + iio_get_time_ns(indio_dev)); + + return 0; +} + +/* Capture samples in local storage */ +static 
int temperature_capture_sample(struct hid_sensor_hub_device *hsdev, + unsigned int usage_id, size_t raw_len, + char *raw_data, void *pdev) +{ + struct iio_dev *indio_dev = platform_get_drvdata(pdev); + struct temperature_state *temp_st = iio_priv(indio_dev); + + switch (usage_id) { + case HID_USAGE_SENSOR_DATA_ENVIRONMENTAL_TEMPERATURE: + temp_st->temperature_data = *(s32 *)raw_data; + return 0; + default: + return -EINVAL; + } +} + +/* Parse report which is specific to an usage id*/ +static int temperature_parse_report(struct platform_device *pdev, + struct hid_sensor_hub_device *hsdev, + struct iio_chan_spec *channels, + unsigned int usage_id, + struct temperature_state *st) +{ + int ret; + + ret = sensor_hub_input_get_attribute_info(hsdev, HID_INPUT_REPORT, + usage_id, + HID_USAGE_SENSOR_DATA_ENVIRONMENTAL_TEMPERATURE, + &st->temperature_attr); + if (ret < 0) + return ret; + + temperature_adjust_channel_bit_mask(channels, 0, + st->temperature_attr.size); + + st->scale_precision = hid_sensor_format_scale( + HID_USAGE_SENSOR_TEMPERATURE, + &st->temperature_attr, + &st->scale_pre_decml, &st->scale_post_decml); + + /* Set Sensitivity field ids, when there is no individual modifier */ + if (st->common_attributes.sensitivity.index < 0) + sensor_hub_input_get_attribute_info(hsdev, + HID_FEATURE_REPORT, usage_id, + HID_USAGE_SENSOR_DATA_MOD_CHANGE_SENSITIVITY_ABS | + HID_USAGE_SENSOR_DATA_ENVIRONMENTAL_TEMPERATURE, + &st->common_attributes.sensitivity); + + return ret; +} + +static struct hid_sensor_hub_callbacks temperature_callbacks = { + .send_event = &temperature_proc_event, + .capture_sample = &temperature_capture_sample, +}; + +/* Function to initialize the processing for usage id */ +static int hid_temperature_probe(struct platform_device *pdev) +{ + static const char *name = "temperature"; + struct iio_dev *indio_dev; + struct temperature_state *temp_st; + struct iio_chan_spec *temp_chans; + struct hid_sensor_hub_device *hsdev = dev_get_platdata(&pdev->dev); + int ret; + + indio_dev = devm_iio_device_alloc(&pdev->dev, sizeof(*temp_st)); + if (!indio_dev) + return -ENOMEM; + + temp_st = iio_priv(indio_dev); + temp_st->common_attributes.hsdev = hsdev; + temp_st->common_attributes.pdev = pdev; + + ret = hid_sensor_parse_common_attributes(hsdev, + HID_USAGE_SENSOR_TEMPERATURE, + &temp_st->common_attributes); + if (ret) + return ret; + + temp_chans = devm_kmemdup(&indio_dev->dev, temperature_channels, + sizeof(temperature_channels), GFP_KERNEL); + if (!temp_chans) + return -ENOMEM; + + ret = temperature_parse_report(pdev, hsdev, temp_chans, + HID_USAGE_SENSOR_TEMPERATURE, temp_st); + if (ret) + return ret; + + indio_dev->channels = temp_chans; + indio_dev->num_channels = ARRAY_SIZE(temperature_channels); + indio_dev->dev.parent = &pdev->dev; + indio_dev->info = &temperature_info; + indio_dev->name = name; + indio_dev->modes = INDIO_DIRECT_MODE; + + ret = devm_iio_triggered_buffer_setup(&pdev->dev, indio_dev, + &iio_pollfunc_store_time, NULL, NULL); + if (ret) + return ret; + + atomic_set(&temp_st->common_attributes.data_ready, 0); + ret = hid_sensor_setup_trigger(indio_dev, name, + &temp_st->common_attributes); + if (ret) + return ret; + + platform_set_drvdata(pdev, indio_dev); + + temperature_callbacks.pdev = pdev; + ret = sensor_hub_register_callback(hsdev, HID_USAGE_SENSOR_TEMPERATURE, + &temperature_callbacks); + if (ret) + goto error_remove_trigger; + + ret = devm_iio_device_register(indio_dev->dev.parent, indio_dev); + if (ret) + goto error_remove_callback; + + return ret; + 
+error_remove_callback: + sensor_hub_remove_callback(hsdev, HID_USAGE_SENSOR_TEMPERATURE); +error_remove_trigger: + hid_sensor_remove_trigger(&temp_st->common_attributes); + return ret; +} + +/* Function to deinitialize the processing for usage id */ +static int hid_temperature_remove(struct platform_device *pdev) +{ + struct hid_sensor_hub_device *hsdev = dev_get_platdata(&pdev->dev); + struct iio_dev *indio_dev = platform_get_drvdata(pdev); + struct temperature_state *temp_st = iio_priv(indio_dev); + + sensor_hub_remove_callback(hsdev, HID_USAGE_SENSOR_TEMPERATURE); + hid_sensor_remove_trigger(&temp_st->common_attributes); + + return 0; +} + +static const struct platform_device_id hid_temperature_ids[] = { + { + /* Format: HID-SENSOR-usage_id_in_hex_lowercase */ + .name = "HID-SENSOR-200033", + }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(platform, hid_temperature_ids); + +static struct platform_driver hid_temperature_platform_driver = { + .id_table = hid_temperature_ids, + .driver = { + .name = "temperature-sensor", + .pm = &hid_sensor_pm_ops, + }, + .probe = hid_temperature_probe, + .remove = hid_temperature_remove, +}; +module_platform_driver(hid_temperature_platform_driver); + +MODULE_DESCRIPTION("HID Environmental temperature sensor"); +MODULE_AUTHOR("Song Hongyan "); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h index 30c7dc45e45f..46dd1f27d2f2 100644 --- a/include/linux/hid-sensor-ids.h +++ b/include/linux/hid-sensor-ids.h @@ -45,6 +45,10 @@ #define HID_USAGE_SENSOR_DATA_ATMOSPHERIC_PRESSURE 0x200430 #define HID_USAGE_SENSOR_ATMOSPHERIC_PRESSURE 0x200431 +/* Temperature (200033) */ +#define HID_USAGE_SENSOR_TEMPERATURE 0x200033 +#define HID_USAGE_SENSOR_DATA_ENVIRONMENTAL_TEMPERATURE 0x200434 + /* Gyro 3D: (200076) */ #define HID_USAGE_SENSOR_GYRO_3D 0x200076 #define HID_USAGE_SENSOR_DATA_ANGL_VELOCITY 0x200456 -- cgit v1.2.3 From 791ec491c372f49cea3ea7a7143454a9023ac9d4 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Fri, 17 Feb 2017 07:57:00 -0500 Subject: prlimit,security,selinux: add a security hook for prlimit When SELinux was first added to the kernel, a process could only get and set its own resource limits via getrlimit(2) and setrlimit(2), so no MAC checks were required for those operations, and thus no security hooks were defined for them. Later, SELinux introduced a hook for setrlimit(2) with a check if the hard limit was being changed in order to be able to rely on the hard limit value as a safe reset point upon context transitions. Later on, when prlimit(2) was added to the kernel with the ability to get or set resource limits (hard or soft) of another process, LSM/SELinux was not updated other than to pass the target process to the setrlimit hook. This resulted in incomplete control over both getting and setting the resource limits of another process. Add a new security_task_prlimit() hook, called from check_prlimit_permission(), to provide complete mediation. The hook is only called when acting on another task, and only if the existing DAC/capability checks would allow access. Pass flags down to the hook to indicate whether the prlimit(2) call will read, write, or both read and write the resource limits of the target process. The existing security_task_setrlimit() hook is left alone; it continues to serve a purpose in supporting the ability to make decisions based on the old and/or new resource limit values when setting limits.
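For illustration, how the new flags map onto prlimit(2) usage from userspace (a hypothetical fragment, not part of this patch): the syscall derives LSM_PRLIMIT_READ from a non-NULL old-limit pointer and LSM_PRLIMIT_WRITE from a non-NULL new-limit pointer.

    #define _GNU_SOURCE
    #include <sys/resource.h>

    static void demo(pid_t pid)
    {
            struct rlimit old, new = { .rlim_cur = 1024, .rlim_max = 4096 };

            /* old-limit only: kernel checks with LSM_PRLIMIT_READ */
            prlimit(pid, RLIMIT_NOFILE, NULL, &old);

            /* new-limit only: kernel checks with LSM_PRLIMIT_WRITE */
            prlimit(pid, RLIMIT_NOFILE, &new, NULL);
    }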
This is consistent with the DAC/capability logic, where check_prlimit_permission() performs generic DAC/capability checks for acting on another task, while do_prlimit() performs a capability check based on a comparison of the old and new resource limits. Fix the inline documentation for the hook to match the code. Implement the new hook for SELinux. For setting resource limits, we reuse the existing setrlimit permission. Note that this does overload the setrlimit permission to mean the ability to set the resource limit (soft or hard) of another process or the ability to change one's own hard limit. For getting resource limits, a new getrlimit permission is defined. This was not originally defined since getrlimit(2) could only be used to obtain a process' own limits. Signed-off-by: Stephen Smalley Signed-off-by: James Morris --- include/linux/lsm_hooks.h | 18 +++++++++++++++--- include/linux/security.h | 13 +++++++++++++ kernel/sys.c | 30 ++++++++++++++++++------------ security/security.c | 8 ++++++++ security/selinux/hooks.c | 14 ++++++++++++++ security/selinux/include/classmap.h | 2 +- 6 files changed, 69 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index e29d4c62a3c8..ba3049f05aea 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -630,10 +630,19 @@ * Check permission before getting the ioprio value of @p. * @p contains the task_struct of process. * Return 0 if permission is granted. + * @task_prlimit: + * Check permission before getting and/or setting the resource limits of + * another task. + * @cred points to the cred structure for the current task. + * @tcred points to the cred structure for the target task. + * @flags contains the LSM_PRLIMIT_* flag bits indicating whether the + * resource limits are being read, modified, or both. + * Return 0 if permission is granted. * @task_setrlimit: - * Check permission before setting the resource limits of the current - * process for @resource to @new_rlim. The old resource limit values can - * be examined by dereferencing (current->signal->rlim + resource). + * Check permission before setting the resource limits of process @p + * for @resource to @new_rlim. The old resource limit values can + * be examined by dereferencing (p->signal->rlim + resource). + * @p points to the task_struct for the target task's group leader. * @resource contains the resource whose limit is being set. * @new_rlim contains the new limits for @resource. * Return 0 if permission is granted. 
@@ -1494,6 +1503,8 @@ union security_list_options { int (*task_setnice)(struct task_struct *p, int nice); int (*task_setioprio)(struct task_struct *p, int ioprio); int (*task_getioprio)(struct task_struct *p); + int (*task_prlimit)(const struct cred *cred, const struct cred *tcred, + unsigned int flags); int (*task_setrlimit)(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim); int (*task_setscheduler)(struct task_struct *p); @@ -1755,6 +1766,7 @@ struct security_hook_heads { struct list_head task_setnice; struct list_head task_setioprio; struct list_head task_getioprio; + struct list_head task_prlimit; struct list_head task_setrlimit; struct list_head task_setscheduler; struct list_head task_getscheduler; diff --git a/include/linux/security.h b/include/linux/security.h index 96899fad7016..97df7bac5b48 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -133,6 +133,10 @@ extern unsigned long dac_mmap_min_addr; /* setfsuid or setfsgid, id0 == fsuid or fsgid */ #define LSM_SETID_FS 8 +/* Flags for security_task_prlimit(). */ +#define LSM_PRLIMIT_READ 1 +#define LSM_PRLIMIT_WRITE 2 + /* forward declares to avoid warnings */ struct sched_param; struct request_sock; @@ -324,6 +328,8 @@ void security_task_getsecid(struct task_struct *p, u32 *secid); int security_task_setnice(struct task_struct *p, int nice); int security_task_setioprio(struct task_struct *p, int ioprio); int security_task_getioprio(struct task_struct *p); +int security_task_prlimit(const struct cred *cred, const struct cred *tcred, + unsigned int flags); int security_task_setrlimit(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim); int security_task_setscheduler(struct task_struct *p); @@ -949,6 +955,13 @@ static inline int security_task_getioprio(struct task_struct *p) return 0; } +static inline int security_task_prlimit(const struct cred *cred, + const struct cred *tcred, + unsigned int flags) +{ + return 0; +} + static inline int security_task_setrlimit(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim) diff --git a/kernel/sys.c b/kernel/sys.c index 7ff6d1b10cec..196c7134bee6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1432,25 +1432,26 @@ out: } /* rcu lock must be held */ -static int check_prlimit_permission(struct task_struct *task) +static int check_prlimit_permission(struct task_struct *task, + unsigned int flags) { const struct cred *cred = current_cred(), *tcred; + bool id_match; if (current == task) return 0; tcred = __task_cred(task); - if (uid_eq(cred->uid, tcred->euid) && - uid_eq(cred->uid, tcred->suid) && - uid_eq(cred->uid, tcred->uid) && - gid_eq(cred->gid, tcred->egid) && - gid_eq(cred->gid, tcred->sgid) && - gid_eq(cred->gid, tcred->gid)) - return 0; - if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) - return 0; + id_match = (uid_eq(cred->uid, tcred->euid) && + uid_eq(cred->uid, tcred->suid) && + uid_eq(cred->uid, tcred->uid) && + gid_eq(cred->gid, tcred->egid) && + gid_eq(cred->gid, tcred->sgid) && + gid_eq(cred->gid, tcred->gid)); + if (!id_match && !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) + return -EPERM; - return -EPERM; + return security_task_prlimit(cred, tcred, flags); } SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, @@ -1460,12 +1461,17 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, struct rlimit64 old64, new64; struct rlimit old, new; struct task_struct *tsk; + unsigned int checkflags = 0; int ret; + if (old_rlim) + checkflags |= LSM_PRLIMIT_READ; + if (new_rlim) { if 
(copy_from_user(&new64, new_rlim, sizeof(new64))) return -EFAULT; rlim64_to_rlim(&new64, &new); + checkflags |= LSM_PRLIMIT_WRITE; } rcu_read_lock(); @@ -1474,7 +1480,7 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, rcu_read_unlock(); return -ESRCH; } - ret = check_prlimit_permission(tsk); + ret = check_prlimit_permission(tsk, checkflags); if (ret) { rcu_read_unlock(); return ret; diff --git a/security/security.c b/security/security.c index d0e07f269b2d..905dad2811d3 100644 --- a/security/security.c +++ b/security/security.c @@ -1036,6 +1036,12 @@ int security_task_getioprio(struct task_struct *p) return call_int_hook(task_getioprio, 0, p); } +int security_task_prlimit(const struct cred *cred, const struct cred *tcred, + unsigned int flags) +{ + return call_int_hook(task_prlimit, 0, cred, tcred, flags); +} + int security_task_setrlimit(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim) { @@ -1793,6 +1799,8 @@ struct security_hook_heads security_hook_heads = { LIST_HEAD_INIT(security_hook_heads.task_setioprio), .task_getioprio = LIST_HEAD_INIT(security_hook_heads.task_getioprio), + .task_prlimit = + LIST_HEAD_INIT(security_hook_heads.task_prlimit), .task_setrlimit = LIST_HEAD_INIT(security_hook_heads.task_setrlimit), .task_setscheduler = diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 0c2ac318aa7f..870d24ecc2de 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3920,6 +3920,19 @@ static int selinux_task_getioprio(struct task_struct *p) PROCESS__GETSCHED, NULL); } +int selinux_task_prlimit(const struct cred *cred, const struct cred *tcred, + unsigned int flags) +{ + u32 av = 0; + + if (flags & LSM_PRLIMIT_WRITE) + av |= PROCESS__SETRLIMIT; + if (flags & LSM_PRLIMIT_READ) + av |= PROCESS__GETRLIMIT; + return avc_has_perm(cred_sid(cred), cred_sid(tcred), + SECCLASS_PROCESS, av, NULL); +} + static int selinux_task_setrlimit(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim) { @@ -6206,6 +6219,7 @@ static struct security_hook_list selinux_hooks[] = { LSM_HOOK_INIT(task_setnice, selinux_task_setnice), LSM_HOOK_INIT(task_setioprio, selinux_task_setioprio), LSM_HOOK_INIT(task_getioprio, selinux_task_getioprio), + LSM_HOOK_INIT(task_prlimit, selinux_task_prlimit), LSM_HOOK_INIT(task_setrlimit, selinux_task_setrlimit), LSM_HOOK_INIT(task_setscheduler, selinux_task_setscheduler), LSM_HOOK_INIT(task_getscheduler, selinux_task_getscheduler), diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index d429c4a1c551..1e0cc9b5de20 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -47,7 +47,7 @@ struct security_class_mapping secclass_map[] = { "getattr", "setexec", "setfscreate", "noatsecure", "siginh", "setrlimit", "rlimitinh", "dyntransition", "setcurrent", "execmem", "execstack", "execheap", "setkeycreate", - "setsockcreate", NULL } }, + "setsockcreate", "getrlimit", NULL } }, { "system", { "ipc_info", "syslog_read", "syslog_mod", "syslog_console", "module_request", "module_load", NULL } }, -- cgit v1.2.3 From dd0859dccbe291cf8179a96390f5c0e45cb9af1d Mon Sep 17 00:00:00 2001 From: James Morris Date: Wed, 15 Feb 2017 00:17:24 +1100 Subject: security: introduce CONFIG_SECURITY_WRITABLE_HOOKS Subsequent patches will add RO hardening to LSM hooks, however, SELinux still needs to be able to perform runtime disablement after init to handle architectures where init-time disablement via boot parameters is not feasible. 
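The hardening being prepared for looks like the following at a hook-registration site (a hedged sketch; module and hook names invented), using the __lsm_ro_after_init helper introduced just below: unless the new writable-hooks option is selected, the hook table becomes read-only once init completes.

    /* Illustrative sketch; "foo" is an invented security module. */
    static int foo_task_prlimit(const struct cred *cred,
                                const struct cred *tcred,
                                unsigned int flags)
    {
            return 0;       /* placeholder: allow everything */
    }

    static struct security_hook_list foo_hooks[] __lsm_ro_after_init = {
            LSM_HOOK_INIT(task_prlimit, foo_task_prlimit),
    };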
Introduce a new kernel configuration parameter CONFIG_SECURITY_WRITABLE_HOOKS, and a helper macro __lsm_ro_after_init, to handle this case. Signed-off-by: James Morris Acked-by: Stephen Smalley Acked-by: Casey Schaufler Acked-by: Kees Cook --- include/linux/lsm_hooks.h | 7 +++++++ security/Kconfig | 5 +++++ security/selinux/Kconfig | 6 ++++++ 3 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index ba3049f05aea..1aa63335de9e 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1920,6 +1920,13 @@ static inline void security_delete_hooks(struct security_hook_list *hooks, } #endif /* CONFIG_SECURITY_SELINUX_DISABLE */ +/* Currently required to handle SELinux runtime hook disable. */ +#ifdef CONFIG_SECURITY_WRITABLE_HOOKS +#define __lsm_ro_after_init +#else +#define __lsm_ro_after_init __ro_after_init +#endif /* CONFIG_SECURITY_WRITABLE_HOOKS */ + extern int __init security_module_enable(const char *module); extern void __init capability_add_hooks(void); #ifdef CONFIG_SECURITY_YAMA diff --git a/security/Kconfig b/security/Kconfig index d900f47eaa68..3ff1bf91080e 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -31,6 +31,11 @@ config SECURITY If you are unsure how to answer this question, answer N. +config SECURITY_WRITABLE_HOOKS + depends on SECURITY + bool + default n + config SECURITYFS bool "Enable the securityfs filesystem" help diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig index ea7e3efbe0f7..8af7a690eb40 100644 --- a/security/selinux/Kconfig +++ b/security/selinux/Kconfig @@ -40,6 +40,7 @@ config SECURITY_SELINUX_BOOTPARAM_VALUE config SECURITY_SELINUX_DISABLE bool "NSA SELinux runtime disable" depends on SECURITY_SELINUX + select SECURITY_WRITABLE_HOOKS default n help This option enables writing to a selinuxfs node 'disable', which @@ -50,6 +51,11 @@ config SECURITY_SELINUX_DISABLE portability across platforms where boot parameters are difficult to employ. + NOTE: selecting this option will disable the '__ro_after_init' + kernel hardening feature for security hooks. Please consider + using the selinux=0 boot parameter instead of enabling this + option. + If you are unsure how to answer this question, answer N. 
config SECURITY_SELINUX_DEVELOP -- cgit v1.2.3 From 5e6039d8a307d8411422c154f3d446b44fa32b6d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 27 Dec 2016 18:00:15 -0500 Subject: uaccess: move VERIFY_{READ,WRITE} definitions to linux/uaccess.h Signed-off-by: Al Viro --- arch/alpha/include/asm/uaccess.h | 3 --- arch/arm/include/asm/uaccess.h | 3 --- arch/arm64/include/asm/uaccess.h | 3 --- arch/avr32/include/asm/uaccess.h | 3 --- arch/blackfin/include/asm/uaccess.h | 3 --- arch/cris/include/asm/uaccess.h | 3 --- arch/frv/include/asm/uaccess.h | 3 --- arch/hexagon/include/asm/uaccess.h | 2 -- arch/ia64/include/asm/uaccess.h | 3 --- arch/m32r/include/asm/uaccess.h | 3 --- arch/m68k/include/asm/uaccess_mm.h | 3 --- arch/m68k/include/asm/uaccess_no.h | 3 --- arch/metag/include/asm/uaccess.h | 3 --- arch/microblaze/include/asm/uaccess.h | 3 --- arch/mips/include/asm/uaccess.h | 3 --- arch/mn10300/include/asm/uaccess.h | 3 --- arch/nios2/include/asm/uaccess.h | 3 --- arch/openrisc/include/asm/uaccess.h | 3 --- arch/parisc/include/asm/uaccess.h | 3 --- arch/powerpc/include/asm/uaccess.h | 3 --- arch/s390/include/asm/uaccess.h | 3 --- arch/score/include/asm/uaccess.h | 3 --- arch/sh/include/asm/uaccess.h | 3 --- arch/sparc/include/asm/uaccess_32.h | 3 --- arch/sparc/include/asm/uaccess_64.h | 3 --- arch/tile/include/asm/uaccess.h | 3 --- arch/x86/include/asm/uaccess.h | 3 --- arch/xtensa/include/asm/asm-uaccess.h | 3 --- arch/xtensa/include/asm/uaccess.h | 3 --- include/asm-generic/uaccess.h | 3 --- include/linux/uaccess.h | 4 ++++ 31 files changed, 4 insertions(+), 89 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/include/asm/uaccess.h b/arch/alpha/include/asm/uaccess.h index 94f587535dee..a37267a5d399 100644 --- a/arch/alpha/include/asm/uaccess.h +++ b/arch/alpha/include/asm/uaccess.h @@ -20,9 +20,6 @@ #define KERNEL_DS ((mm_segment_t) { 0UL }) #define USER_DS ((mm_segment_t) { -0x40000000000UL }) -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - #define get_fs() (current_thread_info()->addr_limit) #define get_ds() (KERNEL_DS) #define set_fs(x) (current_thread_info()->addr_limit = (x)) diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index b7e0125c0bbf..a13f39b3e9f8 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -26,9 +26,6 @@ #define __put_user_unaligned __put_user #endif -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - /* * The exception table consists of pairs of addresses: the first is the * address of an instruction that is allowed to fault, and the second is diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 5308d696311b..f5e1e090b4d2 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -36,9 +36,6 @@ #include #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - /* * The exception table consists of pairs of relative offsets: the first * is the relative offset to an instruction that is allowed to fault, diff --git a/arch/avr32/include/asm/uaccess.h b/arch/avr32/include/asm/uaccess.h index b1ec1fa06463..1c7f234385fc 100644 --- a/arch/avr32/include/asm/uaccess.h +++ b/arch/avr32/include/asm/uaccess.h @@ -11,9 +11,6 @@ #include #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - typedef struct { unsigned int is_user_space; } mm_segment_t; diff --git a/arch/blackfin/include/asm/uaccess.h b/arch/blackfin/include/asm/uaccess.h index 0eff88aa6d6a..d9a91108964f 100644 --- a/arch/blackfin/include/asm/uaccess.h +++ 
b/arch/blackfin/include/asm/uaccess.h @@ -29,9 +29,6 @@ static inline void set_fs(mm_segment_t fs) #define segment_eq(a, b) ((a) == (b)) -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - #define access_ok(type, addr, size) _access_ok((unsigned long)(addr), (size)) /* diff --git a/arch/cris/include/asm/uaccess.h b/arch/cris/include/asm/uaccess.h index 56c7d5750abd..82bfcb5e2c9b 100644 --- a/arch/cris/include/asm/uaccess.h +++ b/arch/cris/include/asm/uaccess.h @@ -21,9 +21,6 @@ #include #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - /* * The fs value determines whether argument validity checking should be * performed or not. If get_fs() == USER_DS, checking is performed, with diff --git a/arch/frv/include/asm/uaccess.h b/arch/frv/include/asm/uaccess.h index c0f4057eab60..9e01bd798a03 100644 --- a/arch/frv/include/asm/uaccess.h +++ b/arch/frv/include/asm/uaccess.h @@ -22,9 +22,6 @@ #define __ptr(x) ((unsigned long __force *)(x)) -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - /* * check that a range of addresses falls within the current address limit */ diff --git a/arch/hexagon/include/asm/uaccess.h b/arch/hexagon/include/asm/uaccess.h index f61cfb28e9f2..21f63593e2b6 100644 --- a/arch/hexagon/include/asm/uaccess.h +++ b/arch/hexagon/include/asm/uaccess.h @@ -50,8 +50,6 @@ * reasonably simple and not *too* slow. After all, we've got the * MMU for backup. */ -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 #define __access_ok(addr, size) \ ((get_fs().seg == KERNEL_DS.seg) || \ diff --git a/arch/ia64/include/asm/uaccess.h b/arch/ia64/include/asm/uaccess.h index 471044be2a3b..c60ff6cc8dbd 100644 --- a/arch/ia64/include/asm/uaccess.h +++ b/arch/ia64/include/asm/uaccess.h @@ -48,9 +48,6 @@ #define KERNEL_DS ((mm_segment_t) { ~0UL }) /* cf. access_ok() */ #define USER_DS ((mm_segment_t) { TASK_SIZE-1 }) /* cf. access_ok() */ -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - #define get_ds() (KERNEL_DS) #define get_fs() (current_thread_info()->addr_limit) #define set_fs(x) (current_thread_info()->addr_limit = (x)) diff --git a/arch/m32r/include/asm/uaccess.h b/arch/m32r/include/asm/uaccess.h index 6f8982157a75..7d993a837e39 100644 --- a/arch/m32r/include/asm/uaccess.h +++ b/arch/m32r/include/asm/uaccess.h @@ -16,9 +16,6 @@ #include #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - /* * The fs value determines whether argument validity checking should be * performed or not. 
If get_fs() == USER_DS, checking is performed, with diff --git a/arch/m68k/include/asm/uaccess_mm.h b/arch/m68k/include/asm/uaccess_mm.h index d228601b3afc..fa84e9c6e8f4 100644 --- a/arch/m68k/include/asm/uaccess_mm.h +++ b/arch/m68k/include/asm/uaccess_mm.h @@ -10,9 +10,6 @@ #include #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - /* We let the MMU do all checking */ static inline int access_ok(int type, const void __user *addr, unsigned long size) diff --git a/arch/m68k/include/asm/uaccess_no.h b/arch/m68k/include/asm/uaccess_no.h index 36deeb36503b..fab489a25b95 100644 --- a/arch/m68k/include/asm/uaccess_no.h +++ b/arch/m68k/include/asm/uaccess_no.h @@ -10,9 +10,6 @@ #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - #define access_ok(type,addr,size) _access_ok((unsigned long)(addr),(size)) /* diff --git a/arch/metag/include/asm/uaccess.h b/arch/metag/include/asm/uaccess.h index 273e61225c27..46c1f6c54103 100644 --- a/arch/metag/include/asm/uaccess.h +++ b/arch/metag/include/asm/uaccess.h @@ -6,9 +6,6 @@ */ #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - /* * The fs value determines whether argument validity checking should be * performed or not. If get_fs() == USER_DS, checking is performed, with diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h index 253a67e275ad..70cf5f3dfae3 100644 --- a/arch/microblaze/include/asm/uaccess.h +++ b/arch/microblaze/include/asm/uaccess.h @@ -24,9 +24,6 @@ #include #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - /* * On Microblaze the fs value is actually the top of the corresponding * address space. diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h index 5347cfe15af2..a058c04b8dd4 100644 --- a/arch/mips/include/asm/uaccess.h +++ b/arch/mips/include/asm/uaccess.h @@ -71,9 +71,6 @@ extern u64 __ua_limit; #define USER_DS ((mm_segment_t) { __UA_LIMIT }) #endif -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - #define get_ds() (KERNEL_DS) #define get_fs() (current_thread_info()->addr_limit) #define set_fs(x) (current_thread_info()->addr_limit = (x)) diff --git a/arch/mn10300/include/asm/uaccess.h b/arch/mn10300/include/asm/uaccess.h index 2eedf6f46a57..3e16850c4ccd 100644 --- a/arch/mn10300/include/asm/uaccess.h +++ b/arch/mn10300/include/asm/uaccess.h @@ -19,9 +19,6 @@ #include #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - /* * The fs value determines whether argument validity checking should be * performed or not. If get_fs() == USER_DS, checking is performed, with diff --git a/arch/nios2/include/asm/uaccess.h b/arch/nios2/include/asm/uaccess.h index 0ab82324c817..07fc68c3e23c 100644 --- a/arch/nios2/include/asm/uaccess.h +++ b/arch/nios2/include/asm/uaccess.h @@ -19,9 +19,6 @@ #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - /* * The exception table consists of pairs of addresses: the first is the * address of an instruction that is allowed to fault, and the second is diff --git a/arch/openrisc/include/asm/uaccess.h b/arch/openrisc/include/asm/uaccess.h index 140faa16685a..6f88cf8bd112 100644 --- a/arch/openrisc/include/asm/uaccess.h +++ b/arch/openrisc/include/asm/uaccess.h @@ -28,9 +28,6 @@ #include #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - /* * The fs value determines whether argument validity checking should be * performed or not. 
If get_fs() == USER_DS, checking is performed, with diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index fb4382c28259..598b52e5aa03 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -13,9 +13,6 @@ #include #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - #define KERNEL_DS ((mm_segment_t){0}) #define USER_DS ((mm_segment_t){1}) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 0e6add3187bc..81307633c33f 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -11,9 +11,6 @@ #include #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - /* * The fs value determines whether argument validity checking should be * performed or not. If get_fs() == USER_DS, checking is performed, with diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 136932ff4250..0e1f515d239b 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -17,9 +17,6 @@ #include #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - /* * The fs value determines whether argument validity checking should be diff --git a/arch/score/include/asm/uaccess.h b/arch/score/include/asm/uaccess.h index db58ab98ec4b..51914244e867 100644 --- a/arch/score/include/asm/uaccess.h +++ b/arch/score/include/asm/uaccess.h @@ -6,9 +6,6 @@ #include #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - #define get_ds() (KERNEL_DS) #define get_fs() (current_thread_info()->addr_limit) #define segment_eq(a, b) ((a).seg == (b).seg) diff --git a/arch/sh/include/asm/uaccess.h b/arch/sh/include/asm/uaccess.h index c4f0fee812c3..6b66d67c21d2 100644 --- a/arch/sh/include/asm/uaccess.h +++ b/arch/sh/include/asm/uaccess.h @@ -5,9 +5,6 @@ #include #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - #define __addr_ok(addr) \ ((unsigned long __force)(addr) < current_thread_info()->addr_limit.seg) diff --git a/arch/sparc/include/asm/uaccess_32.h b/arch/sparc/include/asm/uaccess_32.h index ea55f86d7ccd..d8857f5fafad 100644 --- a/arch/sparc/include/asm/uaccess_32.h +++ b/arch/sparc/include/asm/uaccess_32.h @@ -30,9 +30,6 @@ #define KERNEL_DS ((mm_segment_t) { 0 }) #define USER_DS ((mm_segment_t) { -1 }) -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - #define get_ds() (KERNEL_DS) #define get_fs() (current->thread.current_ds) #define set_fs(val) ((current->thread.current_ds) = (val)) diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h index 5373136c412b..619223dc9022 100644 --- a/arch/sparc/include/asm/uaccess_64.h +++ b/arch/sparc/include/asm/uaccess_64.h @@ -36,9 +36,6 @@ #define KERNEL_DS ((mm_segment_t) { ASI_P }) #define USER_DS ((mm_segment_t) { ASI_AIUS }) /* har har har */ -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - #define get_fs() ((mm_segment_t){(current_thread_info()->current_ds)}) #define get_ds() (KERNEL_DS) diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h index a77369e91e54..730073326b46 100644 --- a/arch/tile/include/asm/uaccess.h +++ b/arch/tile/include/asm/uaccess.h @@ -24,9 +24,6 @@ #include #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - /* * The fs value determines whether argument validity checking should be * performed or not. 
If get_fs() == USER_DS, checking is performed, with diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index ea148313570f..8dffb8b1d328 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -13,9 +13,6 @@ #include #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - /* * The fs value determines whether argument validity checking should be * performed or not. If get_fs() == USER_DS, checking is performed, with diff --git a/arch/xtensa/include/asm/asm-uaccess.h b/arch/xtensa/include/asm/asm-uaccess.h index a7a110039786..dfdf9fae1f84 100644 --- a/arch/xtensa/include/asm/asm-uaccess.h +++ b/arch/xtensa/include/asm/asm-uaccess.h @@ -19,9 +19,6 @@ #include #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - #include #include #include diff --git a/arch/xtensa/include/asm/uaccess.h b/arch/xtensa/include/asm/uaccess.h index 848a3d736bcb..dd6b13649aad 100644 --- a/arch/xtensa/include/asm/uaccess.h +++ b/arch/xtensa/include/asm/uaccess.h @@ -20,9 +20,6 @@ #include #include -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - #include /* diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h index cc6bb319e464..d7c17bfd4601 100644 --- a/include/asm-generic/uaccess.h +++ b/include/asm-generic/uaccess.h @@ -35,9 +35,6 @@ static inline void set_fs(mm_segment_t fs) #define segment_eq(a, b) ((a).seg == (b).seg) #endif -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - #define access_ok(type, addr, size) __access_ok((unsigned long)(addr),(size)) /* diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index f30c187ed785..b660f37beaf5 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -2,6 +2,10 @@ #define __LINUX_UACCESS_H__ #include + +#define VERIFY_READ 0 +#define VERIFY_WRITE 1 + #include static __always_inline void pagefault_disabled_inc(void) -- cgit v1.2.3 From af1d5b37d6211c814fac0d5d0b71ec695618054a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 27 Dec 2016 18:14:09 -0500 Subject: uaccess: drop duplicate includes from asm/uaccess.h Signed-off-by: Al Viro --- arch/alpha/include/asm/uaccess.h | 4 ---- arch/arc/include/asm/uaccess.h | 2 -- arch/arm/include/asm/uaccess.h | 2 -- arch/arm64/include/asm/uaccess.h | 2 -- arch/avr32/include/asm/uaccess.h | 3 --- arch/blackfin/include/asm/uaccess.h | 1 - arch/cris/include/asm/uaccess.h | 2 -- arch/frv/include/asm/uaccess.h | 1 - arch/hexagon/include/asm/uaccess.h | 1 - arch/ia64/include/asm/uaccess.h | 2 -- arch/m32r/include/asm/uaccess.h | 2 -- arch/m68k/include/asm/uaccess_mm.h | 2 -- arch/m68k/include/asm/uaccess_no.h | 1 - arch/metag/include/asm/uaccess.h | 1 - arch/microblaze/include/asm/uaccess.h | 2 -- arch/mips/include/asm/uaccess.h | 2 -- arch/mn10300/include/asm/uaccess.h | 2 -- arch/nios2/include/asm/uaccess.h | 2 -- arch/openrisc/include/asm/uaccess.h | 2 -- arch/parisc/include/asm/uaccess.h | 2 -- arch/powerpc/include/asm/uaccess.h | 2 -- arch/s390/include/asm/uaccess.h | 2 -- arch/score/include/asm/uaccess.h | 2 -- arch/sh/include/asm/uaccess.h | 2 -- arch/sparc/include/asm/uaccess_32.h | 2 -- arch/sparc/include/asm/uaccess_64.h | 2 -- arch/tile/include/asm/uaccess.h | 1 - arch/um/include/asm/uaccess.h | 1 - arch/unicore32/include/asm/uaccess.h | 3 --- arch/x86/include/asm/uaccess.h | 2 -- arch/x86/include/asm/uaccess_32.h | 2 -- arch/x86/include/asm/uaccess_64.h | 1 - arch/xtensa/include/asm/uaccess.h | 3 --- include/asm-generic/uaccess.h | 1 - include/linux/uaccess.h | 1 + 35 files changed, 1 insertion(+), 64 deletions(-) (limited 
to 'include/linux') diff --git a/arch/alpha/include/asm/uaccess.h b/arch/alpha/include/asm/uaccess.h index a37267a5d399..77c55ce89936 100644 --- a/arch/alpha/include/asm/uaccess.h +++ b/arch/alpha/include/asm/uaccess.h @@ -1,10 +1,6 @@ #ifndef __ALPHA_UACCESS_H #define __ALPHA_UACCESS_H -#include -#include - - /* * The fs value determines whether argument validity checking should be * performed or not. If get_fs() == USER_DS, checking is performed, with diff --git a/arch/arc/include/asm/uaccess.h b/arch/arc/include/asm/uaccess.h index 41faf17cd28d..0431f5668354 100644 --- a/arch/arc/include/asm/uaccess.h +++ b/arch/arc/include/asm/uaccess.h @@ -24,8 +24,6 @@ #ifndef _ASM_ARC_UACCESS_H #define _ASM_ARC_UACCESS_H -#include -#include #include /* for generic string functions */ diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index a13f39b3e9f8..9677a7cf7987 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -12,8 +12,6 @@ * User space memory access functions */ #include -#include -#include #include #include #include diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index f5e1e090b4d2..7c514e10a08e 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -28,11 +28,9 @@ #include #include #include -#include #include #include -#include #include #include diff --git a/arch/avr32/include/asm/uaccess.h b/arch/avr32/include/asm/uaccess.h index 1c7f234385fc..7ca5cb33369b 100644 --- a/arch/avr32/include/asm/uaccess.h +++ b/arch/avr32/include/asm/uaccess.h @@ -8,9 +8,6 @@ #ifndef __ASM_AVR32_UACCESS_H #define __ASM_AVR32_UACCESS_H -#include -#include - typedef struct { unsigned int is_user_space; } mm_segment_t; diff --git a/arch/blackfin/include/asm/uaccess.h b/arch/blackfin/include/asm/uaccess.h index d9a91108964f..c9fedc3be30c 100644 --- a/arch/blackfin/include/asm/uaccess.h +++ b/arch/blackfin/include/asm/uaccess.h @@ -12,7 +12,6 @@ /* * User space memory access functions */ -#include #include #include diff --git a/arch/cris/include/asm/uaccess.h b/arch/cris/include/asm/uaccess.h index 82bfcb5e2c9b..bb3004a2b2f7 100644 --- a/arch/cris/include/asm/uaccess.h +++ b/arch/cris/include/asm/uaccess.h @@ -16,8 +16,6 @@ #define _CRIS_UACCESS_H #ifndef __ASSEMBLY__ -#include -#include #include #include diff --git a/arch/frv/include/asm/uaccess.h b/arch/frv/include/asm/uaccess.h index 9e01bd798a03..55b3a69c6c53 100644 --- a/arch/frv/include/asm/uaccess.h +++ b/arch/frv/include/asm/uaccess.h @@ -15,7 +15,6 @@ /* * User space memory access functions */ -#include #include #include #include diff --git a/arch/hexagon/include/asm/uaccess.h b/arch/hexagon/include/asm/uaccess.h index 21f63593e2b6..3a7f818e5ef7 100644 --- a/arch/hexagon/include/asm/uaccess.h +++ b/arch/hexagon/include/asm/uaccess.h @@ -23,7 +23,6 @@ /* * User space memory access functions */ -#include #include #include #include diff --git a/arch/ia64/include/asm/uaccess.h b/arch/ia64/include/asm/uaccess.h index c60ff6cc8dbd..d471d1a1afd0 100644 --- a/arch/ia64/include/asm/uaccess.h +++ b/arch/ia64/include/asm/uaccess.h @@ -33,8 +33,6 @@ */ #include -#include -#include #include #include diff --git a/arch/m32r/include/asm/uaccess.h b/arch/m32r/include/asm/uaccess.h index 7d993a837e39..96b0efdb5f22 100644 --- a/arch/m32r/include/asm/uaccess.h +++ b/arch/m32r/include/asm/uaccess.h @@ -11,8 +11,6 @@ /* * User space memory access functions */ -#include -#include #include #include diff --git a/arch/m68k/include/asm/uaccess_mm.h 
b/arch/m68k/include/asm/uaccess_mm.h index fa84e9c6e8f4..fb72b710759e 100644 --- a/arch/m68k/include/asm/uaccess_mm.h +++ b/arch/m68k/include/asm/uaccess_mm.h @@ -5,9 +5,7 @@ * User space memory access functions */ #include -#include #include -#include #include /* We let the MMU do all checking */ diff --git a/arch/m68k/include/asm/uaccess_no.h b/arch/m68k/include/asm/uaccess_no.h index fab489a25b95..e77ce66c14d5 100644 --- a/arch/m68k/include/asm/uaccess_no.h +++ b/arch/m68k/include/asm/uaccess_no.h @@ -4,7 +4,6 @@ /* * User space memory access functions */ -#include #include #include diff --git a/arch/metag/include/asm/uaccess.h b/arch/metag/include/asm/uaccess.h index 46c1f6c54103..7fc5277ae71f 100644 --- a/arch/metag/include/asm/uaccess.h +++ b/arch/metag/include/asm/uaccess.h @@ -4,7 +4,6 @@ /* * User space memory access functions */ -#include /* * The fs value determines whether argument validity checking should be diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h index 70cf5f3dfae3..a3c0a06d7848 100644 --- a/arch/microblaze/include/asm/uaccess.h +++ b/arch/microblaze/include/asm/uaccess.h @@ -15,8 +15,6 @@ #ifndef __ASSEMBLY__ #include -#include -#include /* RLIMIT_FSIZE */ #include #include diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h index a058c04b8dd4..dd25b312c973 100644 --- a/arch/mips/include/asm/uaccess.h +++ b/arch/mips/include/asm/uaccess.h @@ -12,8 +12,6 @@ #define _ASM_UACCESS_H #include -#include -#include #include #include #include diff --git a/arch/mn10300/include/asm/uaccess.h b/arch/mn10300/include/asm/uaccess.h index 3e16850c4ccd..2da7b0fed4aa 100644 --- a/arch/mn10300/include/asm/uaccess.h +++ b/arch/mn10300/include/asm/uaccess.h @@ -14,10 +14,8 @@ /* * User space memory access functions */ -#include #include #include -#include /* * The fs value determines whether argument validity checking should be diff --git a/arch/nios2/include/asm/uaccess.h b/arch/nios2/include/asm/uaccess.h index 07fc68c3e23c..198bbf15f644 100644 --- a/arch/nios2/include/asm/uaccess.h +++ b/arch/nios2/include/asm/uaccess.h @@ -13,8 +13,6 @@ #ifndef _ASM_NIOS2_UACCESS_H #define _ASM_NIOS2_UACCESS_H -#include -#include #include #include diff --git a/arch/openrisc/include/asm/uaccess.h b/arch/openrisc/include/asm/uaccess.h index 6f88cf8bd112..0b0f60444b76 100644 --- a/arch/openrisc/include/asm/uaccess.h +++ b/arch/openrisc/include/asm/uaccess.h @@ -22,8 +22,6 @@ /* * User space memory access functions */ -#include -#include #include #include #include diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index 598b52e5aa03..a0b461336b6a 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -6,12 +6,10 @@ */ #include #include -#include #include #include #include -#include #define KERNEL_DS ((mm_segment_t){0}) #define USER_DS ((mm_segment_t){1}) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 81307633c33f..2ec70aa1cc5d 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -4,8 +4,6 @@ #ifdef __KERNEL__ #ifndef __ASSEMBLY__ -#include -#include #include #include #include diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 0e1f515d239b..9e9a5e8d6cf6 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -12,8 +12,6 @@ /* * User space memory access functions */ -#include -#include #include #include diff --git 
a/arch/score/include/asm/uaccess.h b/arch/score/include/asm/uaccess.h index 51914244e867..7a6c6982420a 100644 --- a/arch/score/include/asm/uaccess.h +++ b/arch/score/include/asm/uaccess.h @@ -2,8 +2,6 @@ #define __SCORE_UACCESS_H #include -#include -#include #include #define get_ds() (KERNEL_DS) diff --git a/arch/sh/include/asm/uaccess.h b/arch/sh/include/asm/uaccess.h index 6b66d67c21d2..89a28dfbabfa 100644 --- a/arch/sh/include/asm/uaccess.h +++ b/arch/sh/include/asm/uaccess.h @@ -1,8 +1,6 @@ #ifndef __ASM_SH_UACCESS_H #define __ASM_SH_UACCESS_H -#include -#include #include #define __addr_ok(addr) \ diff --git a/arch/sparc/include/asm/uaccess_32.h b/arch/sparc/include/asm/uaccess_32.h index d8857f5fafad..b10f7d626f0e 100644 --- a/arch/sparc/include/asm/uaccess_32.h +++ b/arch/sparc/include/asm/uaccess_32.h @@ -9,9 +9,7 @@ #ifdef __KERNEL__ #include -#include #include -#include #endif #ifndef __ASSEMBLY__ diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h index 619223dc9022..d76362cad80f 100644 --- a/arch/sparc/include/asm/uaccess_64.h +++ b/arch/sparc/include/asm/uaccess_64.h @@ -6,10 +6,8 @@ */ #ifdef __KERNEL__ -#include #include #include -#include #include #include #include diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h index 730073326b46..14ea3d1ca2c7 100644 --- a/arch/tile/include/asm/uaccess.h +++ b/arch/tile/include/asm/uaccess.h @@ -18,7 +18,6 @@ /* * User space memory access functions */ -#include #include #include #include diff --git a/arch/um/include/asm/uaccess.h b/arch/um/include/asm/uaccess.h index 3705620ca298..076bdcb0c2ad 100644 --- a/arch/um/include/asm/uaccess.h +++ b/arch/um/include/asm/uaccess.h @@ -7,7 +7,6 @@ #ifndef __UM_UACCESS_H #define __UM_UACCESS_H -#include #include #define __under_task_size(addr, size) \ diff --git a/arch/unicore32/include/asm/uaccess.h b/arch/unicore32/include/asm/uaccess.h index 897e11ad8124..1622f37a0514 100644 --- a/arch/unicore32/include/asm/uaccess.h +++ b/arch/unicore32/include/asm/uaccess.h @@ -12,9 +12,6 @@ #ifndef __UNICORE_UACCESS_H__ #define __UNICORE_UACCESS_H__ -#include -#include - #include #define __copy_from_user __copy_from_user diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 8dffb8b1d328..0522d88a7f90 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -3,10 +3,8 @@ /* * User space memory access functions */ -#include #include #include -#include #include #include #include diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 7d3bdd1ed697..5268ecceea96 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -4,8 +4,6 @@ /* * User space memory access functions */ -#include -#include #include #include #include diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 673059a109fe..8ddadd93639e 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -5,7 +5,6 @@ * User space memory access functions */ #include -#include #include #include #include diff --git a/arch/xtensa/include/asm/uaccess.h b/arch/xtensa/include/asm/uaccess.h index dd6b13649aad..bd8861c811ef 100644 --- a/arch/xtensa/include/asm/uaccess.h +++ b/arch/xtensa/include/asm/uaccess.h @@ -16,12 +16,9 @@ #ifndef _XTENSA_UACCESS_H #define _XTENSA_UACCESS_H -#include #include #include -#include - /* * The fs value determines whether argument validity checking should * be performed or not. 
If get_fs() == USER_DS, checking is diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h index d7c17bfd4601..d20955e495b3 100644 --- a/include/asm-generic/uaccess.h +++ b/include/asm-generic/uaccess.h @@ -6,7 +6,6 @@ * on any machine that has kernel and user data in the same * address space, e.g. all NOMMU machines. */ -#include #include #include diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index b660f37beaf5..b786ca2419b4 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -2,6 +2,7 @@ #define __LINUX_UACCESS_H__ #include +#include #define VERIFY_READ 0 #define VERIFY_WRITE 1 -- cgit v1.2.3 From f22775ede2eb58ed84b55e30768d041f607a2199 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Mon, 13 Feb 2017 14:37:41 +0200 Subject: ieee80211: add FT-PSK AKM suite selector Signed-off-by: Avraham Stern Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 0dd9498c694f..6ea381c98aae 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2347,6 +2347,7 @@ enum ieee80211_sa_query_action { /* AKM suite selectors */ #define WLAN_AKM_SUITE_8021X SUITE(0x000FAC, 1) #define WLAN_AKM_SUITE_PSK SUITE(0x000FAC, 2) +#define WLAN_AKM_SUITE_FT_PSK SUITE(0x000FAC, 4) #define WLAN_AKM_SUITE_8021X_SHA256 SUITE(0x000FAC, 5) #define WLAN_AKM_SUITE_PSK_SHA256 SUITE(0x000FAC, 6) #define WLAN_AKM_SUITE_TDLS SUITE(0x000FAC, 7) -- cgit v1.2.3 From 2fb51c35815dc08638a7d9b1a497a9d7cb4109b8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 Feb 2017 15:02:06 +0100 Subject: ieee80211: rename CCFS1/CCFS2 to CCFS0/CCFS1 This matches the spec, and otherwise things are really confusing with the next patch adding CCFS2. 
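As a quick illustration of the rename, a minimal sketch of filling the new field names for a plain 80 MHz channel (the helper below is hypothetical and simply mirrors the ieee80211_ie_build_vht_oper() hunk in this patch; the struct and ieee80211_frequency_to_channel() are the ones shown in the diff):

/* Sketch only: populate the renamed CCFS fields for plain 80 MHz. */
static void example_fill_vht_oper_80(struct ieee80211_vht_operation *vht_oper,
				     int center_freq1)
{
	vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ;
	/* Segment 0 now names the primary center channel... */
	vht_oper->center_freq_seg0_idx =
		ieee80211_frequency_to_channel(center_freq1);
	/* ...and segment 1 stays 0 unless 80+80 MHz is in use. */
	vht_oper->center_freq_seg1_idx = 0;
}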
Signed-off-by: Johannes Berg --- drivers/net/wireless/marvell/mwifiex/tdls.c | 2 +- include/linux/ieee80211.h | 4 ++-- net/mac80211/spectmgmt.c | 4 ++-- net/mac80211/util.c | 22 +++++++++++----------- 4 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/marvell/mwifiex/tdls.c b/drivers/net/wireless/marvell/mwifiex/tdls.c index df9704de0715..5fc8319ed302 100644 --- a/drivers/net/wireless/marvell/mwifiex/tdls.c +++ b/drivers/net/wireless/marvell/mwifiex/tdls.c @@ -349,7 +349,7 @@ static int mwifiex_tdls_add_vht_oper(struct mwifiex_private *priv, chan_bw = IEEE80211_VHT_CHANWIDTH_USE_HT; break; } - vht_oper->center_freq_seg1_idx = + vht_oper->center_freq_seg0_idx = mwifiex_get_center_freq_index(priv, BAND_AAC, bss_desc->channel, chan_bw); diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 6ea381c98aae..e167a262d3b0 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1525,14 +1525,14 @@ enum ieee80211_vht_chanwidth { * This structure is the "VHT operation element" as * described in 802.11ac D3.0 8.4.2.161 * @chan_width: Operating channel width + * @center_freq_seg0_idx: center freq segment 0 index * @center_freq_seg1_idx: center freq segment 1 index - * @center_freq_seg2_idx: center freq segment 2 index * @basic_mcs_set: VHT Basic MCS rate set */ struct ieee80211_vht_operation { u8 chan_width; + u8 center_freq_seg0_idx; u8 center_freq_seg1_idx; - u8 center_freq_seg2_idx; __le16 basic_mcs_set; } __packed; diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index 97f4c9d6b54c..0782e486fe89 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -132,9 +132,9 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee80211_vht_operation vht_oper = { .chan_width = wide_bw_chansw_ie->new_channel_width, - .center_freq_seg1_idx = + .center_freq_seg0_idx = wide_bw_chansw_ie->new_center_freq_seg0, - .center_freq_seg2_idx = + .center_freq_seg1_idx = wide_bw_chansw_ie->new_center_freq_seg1, /* .basic_mcs_set doesn't matter */ }; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index ac59fbd280df..7a37ce78bb38 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2413,13 +2413,13 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, *pos++ = WLAN_EID_VHT_OPERATION; *pos++ = sizeof(struct ieee80211_vht_operation); vht_oper = (struct ieee80211_vht_operation *)pos; - vht_oper->center_freq_seg1_idx = ieee80211_frequency_to_channel( + vht_oper->center_freq_seg0_idx = ieee80211_frequency_to_channel( chandef->center_freq1); if (chandef->center_freq2) - vht_oper->center_freq_seg2_idx = + vht_oper->center_freq_seg1_idx = ieee80211_frequency_to_channel(chandef->center_freq2); else - vht_oper->center_freq_seg2_idx = 0x00; + vht_oper->center_freq_seg1_idx = 0x00; switch (chandef->width) { case NL80211_CHAN_WIDTH_160: @@ -2428,11 +2428,11 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, * workaround. 
*/ vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; - vht_oper->center_freq_seg2_idx = vht_oper->center_freq_seg1_idx; + vht_oper->center_freq_seg1_idx = vht_oper->center_freq_seg0_idx; if (chandef->chan->center_freq < chandef->center_freq1) - vht_oper->center_freq_seg1_idx -= 8; + vht_oper->center_freq_seg0_idx -= 8; else - vht_oper->center_freq_seg1_idx += 8; + vht_oper->center_freq_seg0_idx += 8; break; case NL80211_CHAN_WIDTH_80P80: /* @@ -2491,9 +2491,9 @@ bool ieee80211_chandef_vht_oper(const struct ieee80211_vht_operation *oper, if (!oper) return false; - cf1 = ieee80211_channel_to_frequency(oper->center_freq_seg1_idx, + cf1 = ieee80211_channel_to_frequency(oper->center_freq_seg0_idx, chandef->chan->band); - cf2 = ieee80211_channel_to_frequency(oper->center_freq_seg2_idx, + cf2 = ieee80211_channel_to_frequency(oper->center_freq_seg1_idx, chandef->chan->band); switch (oper->chan_width) { @@ -2503,11 +2503,11 @@ bool ieee80211_chandef_vht_oper(const struct ieee80211_vht_operation *oper, new.width = NL80211_CHAN_WIDTH_80; new.center_freq1 = cf1; /* If needed, adjust based on the newer interop workaround. */ - if (oper->center_freq_seg2_idx) { + if (oper->center_freq_seg1_idx) { unsigned int diff; - diff = abs(oper->center_freq_seg2_idx - - oper->center_freq_seg1_idx); + diff = abs(oper->center_freq_seg1_idx - + oper->center_freq_seg0_idx); if (diff == 8) { new.width = NL80211_CHAN_WIDTH_160; new.center_freq1 = cf2; -- cgit v1.2.3 From 75b99bc300463e65f87c90425704c2688489f963 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 Feb 2017 15:02:10 +0100 Subject: ieee80211: define HT operation CCFS2 field The Channel Center Frequency Segment 2 field is used in 802.11-2016 for encoding the actual channel position of the 80+80/160 MHz channel, if the max NSS is restricted. This is used for backwards compatibility. Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index e167a262d3b0..22bf0676d928 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1411,6 +1411,8 @@ struct ieee80211_ht_operation { #define IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED 3 #define IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT 0x0004 #define IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT 0x0010 +#define IEEE80211_HT_OP_MODE_CCFS2_SHIFT 5 +#define IEEE80211_HT_OP_MODE_CCFS2_MASK 0x1fe0 /* for stbc_param */ #define IEEE80211_HT_STBC_PARAM_DUAL_BEACON 0x0040 -- cgit v1.2.3 From 826cf175ed705f70a49d04aca832c1cc9ff048d8 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 28 Feb 2017 14:25:18 -0800 Subject: spi: allow attaching device properties to SPI board info Generic device properties support statically defined property sets. For them to be usable, we need to attach these property sets before devices are registered and probed. Allowing a property list to be attached to the spi_board_info structure will let non-ACPI, non-DT boards switch to using generic properties and get rid of custom platform data.
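To illustrate the new field, a hypothetical board file could attach a property set like this (sketch only: the "at25" entry and the property names are illustrative; the PROPERTY_ENTRY_* helpers come from linux/property.h):

#include <linux/property.h>
#include <linux/spi/spi.h>

/* Hypothetical properties for an SPI EEPROM; names are illustrative. */
static const struct property_entry eeprom_properties[] = {
	PROPERTY_ENTRY_U32("pagesize", 64),
	PROPERTY_ENTRY_BOOL("read-only"),
	{ }	/* terminator */
};

static struct spi_board_info board_spi_devices[] __initdata = {
	{
		.modalias	= "at25",
		.max_speed_hz	= 1000000,
		.bus_num	= 0,
		.chip_select	= 0,
		.properties	= eeprom_properties,	/* new field */
	},
};

Because spi_register_board_info() deep-copies the property list (via property_entries_dup() in the hunk below), the table can safely live in __initdata.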
Signed-off-by: Dmitry Torokhov Signed-off-by: Mark Brown --- drivers/spi/spi.c | 32 ++++++++++++++++++++++++++++---- include/linux/spi/spi.h | 4 ++++ 2 files changed, 32 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 90b5b2efafbf..6cc86060d22f 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -600,13 +601,28 @@ struct spi_device *spi_new_device(struct spi_master *master, proxy->controller_data = chip->controller_data; proxy->controller_state = NULL; - status = spi_add_device(proxy); - if (status < 0) { - spi_dev_put(proxy); - return NULL; + if (chip->properties) { + status = device_add_properties(&proxy->dev, chip->properties); + if (status) { + dev_err(&master->dev, + "failed to add properties to '%s': %d\n", + chip->modalias, status); + goto err_dev_put; + } } + status = spi_add_device(proxy); + if (status < 0) + goto err_remove_props; + return proxy; + +err_remove_props: + if (chip->properties) + device_remove_properties(&proxy->dev); +err_dev_put: + spi_dev_put(proxy); + return NULL; } EXPORT_SYMBOL_GPL(spi_new_device); @@ -664,6 +680,7 @@ static void spi_match_master_to_boardinfo(struct spi_master *master, * * The board info passed can safely be __initdata ... but be careful of * any embedded pointers (platform_data, etc), they're copied as-is. + * Device properties are deep-copied though. * * Return: zero on success, else a negative error code. */ @@ -683,6 +700,13 @@ int spi_register_board_info(struct spi_board_info const *info, unsigned n) struct spi_master *master; memcpy(&bi->board_info, info, sizeof(*info)); + if (info->properties) { + bi->board_info.properties = + property_entries_dup(info->properties); + if (IS_ERR(bi->board_info.properties)) + return PTR_ERR(bi->board_info.properties); + } + mutex_lock(&board_lock); list_add_tail(&bi->list, &board_list); list_for_each_entry(master, &spi_master_list, list) diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 75c6bd0ac605..5a8c4b24f2dc 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -23,6 +23,7 @@ #include struct dma_chan; +struct property_entry; struct spi_master; struct spi_transfer; struct spi_flash_read_message; @@ -1209,6 +1210,7 @@ int spi_flash_read(struct spi_device *spi, * @modalias: Initializes spi_device.modalias; identifies the driver. * @platform_data: Initializes spi_device.platform_data; the particular * data stored there is driver-specific. + * @properties: Additional device properties for the device. * @controller_data: Initializes spi_device.controller_data; some * controllers need hints about hardware setup, e.g. for DMA. * @irq: Initializes spi_device.irq; depends on how the board is wired. @@ -1241,10 +1243,12 @@ struct spi_board_info { * * platform_data goes to spi_device.dev.platform_data, * controller_data goes to spi_device.controller_data, + * device properties are copied and attached to spi_device, * irq is copied too */ char modalias[SPI_NAME_SIZE]; const void *platform_data; + const struct property_entry *properties; void *controller_data; int irq; -- cgit v1.2.3 From 572d3c6444979a6a49c6b464110563f578e8dece Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Thu, 9 Feb 2017 18:03:57 -0800 Subject: HID: i2c-hid: support regulator power on/off On some boards, we need to enable a regulator before using the HID, and it's also nice to save power in suspend by disabling it. 
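In essence, probe gains the following power-up sequence around the new "vdd" supply and delay (a condensed sketch of the i2c_hid_probe() hunk below, with the error-path labels trimmed):

/* Condensed from the i2c_hid_probe() changes in this patch. */
ihid->pdata.supply = devm_regulator_get(&client->dev, "vdd");
if (IS_ERR(ihid->pdata.supply))
	return PTR_ERR(ihid->pdata.supply);

ret = regulator_enable(ihid->pdata.supply);
if (ret < 0)
	return ret;

/* Give the supply time to settle before touching the device. */
if (ihid->pdata.post_power_delay_ms)
	msleep(ihid->pdata.post_power_delay_ms);

Since devm_regulator_get() hands back a dummy regulator when none is described, boards without a "vdd" supply keep working unchanged.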
Support an optional "vdd-supply" and a companion initialization delay. Signed-off-by: Brian Norris Signed-off-by: Caesar Wang Acked-by: Benjamin Tissoires Reviewed-by: Dmitry Torokhov Cc: Jiri Kosina Cc: linux-input@vger.kernel.org Signed-off-by: Jiri Kosina --- drivers/hid/i2c-hid/i2c-hid.c | 42 ++++++++++++++++++++++++++++++++++++++++-- include/linux/i2c/i2c-hid.h | 6 ++++++ 2 files changed, 46 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/i2c-hid/i2c-hid.c b/drivers/hid/i2c-hid/i2c-hid.c index ea3c3546cef7..a3f6daf0886b 100644 --- a/drivers/hid/i2c-hid/i2c-hid.c +++ b/drivers/hid/i2c-hid/i2c-hid.c @@ -994,6 +994,11 @@ static int i2c_hid_of_probe(struct i2c_client *client, } pdata->hid_descriptor_address = val; + ret = of_property_read_u32(dev->of_node, "post-power-on-delay-ms", + &val); + if (!ret) + pdata->post_power_delay_ms = val; + return 0; } @@ -1053,6 +1058,24 @@ static int i2c_hid_probe(struct i2c_client *client, ihid->pdata = *platform_data; } + ihid->pdata.supply = devm_regulator_get(&client->dev, "vdd"); + if (IS_ERR(ihid->pdata.supply)) { + ret = PTR_ERR(ihid->pdata.supply); + if (ret != -EPROBE_DEFER) + dev_err(&client->dev, "Failed to get regulator: %d\n", + ret); + return ret; + } + + ret = regulator_enable(ihid->pdata.supply); + if (ret < 0) { + dev_err(&client->dev, "Failed to enable regulator: %d\n", + ret); + goto err; + } + if (ihid->pdata.post_power_delay_ms) + msleep(ihid->pdata.post_power_delay_ms); + i2c_set_clientdata(client, ihid); ihid->client = client; @@ -1068,7 +1091,7 @@ static int i2c_hid_probe(struct i2c_client *client, * real computation later. */ ret = i2c_hid_alloc_buffers(ihid, HID_MIN_BUFFER_SIZE); if (ret < 0) - goto err; + goto err_regulator; pm_runtime_get_noresume(&client->dev); pm_runtime_set_active(&client->dev); @@ -1125,6 +1148,9 @@ err_pm: pm_runtime_put_noidle(&client->dev); pm_runtime_disable(&client->dev); +err_regulator: + regulator_disable(ihid->pdata.supply); + err: i2c_hid_free_buffers(ihid); kfree(ihid); @@ -1149,6 +1175,8 @@ static int i2c_hid_remove(struct i2c_client *client) if (ihid->bufsize) i2c_hid_free_buffers(ihid); + regulator_disable(ihid->pdata.supply); + kfree(ihid); return 0; @@ -1199,6 +1227,10 @@ static int i2c_hid_suspend(struct device *dev) else hid_warn(hid, "Failed to enable irq wake: %d\n", wake_status); + } else { + ret = regulator_disable(ihid->pdata.supply); + if (ret < 0) + hid_warn(hid, "Failed to disable supply: %d\n", ret); } return 0; @@ -1212,7 +1244,13 @@ static int i2c_hid_resume(struct device *dev) struct hid_device *hid = ihid->hid; int wake_status; - if (device_may_wakeup(&client->dev) && ihid->irq_wake_enabled) { + if (!device_may_wakeup(&client->dev)) { + ret = regulator_enable(ihid->pdata.supply); + if (ret < 0) + hid_warn(hid, "Failed to enable supply: %d\n", ret); + if (ihid->pdata.post_power_delay_ms) + msleep(ihid->pdata.post_power_delay_ms); + } else if (ihid->irq_wake_enabled) { wake_status = disable_irq_wake(client->irq); if (!wake_status) ihid->irq_wake_enabled = false; diff --git a/include/linux/i2c/i2c-hid.h b/include/linux/i2c/i2c-hid.h index 7aa901d92058..1fb088239d12 100644 --- a/include/linux/i2c/i2c-hid.h +++ b/include/linux/i2c/i2c-hid.h @@ -14,9 +14,13 @@ #include +struct regulator; + /** * struct i2chid_platform_data - used by hid over i2c implementation. * @hid_descriptor_address: i2c register where the HID descriptor is stored. + * @supply: regulator for powering on the device. 
+ * @post_power_delay_ms: delay after powering on before device is usable. * * Note that it is the responsibility of the platform driver (or the acpi 5.0 * driver, or the flattened device tree) to setup the irq related to the gpio in @@ -31,6 +35,8 @@ */ struct i2c_hid_platform_data { u16 hid_descriptor_address; + struct regulator *supply; + int post_power_delay_ms; }; #endif /* __LINUX_I2C_HID_H */ -- cgit v1.2.3 From 387ad9674b0013c8756ad20d854ff005b0c313ad Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Mon, 20 Feb 2017 12:19:00 +0200 Subject: kernel: convert cgroup_namespace.count from atomic_t to refcount_t The refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This avoids accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 7 ++++--- kernel/cgroup/cgroup.c | 2 +- kernel/cgroup/namespace.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f6b43fbb141c..44129793c7b8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -640,7 +641,7 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ struct cgroup_namespace { - atomic_t count; + refcount_t count; struct ns_common ns; struct user_namespace *user_ns; struct ucounts *ucounts; @@ -675,12 +676,12 @@ copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, static inline void get_cgroup_ns(struct cgroup_namespace *ns) { if (ns) - atomic_inc(&ns->count); + refcount_inc(&ns->count); } static inline void put_cgroup_ns(struct cgroup_namespace *ns) { - if (ns && atomic_dec_and_test(&ns->count)) + if (ns && refcount_dec_and_test(&ns->count)) free_cgroup_ns(ns); } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0125589c7428..8ee78688e36d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -189,7 +189,7 @@ static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .count = { .counter = 2, }, + .count = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, .ns.inum = PROC_CGROUP_INIT_INO, diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 96d38dab6fb2..66129eb4371d 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -31,7 +31,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) kfree(new_ns); return ERR_PTR(ret); } - atomic_set(&new_ns->count, 1); + refcount_set(&new_ns->count, 1); new_ns->ns.ops = &cgroupns_operations; return new_ns; } -- cgit v1.2.3 From fa5923cea8da3b5d4eb943651922b327b1df673c Mon Sep 17 00:00:00 2001 From: Michał Kępień Date: Fri, 17 Feb 2017 08:57:48 +0100 Subject: ALSA: hda - use dell_micmute_led_set() instead of dell_app_wmi_led_set() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dell_app_wmi_led_set() method introduced in commit db6d8cc00773 ("dell-led: add mic mute led interface") was implemented as an easily extensible entry point for other modules to set the state of various LEDs.
However, almost three years later it is still only used to control the mic mute LED, so it will be replaced with direct calls to dell_micmute_led_set(). Signed-off-by: Michał Kępień Tested-by: Alex Hung Reviewed-by: Pali Rohár Acked-by: Takashi Iwai Signed-off-by: Jacek Anaszewski --- drivers/leds/dell-led.c | 20 ++------------------ include/linux/dell-led.h | 6 +----- sound/pci/hda/dell_wmi_helper.c | 12 ++++++------ 3 files changed, 9 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/leds/dell-led.c b/drivers/leds/dell-led.c index e8e8f67224c1..f9002d9bb757 100644 --- a/drivers/leds/dell-led.c +++ b/drivers/leds/dell-led.c @@ -46,7 +46,7 @@ MODULE_ALIAS("wmi:" DELL_LED_BIOS_GUID); #define GLOBAL_MIC_MUTE_ENABLE 0x364 #define GLOBAL_MIC_MUTE_DISABLE 0x365 -static int dell_micmute_led_set(int state) +int dell_micmute_led_set(int state) { struct calling_interface_buffer *buffer; struct calling_interface_token *token; @@ -69,23 +69,7 @@ static int dell_micmute_led_set(int state) return state; } - -int dell_app_wmi_led_set(int whichled, int on) -{ - int state = 0; - - switch (whichled) { - case DELL_LED_MICMUTE: - state = dell_micmute_led_set(on); - break; - default: - pr_warn("led type %x is not supported\n", whichled); - break; - } - - return state; -} -EXPORT_SYMBOL_GPL(dell_app_wmi_led_set); +EXPORT_SYMBOL_GPL(dell_micmute_led_set); struct bios_args { unsigned char length; diff --git a/include/linux/dell-led.h b/include/linux/dell-led.h index 7009b8bec77b..3f033c48071e 100644 --- a/include/linux/dell-led.h +++ b/include/linux/dell-led.h @@ -1,10 +1,6 @@ #ifndef __DELL_LED_H__ #define __DELL_LED_H__ -enum { - DELL_LED_MICMUTE, -}; - -int dell_app_wmi_led_set(int whichled, int on); +int dell_micmute_led_set(int on); #endif diff --git a/sound/pci/hda/dell_wmi_helper.c b/sound/pci/hda/dell_wmi_helper.c index 19d41da79f93..e128c8096772 100644 --- a/sound/pci/hda/dell_wmi_helper.c +++ b/sound/pci/hda/dell_wmi_helper.c @@ -6,7 +6,7 @@ #include static int dell_led_value; -static int (*dell_led_set_func)(int, int); +static int (*dell_led_set_func)(int); static void (*dell_old_cap_hook)(struct hda_codec *, struct snd_kcontrol *, struct snd_ctl_elem_value *); @@ -27,7 +27,7 @@ static void update_dell_wmi_micmute_led(struct hda_codec *codec, return; dell_led_value = val; if (dell_led_set_func) - dell_led_set_func(DELL_LED_MICMUTE, dell_led_value); + dell_led_set_func(dell_led_value); } } @@ -40,14 +40,14 @@ static void alc_fixup_dell_wmi(struct hda_codec *codec, if (action == HDA_FIXUP_ACT_PROBE) { if (!dell_led_set_func) - dell_led_set_func = symbol_request(dell_app_wmi_led_set); + dell_led_set_func = symbol_request(dell_micmute_led_set); if (!dell_led_set_func) { - codec_warn(codec, "Failed to find dell wmi symbol dell_app_wmi_led_set\n"); + codec_warn(codec, "Failed to find dell wmi symbol dell_micmute_led_set\n"); return; } removefunc = true; - if (dell_led_set_func(DELL_LED_MICMUTE, false) >= 0) { + if (dell_led_set_func(false) >= 0) { dell_led_value = 0; if (spec->gen.num_adc_nids > 1 && !spec->gen.dyn_adc_switch) codec_dbg(codec, "Skipping micmute LED control due to several ADCs"); @@ -61,7 +61,7 @@ static void alc_fixup_dell_wmi(struct hda_codec *codec, } if (dell_led_set_func && (action == HDA_FIXUP_ACT_FREE || removefunc)) { - symbol_put(dell_app_wmi_led_set); + symbol_put(dell_micmute_led_set); dell_led_set_func = NULL; dell_old_cap_hook = NULL; } -- cgit v1.2.3 From df789fe752065f2ce761ba434125e335b514899f Mon Sep 17 00:00:00 2001 From: David Forster Date: 
Thu, 23 Feb 2017 16:27:18 +0000 Subject: ipv6: Provide ipv6 version of "disable_policy" sysctl This provides equivalent functionality to the existing ipv4 "disable_policy" sysctl, i.e. it allows IPsec processing to be skipped on terminating packets on a per-interface basis. Signed-off-by: David Forster Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 116 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 71be5b330d21..f0d79bd054ca 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -70,6 +70,7 @@ struct ipv6_devconf { #endif __u32 enhanced_dad; __u32 addr_gen_mode; + __s32 disable_policy; struct ctl_table_header *sysctl_header; }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 8ef9e75e004e..d8f6a1ac9af4 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -183,6 +183,7 @@ enum { DEVCONF_SEG6_REQUIRE_HMAC, DEVCONF_ENHANCED_DAD, DEVCONF_ADDR_GEN_MODE, + DEVCONF_DISABLE_POLICY, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 363172527e43..8c69768a5c46 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -245,6 +245,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { #endif .enhanced_dad = 1, .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, + .disable_policy = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -297,6 +298,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { #endif .enhanced_dad = 1, .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, + .disable_policy = 0, }; /* Check if a valid qdisc is available */ @@ -944,6 +946,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, const struct in6_addr *peer_addr, int pfxlen, int scope, u32 flags, u32 valid_lft, u32 prefered_lft) { + struct net *net = dev_net(idev->dev); struct inet6_ifaddr *ifa = NULL; struct rt6_info *rt; unsigned int hash; @@ -990,6 +993,10 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, goto out; } + if (net->ipv6.devconf_all->disable_policy || + idev->cnf.disable_policy) + rt->dst.flags |= DST_NOPOLICY; + neigh_parms_data_state_setall(idev->nd_parms); ifa->addr = *addr; @@ -5003,6 +5010,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, #endif array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad; array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode; + array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy; } static inline size_t inet6_ifla6_size(void) @@ -5827,6 +5835,105 @@ int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, return ret; } +static +void addrconf_set_nopolicy(struct rt6_info *rt, int action) +{ + if (rt) { + if (action) + rt->dst.flags |= DST_NOPOLICY; + else + rt->dst.flags &= ~DST_NOPOLICY; + } +} + +static +void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) +{ + struct inet6_ifaddr *ifa; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifa, &idev->addr_list, if_list) { + spin_lock(&ifa->lock); + if (ifa->rt) { + struct rt6_info *rt = ifa->rt; + struct fib6_table *table = rt->rt6i_table; + int cpu; + + read_lock(&table->tb6_lock); + addrconf_set_nopolicy(ifa->rt, val); + if (rt->rt6i_pcpu) { + for_each_possible_cpu(cpu) { + struct rt6_info **rtp; + + rtp = per_cpu_ptr(rt->rt6i_pcpu, cpu); + addrconf_set_nopolicy(*rtp, val); + } + } + read_unlock(&table->tb6_lock); + } + spin_unlock(&ifa->lock); + } +
read_unlock_bh(&idev->lock); +} + +static +int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val) +{ + struct inet6_dev *idev; + struct net *net; + + if (!rtnl_trylock()) + return restart_syscall(); + + *valp = val; + + net = (struct net *)ctl->extra2; + if (valp == &net->ipv6.devconf_dflt->disable_policy) { + rtnl_unlock(); + return 0; + } + + if (valp == &net->ipv6.devconf_all->disable_policy) { + struct net_device *dev; + + for_each_netdev(net, dev) { + idev = __in6_dev_get(dev); + if (idev) + addrconf_disable_policy_idev(idev, val); + } + } else { + idev = (struct inet6_dev *)ctl->extra1; + addrconf_disable_policy_idev(idev, val); + } + + rtnl_unlock(); + return 0; +} + +static +int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + loff_t pos = *ppos; + struct ctl_table lctl; + int ret; + + lctl = *ctl; + lctl.data = &val; + ret = proc_dointvec(&lctl, write, buffer, lenp, ppos); + + if (write && (*valp != val)) + ret = addrconf_disable_policy(ctl, valp, val); + + if (ret) + *ppos = pos; + + return ret; +} + static int minus_one = -1; static const int one = 1; static const int two_five_five = 255; @@ -6184,6 +6291,13 @@ static const struct ctl_table addrconf_sysctl[] = { .mode = 0644, .proc_handler = addrconf_sysctl_addr_gen_mode, }, + { + .procname = "disable_policy", + .data = &ipv6_devconf.disable_policy, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_disable_policy, + }, { /* sentinel */ } -- cgit v1.2.3 From f3dd3f4797652c311df9c074436d420f1ad3566e Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 27 Feb 2017 10:26:48 -0800 Subject: vmbus: introduce in-place packet iterator This is mostly just a refactoring of previous functions (get_next_pkt_raw, put_pkt_raw and commit_rd_index) to make it easier to use for other drivers and NAPI. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/hv/ring_buffer.c | 94 +++++++++++++++++++++++++++++++++++++++++++- drivers/net/hyperv/netvsc.c | 34 +++++----------- include/linux/hyperv.h | 96 ++++++++++++++------------------------------- 3 files changed, 133 insertions(+), 91 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index 87799e81af97..c3f1a9e33cef 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -32,6 +32,8 @@ #include "hyperv_vmbus.h" +#define VMBUS_PKT_TRAILER 8 + /* * When we write to the ring buffer, check if the host needs to * be signaled. Here is the details of this protocol: @@ -336,6 +338,12 @@ int hv_ringbuffer_write(struct vmbus_channel *channel, return 0; } +static inline void +init_cached_read_index(struct hv_ring_buffer_info *rbi) +{ + rbi->cached_read_index = rbi->ring_buffer->read_index; +} + int hv_ringbuffer_read(struct vmbus_channel *channel, void *buffer, u32 buflen, u32 *buffer_actual_len, u64 *requestid, bool raw) @@ -366,7 +374,8 @@ int hv_ringbuffer_read(struct vmbus_channel *channel, return ret; } - init_cached_read_index(channel); + init_cached_read_index(inring_info); + next_read_location = hv_get_next_read_location(inring_info); next_read_location = hv_copyfrom_ringbuffer(inring_info, &desc, sizeof(desc), @@ -410,3 +419,86 @@ int hv_ringbuffer_read(struct vmbus_channel *channel, return ret; } + +/* + * Determine number of bytes available in ring buffer after + * the current iterator (priv_read_index) location.
+ * + * This is similar to hv_get_bytes_to_read but with private + * read index instead. + */ +static u32 hv_pkt_iter_avail(const struct hv_ring_buffer_info *rbi) +{ + u32 priv_read_loc = rbi->priv_read_index; + u32 write_loc = READ_ONCE(rbi->ring_buffer->write_index); + + if (write_loc >= priv_read_loc) + return write_loc - priv_read_loc; + else + return (rbi->ring_datasize - priv_read_loc) + write_loc; +} + +/* + * Get first vmbus packet from ring buffer after read_index + * + * If ring buffer is empty, returns NULL and no other action needed. + */ +struct vmpacket_descriptor *hv_pkt_iter_first(struct vmbus_channel *channel) +{ + struct hv_ring_buffer_info *rbi = &channel->inbound; + + /* set state for later hv_signal_on_read() */ + init_cached_read_index(rbi); + + if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor)) + return NULL; + + return hv_get_ring_buffer(rbi) + rbi->priv_read_index; +} +EXPORT_SYMBOL_GPL(hv_pkt_iter_first); + +/* + * Get next vmbus packet from ring buffer. + * + * Advances the current location (priv_read_index) and checks for more + * data. If the end of the ring buffer is reached, then return NULL. + */ +struct vmpacket_descriptor * +__hv_pkt_iter_next(struct vmbus_channel *channel, + const struct vmpacket_descriptor *desc) +{ + struct hv_ring_buffer_info *rbi = &channel->inbound; + u32 packetlen = desc->len8 << 3; + u32 dsize = rbi->ring_datasize; + + /* bump offset to next potential packet */ + rbi->priv_read_index += packetlen + VMBUS_PKT_TRAILER; + if (rbi->priv_read_index >= dsize) + rbi->priv_read_index -= dsize; + + /* more data? */ + if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor)) + return NULL; + else + return hv_get_ring_buffer(rbi) + rbi->priv_read_index; +} +EXPORT_SYMBOL_GPL(__hv_pkt_iter_next); + +/* + * Update host ring buffer after iterating over packets. + */ +void hv_pkt_iter_close(struct vmbus_channel *channel) +{ + struct hv_ring_buffer_info *rbi = &channel->inbound; + + /* + * Make sure all reads are done before we update the read index since + * the writer may start writing to the read area once the read index + * is updated. 
+ */ + virt_rmb(); + rbi->ring_buffer->read_index = rbi->priv_read_index; + + hv_signal_on_read(channel); +} +EXPORT_SYMBOL_GPL(hv_pkt_iter_close); diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 5dedbc36c326..3681fb59bdbe 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -647,14 +647,11 @@ static void netvsc_send_tx_complete(struct netvsc_device *net_device, static void netvsc_send_completion(struct netvsc_device *net_device, struct vmbus_channel *incoming_channel, struct hv_device *device, - struct vmpacket_descriptor *packet) + const struct vmpacket_descriptor *desc) { - struct nvsp_message *nvsp_packet; + struct nvsp_message *nvsp_packet = hv_pkt_data(desc); struct net_device *ndev = hv_get_drvdata(device); - nvsp_packet = (struct nvsp_message *)((unsigned long)packet + - (packet->offset8 << 3)); - switch (nvsp_packet->hdr.msg_type) { case NVSP_MSG_TYPE_INIT_COMPLETE: case NVSP_MSG1_TYPE_SEND_RECV_BUF_COMPLETE: @@ -668,7 +665,7 @@ static void netvsc_send_completion(struct netvsc_device *net_device, case NVSP_MSG1_TYPE_SEND_RNDIS_PKT_COMPLETE: netvsc_send_tx_complete(net_device, incoming_channel, - device, packet); + device, desc); break; default: @@ -1071,9 +1068,11 @@ static void netvsc_receive(struct net_device *ndev, struct net_device_context *net_device_ctx, struct hv_device *device, struct vmbus_channel *channel, - struct vmtransfer_page_packet_header *vmxferpage_packet, + const struct vmpacket_descriptor *desc, struct nvsp_message *nvsp) { + const struct vmtransfer_page_packet_header *vmxferpage_packet + = container_of(desc, const struct vmtransfer_page_packet_header, d); char *recv_buf = net_device->recv_buf; u32 status = NVSP_STAT_SUCCESS; int i; @@ -1185,12 +1184,10 @@ static void netvsc_process_raw_pkt(struct hv_device *device, struct netvsc_device *net_device, struct net_device *ndev, u64 request_id, - struct vmpacket_descriptor *desc) + const struct vmpacket_descriptor *desc) { struct net_device_context *net_device_ctx = netdev_priv(ndev); - struct nvsp_message *nvmsg - = (struct nvsp_message *)((unsigned long)desc - + (desc->offset8 << 3)); + struct nvsp_message *nvmsg = hv_pkt_data(desc); switch (desc->type) { case VM_PKT_COMP: @@ -1199,9 +1196,7 @@ static void netvsc_process_raw_pkt(struct hv_device *device, case VM_PKT_DATA_USING_XFER_PAGES: netvsc_receive(ndev, net_device, net_device_ctx, - device, channel, - (struct vmtransfer_page_packet_header *)desc, - nvmsg); + device, channel, desc, nvmsg); break; case VM_PKT_DATA_INBAND: @@ -1223,7 +1218,6 @@ void netvsc_channel_cb(void *context) struct netvsc_device *net_device; struct vmpacket_descriptor *desc; struct net_device *ndev; - bool need_to_commit = false; if (channel->primary_channel != NULL) device = channel->primary_channel->device_obj; @@ -1239,20 +1233,12 @@ void netvsc_channel_cb(void *context) netvsc_channel_idle(net_device, q_idx)) return; - /* commit_rd_index() -> hv_signal_on_read() needs this. 
*/ - init_cached_read_index(channel); - - while ((desc = get_next_pkt_raw(channel)) != NULL) { + foreach_vmbus_pkt(desc, channel) { netvsc_process_raw_pkt(device, channel, net_device, ndev, desc->trans_id, desc); - put_pkt_raw(channel, desc); - need_to_commit = true; } - if (need_to_commit) - commit_rd_index(channel); - netvsc_chk_recv_comp(net_device, channel, q_idx); } diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 62bbf3c1aa4a..36162485d663 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1504,14 +1504,6 @@ static inline void hv_signal_on_read(struct vmbus_channel *channel) return; } -static inline void -init_cached_read_index(struct vmbus_channel *channel) -{ - struct hv_ring_buffer_info *rbi = &channel->inbound; - - rbi->cached_read_index = rbi->ring_buffer->read_index; -} - /* * Mask off host interrupt callback notifications */ @@ -1545,76 +1537,48 @@ static inline u32 hv_end_read(struct hv_ring_buffer_info *rbi) /* * An API to support in-place processing of incoming VMBUS packets. */ -#define VMBUS_PKT_TRAILER 8 -static inline struct vmpacket_descriptor * -get_next_pkt_raw(struct vmbus_channel *channel) +/* Get data payload associated with descriptor */ +static inline void *hv_pkt_data(const struct vmpacket_descriptor *desc) { - struct hv_ring_buffer_info *ring_info = &channel->inbound; - u32 priv_read_loc = ring_info->priv_read_index; - void *ring_buffer = hv_get_ring_buffer(ring_info); - u32 dsize = ring_info->ring_datasize; - /* - * delta is the difference between what is available to read and - * what was already consumed in place. We commit read index after - * the whole batch is processed. - */ - u32 delta = priv_read_loc >= ring_info->ring_buffer->read_index ? - priv_read_loc - ring_info->ring_buffer->read_index : - (dsize - ring_info->ring_buffer->read_index) + priv_read_loc; - u32 bytes_avail_toread = (hv_get_bytes_to_read(ring_info) - delta); - - if (bytes_avail_toread < sizeof(struct vmpacket_descriptor)) - return NULL; - - return ring_buffer + priv_read_loc; + return (void *)((unsigned long)desc + (desc->offset8 << 3)); } -/* - * A helper function to step through packets "in-place" - * This API is to be called after each successful call - * get_next_pkt_raw(). - */ -static inline void put_pkt_raw(struct vmbus_channel *channel, - struct vmpacket_descriptor *desc) +/* Get data size associated with descriptor */ +static inline u32 hv_pkt_datalen(const struct vmpacket_descriptor *desc) { - struct hv_ring_buffer_info *ring_info = &channel->inbound; - u32 packetlen = desc->len8 << 3; - u32 dsize = ring_info->ring_datasize; - - /* - * Include the packet trailer. - */ - ring_info->priv_read_index += packetlen + VMBUS_PKT_TRAILER; - ring_info->priv_read_index %= dsize; + return (desc->len8 << 3) - (desc->offset8 << 3); } + +struct vmpacket_descriptor * +hv_pkt_iter_first(struct vmbus_channel *channel); + +struct vmpacket_descriptor * +__hv_pkt_iter_next(struct vmbus_channel *channel, + const struct vmpacket_descriptor *pkt); + +void hv_pkt_iter_close(struct vmbus_channel *channel); + /* - * This call commits the read index and potentially signals the host. - * Here is the pattern for using the "in-place" consumption APIs: - * - * init_cached_read_index(); - * - * while (get_next_pkt_raw() { - * process the packet "in-place"; - * put_pkt_raw(); - * } - * if (packets processed in place) - * commit_rd_index(); + * Get next packet descriptor from iterator + * If at end of list, return NULL and update host. 
*/ -static inline void commit_rd_index(struct vmbus_channel *channel) +static inline struct vmpacket_descriptor * +hv_pkt_iter_next(struct vmbus_channel *channel, + const struct vmpacket_descriptor *pkt) { - struct hv_ring_buffer_info *ring_info = &channel->inbound; - /* - * Make sure all reads are done before we update the read index since - * the writer may start writing to the read area once the read index - * is updated. - */ - virt_rmb(); - ring_info->ring_buffer->read_index = ring_info->priv_read_index; + struct vmpacket_descriptor *nxt; + + nxt = __hv_pkt_iter_next(channel, pkt); + if (!nxt) + hv_pkt_iter_close(channel); - hv_signal_on_read(channel); + return nxt; } +#define foreach_vmbus_pkt(pkt, channel) \ + for (pkt = hv_pkt_iter_first(channel); pkt; \ + pkt = hv_pkt_iter_next(channel, pkt)) #endif /* _HYPERV_H */ -- cgit v1.2.3 From 9b4b5a797cf8a8d904df979891a8de53f2cb9694 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 4 Jan 2017 20:23:51 +0100 Subject: dm table: add flag to allow target to handle its own integrity metadata Add DM_TARGET_INTEGRITY flag that specifies bio integrity metadata is not inherited but implemented in the target itself. Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 11 +++++++++++ include/linux/device-mapper.h | 6 ++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 3ad16d9c9d5a..b0600840e734 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -47,6 +47,7 @@ struct dm_table { bool integrity_supported:1; bool singleton:1; bool all_blk_mq:1; + unsigned integrity_added:1; /* * Indicates the rw permissions for the new logical @@ -725,6 +726,9 @@ int dm_table_add_target(struct dm_table *t, const char *type, t->immutable_target_type = tgt->type; } + if (dm_target_has_integrity(tgt->type)) + t->integrity_added = 1; + tgt->table = t; tgt->begin = start; tgt->len = len; @@ -1168,6 +1172,10 @@ static int dm_table_register_integrity(struct dm_table *t) struct mapped_device *md = t->md; struct gendisk *template_disk = NULL; + /* If target handles integrity itself do not register it here. */ + if (t->integrity_added) + return 0; + template_disk = dm_table_get_integrity_disk(t); if (!template_disk) return 0; @@ -1394,6 +1402,9 @@ static void dm_table_verify_integrity(struct dm_table *t) { struct gendisk *template_disk = NULL; + if (t->integrity_added) + return; + if (t->integrity_supported) { /* * Verify that the original integrity profile diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index a7e6903866fd..874462153f14 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -221,6 +221,12 @@ struct target_type { */ typedef unsigned (*dm_num_write_bios_fn) (struct dm_target *ti, struct bio *bio); +/* + * A target implements own bio data integrity. + */ +#define DM_TARGET_INTEGRITY 0x00000010 +#define dm_target_has_integrity(type) ((type)->features & DM_TARGET_INTEGRITY) + struct dm_target { struct dm_table *table; struct target_type *type; -- cgit v1.2.3 From 0edae0b3ffa6fc968d63932347a4d74b0ad0340b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 3 Mar 2017 12:16:16 -0800 Subject: pstore: Add kernel-doc for struct pstore_info This adds documentation for struct pstore_info, which also includes the basic API the backends need to implement. 
Signed-off-by: Kees Cook --- include/linux/pstore.h | 133 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 128 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 0da29cae009b..56477ce6806a 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -30,7 +30,7 @@ #include #include -/* types */ +/* pstore record types (see fs/pstore/inode.c for filename templates) */ enum pstore_type_id { PSTORE_TYPE_DMESG = 0, PSTORE_TYPE_MCE = 1, @@ -47,14 +47,138 @@ enum pstore_type_id { struct module; +/** + * struct pstore_info - backend pstore driver structure + * + * @owner: module which is responsible for this backend driver + * @name: name of the backend driver + * + * @buf_lock: spinlock to serialize access to @buf + * @buf: preallocated crash dump buffer + * @bufsize: size of @buf available for crash dump writes + * + * @read_mutex: serializes @open, @read, @close, and @erase callbacks + * @flags: bitfield of frontends the backend can accept writes for + * @data: backend-private pointer passed back during callbacks + * + * Callbacks: + * + * @open: + * Notify backend that pstore is starting a full read of backend + * records. Followed by one or more @read calls, and a final @close. + * + * @psi: in: pointer to the struct pstore_info for the backend + * + * Returns 0 on success, and non-zero on error. + * + * @close: + * Notify backend that pstore has finished a full read of backend + * records. Always preceded by an @open call and one or more @read + * calls. + * + * @psi: in: pointer to the struct pstore_info for the backend + * + * Returns 0 on success, and non-zero on error. (Though pstore will + * ignore the error.) + * + * @read: + * Read next available backend record. Called after a successful + * @open. + * + * @id: out: unique identifier for the record + * @type: out: pstore record type + * @count: out: for PSTORE_TYPE_DMESG, the Oops count. + * @time: out: timestamp for the record + * @buf: out: kmalloc copy of record contents, to be freed by pstore + * @compressed: + * out: if the record contents are compressed + * @ecc_notice_size: + * out: ECC information + * @psi: in: pointer to the struct pstore_info for the backend + * + * Returns record size on success, zero when no more records are + * available, or negative on error. + * + * @write: + * Perform a frontend notification of a write to a backend record. The + * data to be stored has already been written to the registered @buf + * of the @psi structure. + * + * @type: in: pstore record type to write + * @reason: + * in: pstore write reason + * @id: out: unique identifier for the record + * @part: in: position in a multipart write + * @count: in: increasing from 0 since boot, the number of this Oops + * @compressed: + * in: if the record is compressed + * @size: in: size of the write + * @psi: in: pointer to the struct pstore_info for the backend + * + * Returns 0 on success, and non-zero on error. + * + * @write_buf: + * Perform a frontend write to a backend record, using a specified + * buffer. Unlike @write, this does not use the @psi @buf.
+ * + * @type: in: pstore record type to write + * @reason: + * in: pstore write reason + * @id: out: unique identifier for the record + * @part: in: position in a multipart write + * @buf: in: pointer to contents to write to backend record + * @compressed: + * in: if the record is compressed + * @size: in: size of the write + * @psi: in: pointer to the struct pstore_info for the backend + * + * Returns 0 on success, and non-zero on error. + * + * @write_buf_user: + * Perform a frontend write to a backend record, using a specified + * buffer that is coming directly from userspace. + * + * @type: in: pstore record type to write + * @reason: + * in: pstore write reason + * @id: out: unique identifier for the record + * @part: in: position in a multipart write + * @buf: in: pointer to userspace contents to write to backend record + * @compressed: + * in: if the record is compressed + * @size: in: size of the write + * @psi: in: pointer to the struct pstore_info for the backend + * + * Returns 0 on success, and non-zero on error. + * + * @erase: + * Delete a record from backend storage. Different backends + * identify records differently, so all possible methods of + * identification are included to help the backend locate the + * record to remove. + * + * @type: in: pstore record type to write + * @id: in: per-type unique identifier for the record + * @count: in: Oops count + * @time: in: timestamp for the record + * @psi: in: pointer to the struct pstore_info for the backend + * + * Returns 0 on success, and non-zero on error. + * + */ struct pstore_info { struct module *owner; char *name; - spinlock_t buf_lock; /* serialize access to 'buf' */ + + spinlock_t buf_lock; char *buf; size_t bufsize; - struct mutex read_mutex; /* serialize open/read/close */ + + struct mutex read_mutex; + int flags; + void *data; + int (*open)(struct pstore_info *psi); int (*close)(struct pstore_info *psi); ssize_t (*read)(u64 *id, enum pstore_type_id *type, @@ -76,11 +200,10 @@ struct pstore_info { int (*erase)(enum pstore_type_id type, u64 id, int count, struct timespec time, struct pstore_info *psi); - void *data; }; +/* Supported frontends */ #define PSTORE_FLAGS_DMESG (1 << 0) -#define PSTORE_FLAGS_FRAGILE PSTORE_FLAGS_DMESG #define PSTORE_FLAGS_CONSOLE (1 << 1) #define PSTORE_FLAGS_FTRACE (1 << 2) #define PSTORE_FLAGS_PMSG (1 << 3) -- cgit v1.2.3 From 9abdcccc3d5f3c72f25cd48160f60d911353bee9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 3 Mar 2017 16:59:29 -0800 Subject: pstore: Extract common arguments into structure The read/mkfile pair pass the same set of arguments, which should be cleared between calls. Move them into a structure and wipe it after every loop.
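The consumer side then collapses to a pattern like this (a sketch distilled from the pstore_get_records() hunk below; psi is the registered backend):

struct pstore_record record = { .psi = psi, };

while ((record.size = psi->read(&record.id, &record.type,
				&record.count, &record.time,
				&record.buf, &record.compressed,
				&record.ecc_notice_size,
				record.psi)) > 0) {
	/* ... decompress and pstore_mkfile() as in the diff ... */

	/* Wipe between iterations so no stale field leaks into the next read. */
	memset(&record, 0, sizeof(record));
	record.psi = psi;
}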
Signed-off-by: Kees Cook --- fs/pstore/platform.c | 55 +++++++++++++++++++++++++++----------------------- include/linux/pstore.h | 28 ++++++++++++++++++++++++- 2 files changed, 57 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 320a673ecb5b..f45228eac3e6 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -766,16 +766,9 @@ EXPORT_SYMBOL_GPL(pstore_unregister); void pstore_get_records(int quiet) { struct pstore_info *psi = psinfo; - char *buf = NULL; - ssize_t size; - u64 id; - int count; - enum pstore_type_id type; - struct timespec time; + struct pstore_record record = { .psi = psi, }; int failed = 0, rc; - bool compressed; int unzipped_len = -1; - ssize_t ecc_notice_size = 0; if (!psi) return; @@ -784,39 +777,51 @@ void pstore_get_records(int quiet) if (psi->open && psi->open(psi)) goto out; - while ((size = psi->read(&id, &type, &count, &time, &buf, &compressed, - &ecc_notice_size, psi)) > 0) { - if (compressed && (type == PSTORE_TYPE_DMESG)) { + while ((record.size = psi->read(&record.id, &record.type, + &record.count, &record.time, + &record.buf, &record.compressed, + &record.ecc_notice_size, + record.psi)) > 0) { + if (record.compressed && + record.type == PSTORE_TYPE_DMESG) { if (big_oops_buf) - unzipped_len = pstore_decompress(buf, - big_oops_buf, size, + unzipped_len = pstore_decompress( + record.buf, + big_oops_buf, + record.size, big_oops_buf_sz); if (unzipped_len > 0) { - if (ecc_notice_size) + if (record.ecc_notice_size) memcpy(big_oops_buf + unzipped_len, - buf + size, ecc_notice_size); - kfree(buf); - buf = big_oops_buf; - size = unzipped_len; - compressed = false; + record.buf + record.size, + record.ecc_notice_size); + kfree(record.buf); + record.buf = big_oops_buf; + record.size = unzipped_len; + record.compressed = false; } else { pr_err("decompression failed;returned %d\n", unzipped_len); - compressed = true; + record.compressed = true; } } - rc = pstore_mkfile(type, psi->name, id, count, buf, - compressed, size + ecc_notice_size, - time, psi); + rc = pstore_mkfile(record.type, psi->name, record.id, + record.count, record.buf, + record.compressed, + record.size + record.ecc_notice_size, + record.time, record.psi); if (unzipped_len < 0) { /* Free buffer other than big oops */ - kfree(buf); - buf = NULL; + kfree(record.buf); + record.buf = NULL; } else unzipped_len = -1; if (rc && (rc != -EEXIST || !quiet)) failed++; + + memset(&record, 0, sizeof(record)); + record.psi = psi; } if (psi->close) psi->close(psi); diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 56477ce6806a..745468072d6e 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -30,6 +30,8 @@ #include #include +struct module; + /* pstore record types (see fs/pstore/inode.c for filename templates) */ enum pstore_type_id { PSTORE_TYPE_DMESG = 0, @@ -45,7 +47,31 @@ enum pstore_type_id { PSTORE_TYPE_UNKNOWN = 255 }; -struct module; +struct pstore_info; +/** + * struct pstore_record - details of a pstore record entry + * @psi: pstore backend driver information + * @type: pstore record type + * @id: per-type unique identifier for record + * @time: timestamp of the record + * @count: for PSTORE_TYPE_DMESG, the Oops count. 
+ * @compressed: for PSTORE_TYPE_DMESG, whether the buffer is compressed + * @buf: pointer to record contents + * @size: size of @buf + * @ecc_notice_size: + * ECC information for @buf + */ +struct pstore_record { + struct pstore_info *psi; + enum pstore_type_id type; + u64 id; + struct timespec time; + int count; + bool compressed; + char *buf; + ssize_t size; + ssize_t ecc_notice_size; +}; /** * struct pstore_info - backend pstore driver structure -- cgit v1.2.3 From 125cc42baf8ab2149c207f8a360ea25668b8422d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 3 Mar 2017 22:09:18 -0800 Subject: pstore: Replace arguments for read() API The argument list for the pstore_read() interface is unwieldy. This change passes the new struct pstore_record instead. The erst backend was already doing something similar internally. Signed-off-by: Kees Cook --- arch/powerpc/kernel/nvram_64.c | 61 +++++++++++----------- drivers/acpi/apei/erst.c | 38 ++++++-------- drivers/firmware/efi/efi-pstore.c | 104 ++++++++++++++++---------------------- fs/pstore/platform.c | 7 +-- fs/pstore/ram.c | 53 ++++++++++--------- include/linux/pstore.h | 20 +++----- 6 files changed, 124 insertions(+), 159 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index d5e2b8309939..7f192001d09a 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -442,10 +442,7 @@ static int nvram_pstore_write(enum pstore_type_id type, * Returns the length of the data we read from each partition. * Returns 0 if we've been called before. */ -static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, - int *count, struct timespec *time, char **buf, - bool *compressed, ssize_t *ecc_notice_size, - struct pstore_info *psi) +static ssize_t nvram_pstore_read(struct pstore_record *record) { struct oops_log_info *oops_hdr; unsigned int err_type, id_no, size = 0; @@ -459,40 +456,40 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, switch (nvram_type_ids[read_type]) { case PSTORE_TYPE_DMESG: part = &oops_log_partition; - *type = PSTORE_TYPE_DMESG; + record->type = PSTORE_TYPE_DMESG; break; case PSTORE_TYPE_PPC_COMMON: sig = NVRAM_SIG_SYS; part = &common_partition; - *type = PSTORE_TYPE_PPC_COMMON; - *id = PSTORE_TYPE_PPC_COMMON; - time->tv_sec = 0; - time->tv_nsec = 0; + record->type = PSTORE_TYPE_PPC_COMMON; + record->id = PSTORE_TYPE_PPC_COMMON; + record->time.tv_sec = 0; + record->time.tv_nsec = 0; break; #ifdef CONFIG_PPC_PSERIES case PSTORE_TYPE_PPC_RTAS: part = &rtas_log_partition; - *type = PSTORE_TYPE_PPC_RTAS; - time->tv_sec = last_rtas_event; - time->tv_nsec = 0; + record->type = PSTORE_TYPE_PPC_RTAS; + record->time.tv_sec = last_rtas_event; + record->time.tv_nsec = 0; break; case PSTORE_TYPE_PPC_OF: sig = NVRAM_SIG_OF; part = &of_config_partition; - *type = PSTORE_TYPE_PPC_OF; - *id = PSTORE_TYPE_PPC_OF; - time->tv_sec = 0; - time->tv_nsec = 0; + record->type = PSTORE_TYPE_PPC_OF; + record->id = PSTORE_TYPE_PPC_OF; + record->time.tv_sec = 0; + record->time.tv_nsec = 0; break; #endif #ifdef CONFIG_PPC_POWERNV case PSTORE_TYPE_PPC_OPAL: sig = NVRAM_SIG_FW; part = &skiboot_partition; - *type = PSTORE_TYPE_PPC_OPAL; - *id = PSTORE_TYPE_PPC_OPAL; - time->tv_sec = 0; - time->tv_nsec = 0; + record->type = PSTORE_TYPE_PPC_OPAL; + record->id = PSTORE_TYPE_PPC_OPAL; + record->time.tv_sec = 0; + record->time.tv_nsec = 0; break; #endif default: @@ -520,10 +517,10 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id
*type, return 0; } - *count = 0; + record->count = 0; if (part->os_partition) - *id = id_no; + record->id = id_no; if (nvram_type_ids[read_type] == PSTORE_TYPE_DMESG) { size_t length, hdr_size; @@ -533,28 +530,28 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, /* Old format oops header had 2-byte record size */ hdr_size = sizeof(u16); length = be16_to_cpu(oops_hdr->version); - time->tv_sec = 0; - time->tv_nsec = 0; + record->time.tv_sec = 0; + record->time.tv_nsec = 0; } else { hdr_size = sizeof(*oops_hdr); length = be16_to_cpu(oops_hdr->report_length); - time->tv_sec = be64_to_cpu(oops_hdr->timestamp); - time->tv_nsec = 0; + record->time.tv_sec = be64_to_cpu(oops_hdr->timestamp); + record->time.tv_nsec = 0; } - *buf = kmemdup(buff + hdr_size, length, GFP_KERNEL); + record->buf = kmemdup(buff + hdr_size, length, GFP_KERNEL); kfree(buff); - if (*buf == NULL) + if (record->buf == NULL) return -ENOMEM; - *ecc_notice_size = 0; + record->ecc_notice_size = 0; if (err_type == ERR_TYPE_KERNEL_PANIC_GZ) - *compressed = true; + record->compressed = true; else - *compressed = false; + record->compressed = false; return length; } - *buf = buff; + record->buf = buff; return part->size; } diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index ec4f507b524f..bbefb7522f80 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -925,10 +925,7 @@ static int erst_check_table(struct acpi_table_erst *erst_tab) static int erst_open_pstore(struct pstore_info *psi); static int erst_close_pstore(struct pstore_info *psi); -static ssize_t erst_reader(u64 *id, enum pstore_type_id *type, int *count, - struct timespec *time, char **buf, - bool *compressed, ssize_t *ecc_notice_size, - struct pstore_info *psi); +static ssize_t erst_reader(struct pstore_record *record); static int erst_writer(enum pstore_type_id type, enum kmsg_dump_reason reason, u64 *id, unsigned int part, int count, bool compressed, size_t size, struct pstore_info *psi); @@ -986,10 +983,7 @@ static int erst_close_pstore(struct pstore_info *psi) return 0; } -static ssize_t erst_reader(u64 *id, enum pstore_type_id *type, int *count, - struct timespec *time, char **buf, - bool *compressed, ssize_t *ecc_notice_size, - struct pstore_info *psi) +static ssize_t erst_reader(struct pstore_record *record) { int rc; ssize_t len = 0; @@ -1027,33 +1021,33 @@ skip: if (uuid_le_cmp(rcd->hdr.creator_id, CPER_CREATOR_PSTORE) != 0) goto skip; - *buf = kmalloc(len, GFP_KERNEL); - if (*buf == NULL) { + record->buf = kmalloc(len, GFP_KERNEL); + if (record->buf == NULL) { rc = -ENOMEM; goto out; } - memcpy(*buf, rcd->data, len - sizeof(*rcd)); - *id = record_id; - *compressed = false; - *ecc_notice_size = 0; + memcpy(record->buf, rcd->data, len - sizeof(*rcd)); + record->id = record_id; + record->compressed = false; + record->ecc_notice_size = 0; if (uuid_le_cmp(rcd->sec_hdr.section_type, CPER_SECTION_TYPE_DMESG_Z) == 0) { - *type = PSTORE_TYPE_DMESG; - *compressed = true; + record->type = PSTORE_TYPE_DMESG; + record->compressed = true; } else if (uuid_le_cmp(rcd->sec_hdr.section_type, CPER_SECTION_TYPE_DMESG) == 0) - *type = PSTORE_TYPE_DMESG; + record->type = PSTORE_TYPE_DMESG; else if (uuid_le_cmp(rcd->sec_hdr.section_type, CPER_SECTION_TYPE_MCE) == 0) - *type = PSTORE_TYPE_MCE; + record->type = PSTORE_TYPE_MCE; else - *type = PSTORE_TYPE_UNKNOWN; + record->type = PSTORE_TYPE_UNKNOWN; if (rcd->hdr.validation_bits & CPER_VALID_TIMESTAMP) - time->tv_sec = rcd->hdr.timestamp; + record->time.tv_sec = rcd->hdr.timestamp; 
else - time->tv_sec = 0; - time->tv_nsec = 0; + record->time.tv_sec = 0; + record->time.tv_nsec = 0; out: kfree(rcd); diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c index f402ba2eed46..bda24129e85b 100644 --- a/drivers/firmware/efi/efi-pstore.c +++ b/drivers/firmware/efi/efi-pstore.c @@ -28,26 +28,16 @@ static int efi_pstore_close(struct pstore_info *psi) return 0; } -struct pstore_read_data { - u64 *id; - enum pstore_type_id *type; - int *count; - struct timespec *timespec; - bool *compressed; - ssize_t *ecc_notice_size; - char **buf; -}; - static inline u64 generic_id(unsigned long timestamp, unsigned int part, int count) { return ((u64) timestamp * 100 + part) * 1000 + count; } -static int efi_pstore_read_func(struct efivar_entry *entry, void *data) +static int efi_pstore_read_func(struct efivar_entry *entry, + struct pstore_record *record) { efi_guid_t vendor = LINUX_EFI_CRASH_GUID; - struct pstore_read_data *cb_data = data; char name[DUMP_NAME_LEN], data_type; int i; int cnt; @@ -61,37 +51,37 @@ static int efi_pstore_read_func(struct efivar_entry *entry, void *data) name[i] = entry->var.VariableName[i]; if (sscanf(name, "dump-type%u-%u-%d-%lu-%c", - cb_data->type, &part, &cnt, &time, &data_type) == 5) { - *cb_data->id = generic_id(time, part, cnt); - *cb_data->count = cnt; - cb_data->timespec->tv_sec = time; - cb_data->timespec->tv_nsec = 0; + &record->type, &part, &cnt, &time, &data_type) == 5) { + record->id = generic_id(time, part, cnt); + record->count = cnt; + record->time.tv_sec = time; + record->time.tv_nsec = 0; if (data_type == 'C') - *cb_data->compressed = true; + record->compressed = true; else - *cb_data->compressed = false; - *cb_data->ecc_notice_size = 0; + record->compressed = false; + record->ecc_notice_size = 0; } else if (sscanf(name, "dump-type%u-%u-%d-%lu", - cb_data->type, &part, &cnt, &time) == 4) { - *cb_data->id = generic_id(time, part, cnt); - *cb_data->count = cnt; - cb_data->timespec->tv_sec = time; - cb_data->timespec->tv_nsec = 0; - *cb_data->compressed = false; - *cb_data->ecc_notice_size = 0; + &record->type, &part, &cnt, &time) == 4) { + record->id = generic_id(time, part, cnt); + record->count = cnt; + record->time.tv_sec = time; + record->time.tv_nsec = 0; + record->compressed = false; + record->ecc_notice_size = 0; } else if (sscanf(name, "dump-type%u-%u-%lu", - cb_data->type, &part, &time) == 3) { + &record->type, &part, &time) == 3) { /* * Check if an old format, * which doesn't support holding * multiple logs, remains. 
*/ - *cb_data->id = generic_id(time, part, 0); - *cb_data->count = 0; - cb_data->timespec->tv_sec = time; - cb_data->timespec->tv_nsec = 0; - *cb_data->compressed = false; - *cb_data->ecc_notice_size = 0; + record->id = generic_id(time, part, 0); + record->count = 0; + record->time.tv_sec = time; + record->time.tv_nsec = 0; + record->compressed = false; + record->ecc_notice_size = 0; } else return 0; @@ -99,7 +89,7 @@ static int efi_pstore_read_func(struct efivar_entry *entry, void *data) __efivar_entry_get(entry, &entry->var.Attributes, &entry->var.DataSize, entry->var.Data); size = entry->var.DataSize; - memcpy(*cb_data->buf, entry->var.Data, + memcpy(record->buf, entry->var.Data, (size_t)min_t(unsigned long, EFIVARS_DATA_SIZE_MAX, size)); return size; @@ -164,7 +154,7 @@ static int efi_pstore_scan_sysfs_exit(struct efivar_entry *pos, /** * efi_pstore_sysfs_entry_iter * - * @data: function-specific data to pass to callback + * @record: pstore record to pass to callback * @pos: entry to begin iterating from * * You MUST call efivar_enter_iter_begin() before this function, and @@ -175,7 +165,8 @@ static int efi_pstore_scan_sysfs_exit(struct efivar_entry *pos, * the next entry of the last one passed to efi_pstore_read_func(). * To begin iterating from the beginning of the list @pos must be %NULL. */ -static int efi_pstore_sysfs_entry_iter(void *data, struct efivar_entry **pos) +static int efi_pstore_sysfs_entry_iter(struct pstore_record *record, + struct efivar_entry **pos) { struct efivar_entry *entry, *n; struct list_head *head = &efivar_sysfs_list; @@ -186,7 +177,7 @@ static int efi_pstore_sysfs_entry_iter(void *data, struct efivar_entry **pos) list_for_each_entry_safe(entry, n, head, list) { efi_pstore_scan_sysfs_enter(entry, n, head); - size = efi_pstore_read_func(entry, data); + size = efi_pstore_read_func(entry, record); ret = efi_pstore_scan_sysfs_exit(entry, n, head, size < 0); if (ret) @@ -201,7 +192,7 @@ static int efi_pstore_sysfs_entry_iter(void *data, struct efivar_entry **pos) list_for_each_entry_safe_from((*pos), n, head, list) { efi_pstore_scan_sysfs_enter((*pos), n, head); - size = efi_pstore_read_func((*pos), data); + size = efi_pstore_read_func((*pos), record); ret = efi_pstore_scan_sysfs_exit((*pos), n, head, size < 0); if (ret) return ret; @@ -225,36 +216,27 @@ static int efi_pstore_sysfs_entry_iter(void *data, struct efivar_entry **pos) * size < 0: Failed to get data of entry logging via efi_pstore_write(), * and pstore will stop reading entry. 
*/ -static ssize_t efi_pstore_read(u64 *id, enum pstore_type_id *type, - int *count, struct timespec *timespec, - char **buf, bool *compressed, - ssize_t *ecc_notice_size, - struct pstore_info *psi) +static ssize_t efi_pstore_read(struct pstore_record *record) { - struct pstore_read_data data; + struct efivar_entry *entry = (struct efivar_entry *)record->psi->data; ssize_t size; - data.id = id; - data.type = type; - data.count = count; - data.timespec = timespec; - data.compressed = compressed; - data.ecc_notice_size = ecc_notice_size; - data.buf = buf; - - *data.buf = kzalloc(EFIVARS_DATA_SIZE_MAX, GFP_KERNEL); - if (!*data.buf) + record->buf = kzalloc(EFIVARS_DATA_SIZE_MAX, GFP_KERNEL); + if (!record->buf) return -ENOMEM; if (efivar_entry_iter_begin()) { - kfree(*data.buf); - return -EINTR; + size = -EINTR; + goto out; } - size = efi_pstore_sysfs_entry_iter(&data, - (struct efivar_entry **)&psi->data); + size = efi_pstore_sysfs_entry_iter(record, &entry); efivar_entry_iter_end(); - if (size <= 0) - kfree(*data.buf); + +out: + if (size <= 0) { + kfree(record->buf); + record->buf = NULL; + } return size; } diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 168e03fd5e58..47968c2f2d0d 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -807,12 +807,7 @@ void pstore_get_records(int quiet) if (psi->open && psi->open(psi)) goto out; - while ((record.size = psi->read(&record.id, &record.type, - &record.count, &record.time, - &record.buf, &record.compressed, - &record.ecc_notice_size, - record.psi)) > 0) { - + while ((record.size = psi->read(&record)) > 0) { decompress_record(&record); rc = pstore_mkfile(&record); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 11f918d34b1e..ca6e2a814e37 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -235,35 +235,34 @@ static ssize_t ftrace_log_combine(struct persistent_ram_zone *dest, return 0; } -static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, - int *count, struct timespec *time, - char **buf, bool *compressed, - ssize_t *ecc_notice_size, - struct pstore_info *psi) +static ssize_t ramoops_pstore_read(struct pstore_record *record) { ssize_t size = 0; - struct ramoops_context *cxt = psi->data; + struct ramoops_context *cxt = record->psi->data; struct persistent_ram_zone *prz = NULL; int header_length = 0; bool free_prz = false; - /* Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but + /* + * Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but * PSTORE_TYPE_CONSOLE and PSTORE_TYPE_FTRACE don't currently have * valid time stamps, so it is initialized to zero. 
*/ - time->tv_sec = 0; - time->tv_nsec = 0; - *compressed = false; + record->time.tv_sec = 0; + record->time.tv_nsec = 0; + record->compressed = false; /* Find the next valid persistent_ram_zone for DMESG */ while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) { prz = ramoops_get_next_prz(cxt->dprzs, &cxt->dump_read_cnt, - cxt->max_dump_cnt, id, type, + cxt->max_dump_cnt, &record->id, + &record->type, PSTORE_TYPE_DMESG, 1); if (!prz_ok(prz)) continue; header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz), - time, compressed); + &record->time, + &record->compressed); /* Clear and skip this DMESG record if it has no valid header */ if (!header_length) { persistent_ram_free_old(prz); @@ -274,18 +273,20 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, if (!prz_ok(prz)) prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt, - 1, id, type, PSTORE_TYPE_CONSOLE, 0); + 1, &record->id, &record->type, + PSTORE_TYPE_CONSOLE, 0); if (!prz_ok(prz)) prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt, - 1, id, type, PSTORE_TYPE_PMSG, 0); + 1, &record->id, &record->type, + PSTORE_TYPE_PMSG, 0); /* ftrace is last since it may want to dynamically allocate memory. */ if (!prz_ok(prz)) { if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)) { prz = ramoops_get_next_prz(cxt->fprzs, - &cxt->ftrace_read_cnt, 1, id, type, - PSTORE_TYPE_FTRACE, 0); + &cxt->ftrace_read_cnt, 1, &record->id, + &record->type, PSTORE_TYPE_FTRACE, 0); } else { /* * Build a new dummy record which combines all the @@ -302,8 +303,10 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, while (cxt->ftrace_read_cnt < cxt->max_ftrace_cnt) { prz_next = ramoops_get_next_prz(cxt->fprzs, &cxt->ftrace_read_cnt, - cxt->max_ftrace_cnt, id, - type, PSTORE_TYPE_FTRACE, 0); + cxt->max_ftrace_cnt, + &record->id, + &record->type, + PSTORE_TYPE_FTRACE, 0); if (!prz_ok(prz_next)) continue; @@ -316,7 +319,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, if (size) goto out; } - *id = 0; + record->id = 0; prz = tmp_prz; } } @@ -329,17 +332,19 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, size = persistent_ram_old_size(prz) - header_length; /* ECC correction notice */ - *ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); + record->ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); - *buf = kmalloc(size + *ecc_notice_size + 1, GFP_KERNEL); - if (*buf == NULL) { + record->buf = kmalloc(size + record->ecc_notice_size + 1, GFP_KERNEL); + if (record->buf == NULL) { size = -ENOMEM; goto out; } - memcpy(*buf, (char *)persistent_ram_old(prz) + header_length, size); + memcpy(record->buf, (char *)persistent_ram_old(prz) + header_length, + size); - persistent_ram_ecc_string(prz, *buf + size, *ecc_notice_size + 1); + persistent_ram_ecc_string(prz, record->buf + size, + record->ecc_notice_size + 1); out: if (free_prz) { diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 745468072d6e..22a46ebbe041 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -111,16 +111,11 @@ struct pstore_record { * Read next available backend record. Called after a successful * @open. * - * @id: out: unique identifier for the record - * @type: out: pstore record type - * @count: out: for PSTORE_TYPE_DMESG, the Oops count. 
- * @time: out: timestamp for the record - * @buf: out: kmalloc copy of record contents, to be freed by pstore - * @compressed: - * out: if the record contents are compressed - * @ecc_notice_size: - * out: ECC information - * @psi: in: pointer to the struct pstore_info for the backend + * @record: + * pointer to record to populate. @buf should be allocated + * by the backend and filled. At least @type and @id should + * be populated, since these are used when creating pstorefs + * file names. * * Returns record size on success, zero when no more records are * available, or negative on error. @@ -207,10 +202,7 @@ struct pstore_info { int (*open)(struct pstore_info *psi); int (*close)(struct pstore_info *psi); - ssize_t (*read)(u64 *id, enum pstore_type_id *type, - int *count, struct timespec *time, char **buf, - bool *compressed, ssize_t *ecc_notice_size, - struct pstore_info *psi); + ssize_t (*read)(struct pstore_record *record); int (*write)(enum pstore_type_id type, enum kmsg_dump_reason reason, u64 *id, unsigned int part, int count, bool compressed, -- cgit v1.2.3 From 76cc9580e3fbd323651d06e8184a5a54e0e1066e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 3 Mar 2017 23:28:53 -0800 Subject: pstore: Replace arguments for write() API Similar to the pstore_info read() callback, there were too many arguments. This switches to the new struct pstore_record pointer instead. This adds "reason" and "part" to the record structure as well. Signed-off-by: Kees Cook --- arch/powerpc/kernel/nvram_64.c | 27 +++++------------ drivers/acpi/apei/erst.c | 18 +++++------- drivers/firmware/efi/efi-pstore.c | 18 +++++------- fs/pstore/platform.c | 62 ++++++++++++++++++++++----------------- include/linux/pstore.h | 36 +++++++++++------------ 5 files changed, 76 insertions(+), 85 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 7f192001d09a..caf2e1f36d6b 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -389,51 +389,40 @@ static int nvram_pstore_open(struct pstore_info *psi) /** * nvram_pstore_write - pstore write callback for nvram - * @type: Type of message logged - * @reason: reason behind dump (oops/panic) - * @id: identifier to indicate the write performed - * @part: pstore writes data to registered buffer in parts, - * part number will indicate the same. - * @count: Indicates oops count - * @compressed: Flag to indicate the log is compressed - * @size: number of bytes written to the registered buffer - * @psi: registered pstore_info structure + * @record: pstore record to write, with @id to be set * * Called by pstore_dump() when an oops or panic report is logged in the * printk buffer. * Returns 0 on successful write. 
*/ -static int nvram_pstore_write(enum pstore_type_id type, - enum kmsg_dump_reason reason, - u64 *id, unsigned int part, int count, - bool compressed, size_t size, - struct pstore_info *psi) +static int nvram_pstore_write(struct pstore_record *record) { int rc; unsigned int err_type = ERR_TYPE_KERNEL_PANIC; struct oops_log_info *oops_hdr = (struct oops_log_info *) oops_buf; /* part 1 has the recent messages from printk buffer */ - if (part > 1 || (type != PSTORE_TYPE_DMESG)) + if (record->part > 1 || (record->type != PSTORE_TYPE_DMESG)) return -1; if (clobbering_unread_rtas_event()) return -1; oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION); - oops_hdr->report_length = cpu_to_be16(size); + oops_hdr->report_length = cpu_to_be16(record->size); oops_hdr->timestamp = cpu_to_be64(ktime_get_real_seconds()); - if (compressed) + if (record->compressed) err_type = ERR_TYPE_KERNEL_PANIC_GZ; rc = nvram_write_os_partition(&oops_log_partition, oops_buf, - (int) (sizeof(*oops_hdr) + size), err_type, count); + (int) (sizeof(*oops_hdr) + record->size), err_type, + record->count); if (rc != 0) return rc; - *id = part; + record->id = record->part; return 0; } diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index bbefb7522f80..440588d189e7 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -926,9 +926,7 @@ static int erst_check_table(struct acpi_table_erst *erst_tab) static int erst_open_pstore(struct pstore_info *psi); static int erst_close_pstore(struct pstore_info *psi); static ssize_t erst_reader(struct pstore_record *record); -static int erst_writer(enum pstore_type_id type, enum kmsg_dump_reason reason, - u64 *id, unsigned int part, int count, bool compressed, - size_t size, struct pstore_info *psi); +static int erst_writer(struct pstore_record *record); static int erst_clearer(enum pstore_type_id type, u64 id, int count, struct timespec time, struct pstore_info *psi); @@ -1054,9 +1052,7 @@ out: return (rc < 0) ? rc : (len - sizeof(*rcd)); } -static int erst_writer(enum pstore_type_id type, enum kmsg_dump_reason reason, - u64 *id, unsigned int part, int count, bool compressed, - size_t size, struct pstore_info *psi) +static int erst_writer(struct pstore_record *record) { struct cper_pstore_record *rcd = (struct cper_pstore_record *) (erst_info.buf - sizeof(*rcd)); @@ -1071,21 +1067,21 @@ static int erst_writer(enum pstore_type_id type, enum kmsg_dump_reason reason, /* timestamp valid. 
platform_id, partition_id are invalid */ rcd->hdr.validation_bits = CPER_VALID_TIMESTAMP; rcd->hdr.timestamp = get_seconds(); - rcd->hdr.record_length = sizeof(*rcd) + size; + rcd->hdr.record_length = sizeof(*rcd) + record->size; rcd->hdr.creator_id = CPER_CREATOR_PSTORE; rcd->hdr.notification_type = CPER_NOTIFY_MCE; rcd->hdr.record_id = cper_next_record_id(); rcd->hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR; rcd->sec_hdr.section_offset = sizeof(*rcd); - rcd->sec_hdr.section_length = size; + rcd->sec_hdr.section_length = record->size; rcd->sec_hdr.revision = CPER_SEC_REV; /* fru_id and fru_text is invalid */ rcd->sec_hdr.validation_bits = 0; rcd->sec_hdr.flags = CPER_SEC_PRIMARY; - switch (type) { + switch (record->type) { case PSTORE_TYPE_DMESG: - if (compressed) + if (record->compressed) rcd->sec_hdr.section_type = CPER_SECTION_TYPE_DMESG_Z; else rcd->sec_hdr.section_type = CPER_SECTION_TYPE_DMESG; @@ -1099,7 +1095,7 @@ static int erst_writer(enum pstore_type_id type, enum kmsg_dump_reason reason, rcd->sec_hdr.section_severity = CPER_SEV_FATAL; ret = erst_write(&rcd->hdr); - *id = rcd->hdr.record_id; + record->id = rcd->hdr.record_id; return ret; } diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c index bda24129e85b..f81e3ec6f1c0 100644 --- a/drivers/firmware/efi/efi-pstore.c +++ b/drivers/firmware/efi/efi-pstore.c @@ -240,30 +240,28 @@ out: return size; } -static int efi_pstore_write(enum pstore_type_id type, - enum kmsg_dump_reason reason, u64 *id, - unsigned int part, int count, bool compressed, size_t size, - struct pstore_info *psi) +static int efi_pstore_write(struct pstore_record *record) { char name[DUMP_NAME_LEN]; efi_char16_t efi_name[DUMP_NAME_LEN]; efi_guid_t vendor = LINUX_EFI_CRASH_GUID; int i, ret = 0; - sprintf(name, "dump-type%u-%u-%d-%lu-%c", type, part, count, - get_seconds(), compressed ? 'C' : 'D'); + snprintf(name, sizeof(name), "dump-type%u-%u-%d-%lu-%c", + record->type, record->part, record->count, + get_seconds(), record->compressed ? 
'C' : 'D'); for (i = 0; i < DUMP_NAME_LEN; i++) efi_name[i] = name[i]; efivar_entry_set_safe(efi_name, vendor, PSTORE_EFI_ATTRIBUTES, - !pstore_cannot_block_path(reason), - size, psi->buf); + !pstore_cannot_block_path(record->reason), + record->size, record->psi->buf); - if (reason == KMSG_DUMP_OOPS) + if (record->reason == KMSG_DUMP_OOPS) efivar_run_worker(); - *id = part; + record->id = record->part; return ret; }; diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 47968c2f2d0d..879658b4c679 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -484,7 +484,6 @@ static void pstore_dump(struct kmsg_dumper *dumper, { unsigned long total = 0; const char *why; - u64 id; unsigned int part = 1; unsigned long flags = 0; int is_locked; @@ -506,48 +505,59 @@ static void pstore_dump(struct kmsg_dumper *dumper, oopscount++; while (total < kmsg_bytes) { char *dst; - unsigned long size; - int hsize; + size_t dst_size; + int header_size; int zipped_len = -1; - size_t len; - bool compressed = false; - size_t total_len; + size_t dump_size; + struct pstore_record record = { + .type = PSTORE_TYPE_DMESG, + .count = oopscount, + .reason = reason, + .part = part, + .compressed = false, + .buf = psinfo->buf, + .psi = psinfo, + }; if (big_oops_buf && is_locked) { dst = big_oops_buf; - size = big_oops_buf_sz; + dst_size = big_oops_buf_sz; } else { dst = psinfo->buf; - size = psinfo->bufsize; + dst_size = psinfo->bufsize; } - hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount, part); - size -= hsize; + /* Write dump header. */ + header_size = snprintf(dst, dst_size, "%s#%d Part%u\n", why, + oopscount, part); + dst_size -= header_size; - if (!kmsg_dump_get_buffer(dumper, true, dst + hsize, - size, &len)) + /* Write dump contents. */ + if (!kmsg_dump_get_buffer(dumper, true, dst + header_size, + dst_size, &dump_size)) break; if (big_oops_buf && is_locked) { zipped_len = pstore_compress(dst, psinfo->buf, - hsize + len, psinfo->bufsize); + header_size + dump_size, + psinfo->bufsize); if (zipped_len > 0) { - compressed = true; - total_len = zipped_len; + record.compressed = true; + record.size = zipped_len; } else { - total_len = copy_kmsg_to_buffer(hsize, len); + record.size = copy_kmsg_to_buffer(header_size, + dump_size); } } else { - total_len = hsize + len; + record.size = header_size + dump_size; } - ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, - oopscount, compressed, total_len, psinfo); + ret = psinfo->write(&record); if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) pstore_new_entry = 1; - total += total_len; + total += record.size; part++; } if (is_locked) @@ -618,14 +628,12 @@ static void pstore_register_console(void) {} static void pstore_unregister_console(void) {} #endif -static int pstore_write_compat(enum pstore_type_id type, - enum kmsg_dump_reason reason, - u64 *id, unsigned int part, int count, - bool compressed, size_t size, - struct pstore_info *psi) +static int pstore_write_compat(struct pstore_record *record) { - return psi->write_buf(type, reason, id, part, psinfo->buf, compressed, - size, psi); + return record->psi->write_buf(record->type, record->reason, + &record->id, record->part, + psinfo->buf, record->compressed, + record->size, record->psi); } static int pstore_write_buf_user_compat(enum pstore_type_id type, diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 22a46ebbe041..9335f75c3ddb 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -54,23 +54,32 @@ struct pstore_info; * @type: pstore record type * 
@id: per-type unique identifier for record * @time: timestamp of the record - * @count: for PSTORE_TYPE_DMESG, the Oops count. - * @compressed: for PSTORE_TYPE_DMESG, whether the buffer is compressed * @buf: pointer to record contents * @size: size of @buf * @ecc_notice_size: * ECC information for @buf + * + * Valid for PSTORE_TYPE_DMESG @type: + * + * @count: Oops count since boot + * @reason: kdump reason for notification + * @part: position in a multipart record + * @compressed: whether the buffer is compressed + * */ struct pstore_record { struct pstore_info *psi; enum pstore_type_id type; u64 id; struct timespec time; - int count; - bool compressed; char *buf; ssize_t size; ssize_t ecc_notice_size; + + int count; + enum kmsg_dump_reason reason; + unsigned int part; + bool compressed; }; /** @@ -125,16 +134,10 @@ struct pstore_record { * data to be stored has already been written to the registered @buf * of the @psi structure. * - * @type: in: pstore record type to write - * @reason: - * in: pstore write reason - * @id: out: unique identifier for the record - * @part: in: position in a multipart write - * @count: in: increasing from 0 since boot, the number of this Oops - * @compressed: - * in: if the record is compressed - * @size: in: size of the write - * @psi: in: pointer to the struct pstore_info for the backend + * @record: + * pointer to record metadata. Note that @buf is NULL, since + * the @buf registered with @psi is what has been written. The + * backend is expected to update @id. * * Returns 0 on success, and non-zero on error. * @@ -203,10 +206,7 @@ struct pstore_info { int (*open)(struct pstore_info *psi); int (*close)(struct pstore_info *psi); ssize_t (*read)(struct pstore_record *record); - int (*write)(enum pstore_type_id type, - enum kmsg_dump_reason reason, u64 *id, - unsigned int part, int count, bool compressed, - size_t size, struct pstore_info *psi); + int (*write)(struct pstore_record *record); int (*write_buf)(enum pstore_type_id type, enum kmsg_dump_reason reason, u64 *id, unsigned int part, const char *buf, bool compressed, -- cgit v1.2.3 From a61072aae693ba08390f92eed1dd0573fa5c3cd9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 4 Mar 2017 23:31:19 -0800 Subject: pstore: Replace arguments for erase() API This removes the argument list for the erase() callback and replaces it with a pointer to the backend record details to be removed. 
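The callback signature change, condensed from the pstore.h hunk below, with the erst backend as the minimal example of a converted implementation:

    /* Before: */
    int (*erase)(enum pstore_type_id type, u64 id, int count,
                 struct timespec time, struct pstore_info *psi);

    /* After: */
    int (*erase)(struct pstore_record *record);

    static int erst_clearer(struct pstore_record *record)
    {
            return erst_clear(record->id);
    }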
Signed-off-by: Kees Cook --- drivers/acpi/apei/erst.c | 8 +++----- drivers/firmware/efi/efi-pstore.c | 26 +++++++++++--------------- fs/pstore/inode.c | 12 +++++------- fs/pstore/ram.c | 15 +++++++-------- include/linux/pstore.h | 16 +++++----------- 5 files changed, 31 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index 440588d189e7..7207e5fc9d3d 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -927,8 +927,7 @@ static int erst_open_pstore(struct pstore_info *psi); static int erst_close_pstore(struct pstore_info *psi); static ssize_t erst_reader(struct pstore_record *record); static int erst_writer(struct pstore_record *record); -static int erst_clearer(enum pstore_type_id type, u64 id, int count, - struct timespec time, struct pstore_info *psi); +static int erst_clearer(struct pstore_record *record); static struct pstore_info erst_info = { .owner = THIS_MODULE, @@ -1100,10 +1099,9 @@ static int erst_writer(struct pstore_record *record) return ret; } -static int erst_clearer(enum pstore_type_id type, u64 id, int count, - struct timespec time, struct pstore_info *psi) +static int erst_clearer(struct pstore_record *record) { - return erst_clear(id); + return erst_clear(record->id); } static int __init erst_init(void) diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c index f81e3ec6f1c0..93d8cdbe7ef4 100644 --- a/drivers/firmware/efi/efi-pstore.c +++ b/drivers/firmware/efi/efi-pstore.c @@ -266,10 +266,7 @@ static int efi_pstore_write(struct pstore_record *record) }; struct pstore_erase_data { - u64 id; - enum pstore_type_id type; - int count; - struct timespec time; + struct pstore_record *record; efi_char16_t *name; }; @@ -295,8 +292,9 @@ static int efi_pstore_erase_func(struct efivar_entry *entry, void *data) * Check if an old format, which doesn't support * holding multiple logs, remains. 
*/ - sprintf(name_old, "dump-type%u-%u-%lu", ed->type, - (unsigned int)ed->id, ed->time.tv_sec); + snprintf(name_old, sizeof(name_old), "dump-type%u-%u-%lu", + ed->record->type, (unsigned int)ed->record->id, + ed->record->time.tv_sec); for (i = 0; i < DUMP_NAME_LEN; i++) efi_name_old[i] = name_old[i]; @@ -321,8 +319,7 @@ static int efi_pstore_erase_func(struct efivar_entry *entry, void *data) return 1; } -static int efi_pstore_erase(enum pstore_type_id type, u64 id, int count, - struct timespec time, struct pstore_info *psi) +static int efi_pstore_erase(struct pstore_record *record) { struct pstore_erase_data edata; struct efivar_entry *entry = NULL; @@ -331,17 +328,16 @@ static int efi_pstore_erase(enum pstore_type_id type, u64 id, int count, int found, i; unsigned int part; - do_div(id, 1000); - part = do_div(id, 100); - sprintf(name, "dump-type%u-%u-%d-%lu", type, part, count, time.tv_sec); + do_div(record->id, 1000); + part = do_div(record->id, 100); + snprintf(name, sizeof(name), "dump-type%u-%u-%d-%lu", + record->type, record->part, record->count, + record->time.tv_sec); for (i = 0; i < DUMP_NAME_LEN; i++) efi_name[i] = name[i]; - edata.id = part; - edata.type = type; - edata.count = count; - edata.time = time; + edata.record = record; edata.name = efi_name; if (efivar_entry_iter_begin()) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 0ea281b457fa..06504b69575b 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -210,14 +210,12 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) if (err) return err; - if (record->psi->erase) { - mutex_lock(&record->psi->read_mutex); - record->psi->erase(record->type, record->id, record->count, - d_inode(dentry)->i_ctime, record->psi); - mutex_unlock(&record->psi->read_mutex); - } else { + if (!record->psi->erase) return -EPERM; - } + + mutex_lock(&record->psi->read_mutex); + record->psi->erase(record); + mutex_unlock(&record->psi->read_mutex); return simple_unlink(dir, dentry); } diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index ca6e2a814e37..a18575fe32e9 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -469,25 +469,24 @@ static int notrace ramoops_pstore_write_buf_user(enum pstore_type_id type, return -EINVAL; } -static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count, - struct timespec time, struct pstore_info *psi) +static int ramoops_pstore_erase(struct pstore_record *record) { - struct ramoops_context *cxt = psi->data; + struct ramoops_context *cxt = record->psi->data; struct persistent_ram_zone *prz; - switch (type) { + switch (record->type) { case PSTORE_TYPE_DMESG: - if (id >= cxt->max_dump_cnt) + if (record->id >= cxt->max_dump_cnt) return -EINVAL; - prz = cxt->dprzs[id]; + prz = cxt->dprzs[record->id]; break; case PSTORE_TYPE_CONSOLE: prz = cxt->cprz; break; case PSTORE_TYPE_FTRACE: - if (id >= cxt->max_ftrace_cnt) + if (record->id >= cxt->max_ftrace_cnt) return -EINVAL; - prz = cxt->fprzs[id]; + prz = cxt->fprzs[record->id]; break; case PSTORE_TYPE_PMSG: prz = cxt->mprz; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 9335f75c3ddb..2cd1979d1f9a 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -177,15 +177,11 @@ struct pstore_record { * * @erase: * Delete a record from backend storage. Different backends - * identify records differently, so all possible methods of - * identification are included to help the backend locate the - * record to remove. 
+ * identify records differently, so the entire original record is + * passed back to assist in identification of what the backend + * should remove from storage. + * - * @type: in: pstore record type to write - * @id: in: per-type unique identifier for the record - * @count: in: Oops count - * @time: in: timestamp for the record - * @psi: in: pointer to the struct pstore_info for the backend + * @record: pointer to record metadata. * * Returns 0 on success, and non-zero on error. * @@ -215,9 +211,7 @@ struct pstore_info { enum kmsg_dump_reason reason, u64 *id, unsigned int part, const char __user *buf, bool compressed, size_t size, struct pstore_info *psi); - int (*erase)(enum pstore_type_id type, u64 id, - int count, struct timespec time, - struct pstore_info *psi); + int (*erase)(struct pstore_record *record); }; /* Supported frontends */ -- cgit v1.2.3 From b10b471145f28c219d9ddcc309a67e053776865a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 5 Mar 2017 00:27:54 -0800 Subject: pstore: Replace arguments for write_buf() API As with the other API updates, this removes the long argument list in favor of passing a single pstore record. Signed-off-by: Kees Cook --- fs/pstore/ftrace.c | 9 +++++++-- fs/pstore/platform.c | 30 +++++++++++++++++++++--------- fs/pstore/ram.c | 44 ++++++++++++++++++++++---------------------- include/linux/pstore.h | 21 +++++----------------- 4 files changed, 55 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index 899d0ba0bd6c..a5506ec6995e 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -37,6 +37,12 @@ static void notrace pstore_ftrace_call(unsigned long ip, { unsigned long flags; struct pstore_ftrace_record rec = {}; + struct pstore_record record = { + .type = PSTORE_TYPE_FTRACE, + .buf = (char *)&rec, + .size = sizeof(rec), + .psi = psinfo, + }; if (unlikely(oops_in_progress)) return; @@ -47,8 +53,7 @@ static void notrace pstore_ftrace_call(unsigned long ip, rec.parent_ip = parent_ip; pstore_ftrace_write_timestamp(&rec, pstore_ftrace_stamp++); pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id()); - psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec, - 0, sizeof(rec), psinfo); + psinfo->write_buf(&record); local_irq_restore(flags); } diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index aa3d6e572ede..5eecf9012459 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -587,8 +587,11 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) const char *e = s + c; while (s < e) { + struct pstore_record record = { + .type = PSTORE_TYPE_CONSOLE, + .psi = psinfo, + }; unsigned long flags; - u64 id; if (c > psinfo->bufsize) c = psinfo->bufsize; @@ -599,8 +602,9 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) } else { spin_lock_irqsave(&psinfo->buf_lock, flags); } - psinfo->write_buf(PSTORE_TYPE_CONSOLE, 0, &id, 0, - s, 0, c, psinfo); + record.buf = (char *)s; + record.size = c; + psinfo->write_buf(&record); spin_unlock_irqrestore(&psinfo->buf_lock, flags); s += c; c = e - s; @@ -630,10 +634,9 @@ static void pstore_unregister_console(void) {} static int pstore_write_compat(struct pstore_record *record) { - return record->psi->write_buf(record->type, record->reason, - &record->id, record->part, - psinfo->buf, record->compressed, - record->size, record->psi); + record->buf = psinfo->buf; + + return record->psi->write_buf(record); } static int pstore_write_buf_user_compat(enum pstore_type_id type, @@
-653,6 +656,15 @@ static int pstore_write_buf_user_compat(enum pstore_type_id type, bufsize = psinfo->bufsize; spin_lock_irqsave(&psinfo->buf_lock, flags); for (i = 0; i < size; ) { + struct pstore_record record = { + .type = type, + .reason = reason, + .id = id, + .part = part, + .buf = psinfo->buf, + .compressed = compressed, + .psi = psi, + }; size_t c = min(size - i, bufsize); ret = __copy_from_user(psinfo->buf, buf + i, c); @@ -660,8 +672,8 @@ static int pstore_write_buf_user_compat(enum pstore_type_id type, ret = -EFAULT; break; } - ret = psi->write_buf(type, reason, id, part, psinfo->buf, - compressed, c, psi); + record.size = c; + ret = psi->write_buf(&record); if (unlikely(ret < 0)) break; i += c; diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index a18575fe32e9..a7cdde60b1f9 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -378,23 +378,18 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz, return len; } -static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, - enum kmsg_dump_reason reason, - u64 *id, unsigned int part, - const char *buf, - bool compressed, size_t size, - struct pstore_info *psi) +static int notrace ramoops_pstore_write_buf(struct pstore_record *record) { - struct ramoops_context *cxt = psi->data; + struct ramoops_context *cxt = record->psi->data; struct persistent_ram_zone *prz; - size_t hlen; + size_t size, hlen; - if (type == PSTORE_TYPE_CONSOLE) { + if (record->type == PSTORE_TYPE_CONSOLE) { if (!cxt->cprz) return -ENOMEM; - persistent_ram_write(cxt->cprz, buf, size); + persistent_ram_write(cxt->cprz, record->buf, record->size); return 0; - } else if (type == PSTORE_TYPE_FTRACE) { + } else if (record->type == PSTORE_TYPE_FTRACE) { int zonenum; if (!cxt->fprzs) @@ -407,33 +402,36 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, else zonenum = 0; - persistent_ram_write(cxt->fprzs[zonenum], buf, size); + persistent_ram_write(cxt->fprzs[zonenum], record->buf, + record->size); return 0; - } else if (type == PSTORE_TYPE_PMSG) { + } else if (record->type == PSTORE_TYPE_PMSG) { pr_warn_ratelimited("PMSG shouldn't call %s\n", __func__); return -EINVAL; } - if (type != PSTORE_TYPE_DMESG) + if (record->type != PSTORE_TYPE_DMESG) return -EINVAL; - /* Out of the various dmesg dump types, ramoops is currently designed + /* + * Out of the various dmesg dump types, ramoops is currently designed * to only store crash logs, rather than storing general kernel logs. */ - if (reason != KMSG_DUMP_OOPS && - reason != KMSG_DUMP_PANIC) + if (record->reason != KMSG_DUMP_OOPS && + record->reason != KMSG_DUMP_PANIC) return -EINVAL; /* Skip Oopes when configured to do so. */ - if (reason == KMSG_DUMP_OOPS && !cxt->dump_oops) + if (record->reason == KMSG_DUMP_OOPS && !cxt->dump_oops) return -EINVAL; - /* Explicitly only take the first part of any new crash. + /* + * Explicitly only take the first part of any new crash. * If our buffer is larger than kmsg_bytes, this can never happen, * and if our buffer is smaller than kmsg_bytes, we don't want the * report split across multiple records. */ - if (part != 1) + if (record->part != 1) return -ENOSPC; if (!cxt->dprzs) @@ -441,10 +439,12 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, prz = cxt->dprzs[cxt->dump_write_cnt]; - hlen = ramoops_write_kmsg_hdr(prz, compressed); + /* Build header and append record contents. 
*/ + hlen = ramoops_write_kmsg_hdr(prz, record->compressed); + size = record->size; if (size + hlen > prz->buffer_size) size = prz->buffer_size - hlen; - persistent_ram_write(prz, buf, size); + persistent_ram_write(prz, record->buf, size); cxt->dump_write_cnt = (cxt->dump_write_cnt + 1) % cxt->max_dump_cnt; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 2cd1979d1f9a..cbf5e561778d 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -142,19 +142,11 @@ struct pstore_record { * Returns 0 on success, and non-zero on error. * * @write_buf: - * Perform a frontend write to a backend record, using a specified - * buffer. Unlike @write, this does not use the @psi @buf. + * Perform a frontend write to a backend record. The record contains + * all metadata and the buffer to write to backend storage. (Unlike + * @write, this does not use the @psi @buf.) * - * @type: in: pstore record type to write - * @reason: - * in: pstore write reason - * @id: out: unique identifier for the record - * @part: in: position in a multipart write - * @buf: in: pointer to contents to write to backend record - * @compressed: - * in: if the record is compressed - * @size: in: size of the write - * @psi: in: pointer to the struct pstore_info for the backend + * @record: pointer to record metadata. * * Returns 0 on success, and non-zero on error. * @@ -203,10 +195,7 @@ struct pstore_info { int (*close)(struct pstore_info *psi); ssize_t (*read)(struct pstore_record *record); int (*write)(struct pstore_record *record); - int (*write_buf)(enum pstore_type_id type, - enum kmsg_dump_reason reason, u64 *id, - unsigned int part, const char *buf, bool compressed, - size_t size, struct pstore_info *psi); + int (*write_buf)(struct pstore_record *record); int (*write_buf_user)(enum pstore_type_id type, enum kmsg_dump_reason reason, u64 *id, unsigned int part, const char __user *buf, -- cgit v1.2.3 From fdd0311863b32b42bb2c54e60c987bbbabc0c430 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 5 Mar 2017 00:56:38 -0800 Subject: pstore: Replace arguments for write_buf_user() API Removes argument list in favor of pstore record, though the user buffer remains passed separately since it must carry the __user annotation. 
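The resulting call shape, condensed from the pmsg hunk below; the record carries the metadata while the __user buffer stays a separate argument:

    struct pstore_record record = {
            .type = PSTORE_TYPE_PMSG,
            .size = count,
            .psi = psinfo,
    };

    /* The userspace buffer still travels alongside the record. */
    ret = psinfo->write_buf_user(&record, buf);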
Signed-off-by: Kees Cook --- fs/pstore/platform.c | 35 ++++++++++++----------------------- fs/pstore/pmsg.c | 9 ++++++--- fs/pstore/ram.c | 14 +++++--------- include/linux/pstore.h | 23 +++++++---------------- 4 files changed, 30 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 5eecf9012459..1e6642a2063e 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -639,47 +639,36 @@ static int pstore_write_compat(struct pstore_record *record) return record->psi->write_buf(record); } -static int pstore_write_buf_user_compat(enum pstore_type_id type, - enum kmsg_dump_reason reason, - u64 *id, unsigned int part, - const char __user *buf, - bool compressed, size_t size, - struct pstore_info *psi) +static int pstore_write_buf_user_compat(struct pstore_record *record, + const char __user *buf) { unsigned long flags = 0; - size_t i, bufsize = size; + size_t i, bufsize, total_size = record->size; long ret = 0; - if (unlikely(!access_ok(VERIFY_READ, buf, size))) + if (unlikely(!access_ok(VERIFY_READ, buf, total_size))) return -EFAULT; + bufsize = total_size; if (bufsize > psinfo->bufsize) bufsize = psinfo->bufsize; + record->buf = psinfo->buf; spin_lock_irqsave(&psinfo->buf_lock, flags); - for (i = 0; i < size; ) { - struct pstore_record record = { - .type = type, - .reason = reason, - .id = id, - .part = part, - .buf = psinfo->buf, - .compressed = compressed, - .psi = psi, - }; - size_t c = min(size - i, bufsize); + for (i = 0; i < total_size; ) { + size_t c = min(total_size - i, bufsize); - ret = __copy_from_user(psinfo->buf, buf + i, c); + ret = __copy_from_user(record->buf, buf + i, c); if (unlikely(ret != 0)) { ret = -EFAULT; break; } - record.size = c; - ret = psi->write_buf(&record); + record->size = c; + ret = record->psi->write_buf(record); if (unlikely(ret < 0)) break; i += c; } spin_unlock_irqrestore(&psinfo->buf_lock, flags); - return unlikely(ret < 0) ? ret : size; + return unlikely(ret < 0) ? ret : total_size; } /* diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c index 78f6176c020f..ce35907602de 100644 --- a/fs/pstore/pmsg.c +++ b/fs/pstore/pmsg.c @@ -23,7 +23,11 @@ static DEFINE_MUTEX(pmsg_lock); static ssize_t write_pmsg(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - u64 id; + struct pstore_record record = { + .type = PSTORE_TYPE_PMSG, + .size = count, + .psi = psinfo, + }; int ret; if (!count) @@ -34,8 +38,7 @@ static ssize_t write_pmsg(struct file *file, const char __user *buf, return -EFAULT; mutex_lock(&pmsg_lock); - ret = psinfo->write_buf_user(PSTORE_TYPE_PMSG, 0, &id, 0, buf, 0, count, - psinfo); + ret = psinfo->write_buf_user(&record, buf); mutex_unlock(&pmsg_lock); return ret ? 
ret : count; } diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index a7cdde60b1f9..d85e1adae1b6 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -451,19 +451,15 @@ static int notrace ramoops_pstore_write_buf(struct pstore_record *record) return 0; } -static int notrace ramoops_pstore_write_buf_user(enum pstore_type_id type, - enum kmsg_dump_reason reason, - u64 *id, unsigned int part, - const char __user *buf, - bool compressed, size_t size, - struct pstore_info *psi) +static int notrace ramoops_pstore_write_buf_user(struct pstore_record *record, + const char __user *buf) { - if (type == PSTORE_TYPE_PMSG) { - struct ramoops_context *cxt = psi->data; + if (record->type == PSTORE_TYPE_PMSG) { + struct ramoops_context *cxt = record->psi->data; if (!cxt->mprz) return -ENOMEM; - return persistent_ram_write_user(cxt->mprz, buf, size); + return persistent_ram_write_user(cxt->mprz, buf, record->size); } return -EINVAL; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index cbf5e561778d..9b85d3eeca83 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -152,18 +152,11 @@ struct pstore_record { * * @write_buf_user: * Perform a frontend write to a backend record, using a specified - * buffer that is coming directly from userspace. - * - * @type: in: pstore record type to write - * @reason: - * in: pstore write reason - * @id: out: unique identifier for the record - * @part: in: position in a multipart write - * @buf: in: pointer to userspace contents to write to backend record - * @compressed: - * in: if the record is compressed - * @size: in: size of the write - * @psi: in: pointer to the struct pstore_info for the backend + * buffer that is coming directly from userspace, instead of the + * @record @buf. + * + * @record: pointer to record metadata. + * @buf: pointer to userspace contents to write to backend * * Returns 0 on success, and non-zero on error. * @@ -196,10 +189,8 @@ struct pstore_info { ssize_t (*read)(struct pstore_record *record); int (*write)(struct pstore_record *record); int (*write_buf)(struct pstore_record *record); - int (*write_buf_user)(enum pstore_type_id type, - enum kmsg_dump_reason reason, u64 *id, - unsigned int part, const char __user *buf, - bool compressed, size_t size, struct pstore_info *psi); + int (*write_buf_user)(struct pstore_record *record, + const char __user *buf); int (*erase)(struct pstore_record *record); }; -- cgit v1.2.3 From 4c9ec219766a217468fb94a281c416455a884dda Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 5 Mar 2017 22:41:10 -0800 Subject: pstore: Remove write_buf() callback Now that write() and write_buf() are functionally identical, this removes write_buf(), and renames write_buf_user() to write_user(). Additionally adds sanity-checks for pstore_info's declared functions and flags at registration time. 
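The registration-time sanity checks this adds, condensed from the pstore_register() hunk below:

    /* Sanity check flags. */
    if (!psi->flags) {
            pr_warn("backend '%s' must support at least one frontend\n",
                    psi->name);
            return -EINVAL;
    }

    /* Check for required functions. */
    if (!psi->read || !psi->write) {
            pr_warn("backend '%s' must implement read() and write()\n",
                    psi->name);
            return -EINVAL;
    }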
Signed-off-by: Kees Cook --- fs/pstore/ftrace.c | 4 ++-- fs/pstore/platform.c | 35 ++++++++++++++++++++--------------- fs/pstore/pmsg.c | 4 ++-- fs/pstore/ram.c | 10 +++++----- include/linux/pstore.h | 29 ++++++++++------------------- 5 files changed, 39 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index a5506ec6995e..06aab07b6bb7 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -53,7 +53,7 @@ static void notrace pstore_ftrace_call(unsigned long ip, rec.parent_ip = parent_ip; pstore_ftrace_write_timestamp(&rec, pstore_ftrace_stamp++); pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id()); - psinfo->write_buf(&record); + psinfo->write(&record); local_irq_restore(flags); } @@ -122,7 +122,7 @@ void pstore_register_ftrace(void) { struct dentry *file; - if (!psinfo->write_buf) + if (!psinfo->write) return; pstore_ftrace_dir = debugfs_create_dir("pstore", NULL); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 1e6642a2063e..e79f170fa79b 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -604,7 +604,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) } record.buf = (char *)s; record.size = c; - psinfo->write_buf(&record); + psinfo->write(&record); spin_unlock_irqrestore(&psinfo->buf_lock, flags); s += c; c = e - s; @@ -632,15 +632,8 @@ static void pstore_register_console(void) {} static void pstore_unregister_console(void) {} #endif -static int pstore_write_compat(struct pstore_record *record) -{ - record->buf = psinfo->buf; - - return record->psi->write_buf(record); -} - -static int pstore_write_buf_user_compat(struct pstore_record *record, - const char __user *buf) +static int pstore_write_user_compat(struct pstore_record *record, + const char __user *buf) { unsigned long flags = 0; size_t i, bufsize, total_size = record->size; @@ -662,7 +655,7 @@ static int pstore_write_buf_user_compat(struct pstore_record *record, break; } record->size = c; - ret = record->psi->write_buf(record); + ret = record->psi->write(record); if (unlikely(ret < 0)) break; i += c; @@ -687,6 +680,20 @@ int pstore_register(struct pstore_info *psi) return -EPERM; } + /* Sanity check flags. */ + if (!psi->flags) { + pr_warn("backend '%s' must support at least one frontend\n", + psi->name); + return -EINVAL; + } + + /* Check for required functions. */ + if (!psi->read || !psi->write) { + pr_warn("backend '%s' must implement read() and write()\n", + psi->name); + return -EINVAL; + } + spin_lock(&pstore_lock); if (psinfo) { pr_warn("backend '%s' already loaded: ignoring '%s'\n", @@ -695,10 +702,8 @@ int pstore_register(struct pstore_info *psi) return -EBUSY; } - if (!psi->write) - psi->write = pstore_write_compat; - if (!psi->write_buf_user) - psi->write_buf_user = pstore_write_buf_user_compat; + if (!psi->write_user) + psi->write_user = pstore_write_user_compat; psinfo = psi; mutex_init(&psinfo->read_mutex); spin_unlock(&pstore_lock); diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c index ce35907602de..c16a2477e106 100644 --- a/fs/pstore/pmsg.c +++ b/fs/pstore/pmsg.c @@ -33,12 +33,12 @@ static ssize_t write_pmsg(struct file *file, const char __user *buf, if (!count) return 0; - /* check outside lock, page in any data. write_buf_user also checks */ + /* check outside lock, page in any data. 
write_user also checks */ if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; mutex_lock(&pmsg_lock); - ret = psinfo->write_buf_user(&record, buf); + ret = psinfo->write_user(&record, buf); mutex_unlock(&pmsg_lock); return ret ? ret : count; } diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index d85e1adae1b6..5523df7f17ef 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -378,7 +378,7 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz, return len; } -static int notrace ramoops_pstore_write_buf(struct pstore_record *record) +static int notrace ramoops_pstore_write(struct pstore_record *record) { struct ramoops_context *cxt = record->psi->data; struct persistent_ram_zone *prz; @@ -451,8 +451,8 @@ static int notrace ramoops_pstore_write_buf(struct pstore_record *record) return 0; } -static int notrace ramoops_pstore_write_buf_user(struct pstore_record *record, - const char __user *buf) +static int notrace ramoops_pstore_write_user(struct pstore_record *record, + const char __user *buf) { if (record->type == PSTORE_TYPE_PMSG) { struct ramoops_context *cxt = record->psi->data; @@ -503,8 +503,8 @@ static struct ramoops_context oops_cxt = { .name = "ramoops", .open = ramoops_pstore_open, .read = ramoops_pstore_read, - .write_buf = ramoops_pstore_write_buf, - .write_buf_user = ramoops_pstore_write_buf_user, + .write = ramoops_pstore_write, + .write_user = ramoops_pstore_write_user, .erase = ramoops_pstore_erase, }, }; diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 9b85d3eeca83..e2233f50f428 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -130,27 +130,19 @@ struct pstore_record { * available, or negative on error. * * @write: - * Perform a frontend notification of a write to a backend record. The - * data to be stored has already been written to the registered @buf - * of the @psi structure. + * A newly generated record needs to be written to backend storage. * * @record: - * pointer to record metadata. Note that @buf is NULL, since - * the @buf registered with @psi is what has been written. The - * backend is expected to update @id. + * pointer to record metadata. When @type is PSTORE_TYPE_DMESG, + * @buf will be pointing to the preallocated @psi.buf, since + * memory allocation may be broken during an Oops. Regardless, + * @buf must be processed or copied before returning. The + * backend is also expected to write @id with something that + * can help identify this record to a future @erase callback. * * Returns 0 on success, and non-zero on error. * - * @write_buf: - * Perform a frontend write to a backend record. The record contains - * all metadata and the buffer to write to backend storage. (Unlike - * @write, this does not use the @psi @buf.) - * - * @record: pointer to record metadata. - * - * Returns 0 on success, and non-zero on error. - * - * @write_buf_user: + * @write_user: * Perform a frontend write to a backend record, using a specified * buffer that is coming directly from userspace, instead of the * @record @buf.
@@ -188,9 +180,8 @@ struct pstore_info { int (*close)(struct pstore_info *psi); ssize_t (*read)(struct pstore_record *record); int (*write)(struct pstore_record *record); - int (*write_buf)(struct pstore_record *record); - int (*write_buf_user)(struct pstore_record *record, - const char __user *buf); + int (*write_user)(struct pstore_record *record, + const char __user *buf); int (*erase)(struct pstore_record *record); }; -- cgit v1.2.3 From af085d9084b48530153f51e6cad19fd0b1a13ed7 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 13 Feb 2017 19:42:28 -0600 Subject: stacktrace/x86: add function for detecting reliable stack traces For live patching and possibly other use cases, a stack trace is only useful if it can be assured that it's completely reliable. Add a new save_stack_trace_tsk_reliable() function to achieve that. Note that if the target task isn't the current task, and the target task is allowed to run, then it could be writing the stack while the unwinder is reading it, resulting in possible corruption. So the caller of save_stack_trace_tsk_reliable() must ensure that the task is either 'current' or inactive. save_stack_trace_tsk_reliable() relies on the x86 unwinder's detection of pt_regs on the stack. If the pt_regs are not user-mode registers from a syscall, then they indicate an in-kernel interrupt or exception (e.g. preemption or a page fault), in which case the stack is considered unreliable due to the nature of frame pointers. It also relies on the x86 unwinder's detection of other issues, such as: - corrupted stack data - stack grows the wrong way - stack walk doesn't reach the bottom - user didn't provide a large enough entries array Such issues are reported by checking unwind_error() and !unwind_done(). Also add CONFIG_HAVE_RELIABLE_STACKTRACE so arch-independent code can determine at build time whether the function is implemented. Signed-off-by: Josh Poimboeuf Reviewed-by: Miroslav Benes Acked-by: Ingo Molnar # for the x86 changes Signed-off-by: Jiri Kosina --- arch/Kconfig | 6 +++ arch/x86/Kconfig | 1 + arch/x86/include/asm/unwind.h | 6 +++ arch/x86/kernel/stacktrace.c | 96 +++++++++++++++++++++++++++++++++++++++++- arch/x86/kernel/unwind_frame.c | 2 + include/linux/stacktrace.h | 9 ++-- kernel/stacktrace.c | 12 +++++- 7 files changed, 126 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index cd211a14a88f..6ad00ad73459 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -713,6 +713,12 @@ config HAVE_STACK_VALIDATION Architecture supports the 'objtool check' host tool command, which performs compile-time stack metadata validation. +config HAVE_RELIABLE_STACKTRACE + bool + help + Architecture has a save_stack_trace_tsk_reliable() function which + only returns a stack trace if it can guarantee the trace is reliable. 
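As a consumer-side sketch of the new interface (the wrapper function and array size here are hypothetical; save_stack_trace_tsk_reliable() and struct stack_trace are as declared in this patch):

    #include <linux/kernel.h>
    #include <linux/sched.h>
    #include <linux/stacktrace.h>

    static bool task_stack_is_reliable(struct task_struct *task)
    {
            static unsigned long entries[64];
            struct stack_trace trace = {
                    .entries        = entries,
                    .max_entries    = ARRAY_SIZE(entries),
            };

            /* the task must be 'current' or guaranteed inactive */
            return save_stack_trace_tsk_reliable(task, &trace) == 0;
    }

A non-zero return (-EINVAL for an unreliable stack, -ENOSYS without arch support) tells the caller to fall back to some other strategy.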
+ config HAVE_ARCH_HASH bool default n diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc98d5a294ee..2a26852c11b6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -160,6 +160,7 @@ config X86 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER && STACK_VALIDATION select HAVE_STACK_VALIDATION if X86_64 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 6fa75b17aec3..137e9cce2ab4 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -11,6 +11,7 @@ struct unwind_state { unsigned long stack_mask; struct task_struct *task; int graph_idx; + bool error; #ifdef CONFIG_FRAME_POINTER unsigned long *bp, *orig_sp; struct pt_regs *regs; @@ -40,6 +41,11 @@ void unwind_start(struct unwind_state *state, struct task_struct *task, __unwind_start(state, task, regs, first_frame); } +static inline bool unwind_error(struct unwind_state *state) +{ + return state->error; +} + #ifdef CONFIG_FRAME_POINTER static inline diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 8e2b79b88e51..8dabd7bf1673 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -76,6 +76,101 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); +#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE + +#define STACKTRACE_DUMP_ONCE(task) ({ \ + static bool __section(.data.unlikely) __dumped; \ + \ + if (!__dumped) { \ + __dumped = true; \ + WARN_ON(1); \ + show_stack(task, NULL); \ + } \ +}) + +static int __save_stack_trace_reliable(struct stack_trace *trace, + struct task_struct *task) +{ + struct unwind_state state; + struct pt_regs *regs; + unsigned long addr; + + for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + + regs = unwind_get_entry_regs(&state); + if (regs) { + /* + * Kernel mode registers on the stack indicate an + * in-kernel interrupt or exception (e.g., preemption + * or a page fault), which can make frame pointers + * unreliable. + */ + if (!user_mode(regs)) + return -EINVAL; + + /* + * The last frame contains the user mode syscall + * pt_regs. Skip it and finish the unwind. + */ + unwind_next_frame(&state); + if (!unwind_done(&state)) { + STACKTRACE_DUMP_ONCE(task); + return -EINVAL; + } + break; + } + + addr = unwind_get_return_address(&state); + + /* + * A NULL or invalid return address probably means there's some + * generated code which __kernel_text_address() doesn't know + * about. + */ + if (!addr) { + STACKTRACE_DUMP_ONCE(task); + return -EINVAL; + } + + if (save_stack_address(trace, addr, false)) + return -EINVAL; + } + + /* Check for stack corruption */ + if (unwind_error(&state)) { + STACKTRACE_DUMP_ONCE(task); + return -EINVAL; + } + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; + + return 0; +} + +/* + * This function returns an error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is reliable. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. 
+ */ +int save_stack_trace_tsk_reliable(struct task_struct *tsk, + struct stack_trace *trace) +{ + int ret; + + if (!try_get_task_stack(tsk)) + return -EINVAL; + + ret = __save_stack_trace_reliable(trace, tsk); + + put_task_stack(tsk); + + return ret; +} +#endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */ + /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ struct stack_frame_user { @@ -138,4 +233,3 @@ void save_stack_trace_user(struct stack_trace *trace) if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } - diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 478d15dbaee4..5ed43910e04b 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -225,6 +225,8 @@ bool unwind_next_frame(struct unwind_state *state) return true; bad_address: + state->error = true; + /* * When unwinding a non-current task, the task might actually be * running on another CPU, in which case it could be modifying its diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 0a34489a46b6..4205f71a5f0e 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -18,6 +18,8 @@ extern void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace); extern void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace); +extern int save_stack_trace_tsk_reliable(struct task_struct *tsk, + struct stack_trace *trace); extern void print_stack_trace(struct stack_trace *trace, int spaces); extern int snprint_stack_trace(char *buf, size_t size, @@ -29,12 +31,13 @@ extern void save_stack_trace_user(struct stack_trace *trace); # define save_stack_trace_user(trace) do { } while (0) #endif -#else +#else /* !CONFIG_STACKTRACE */ # define save_stack_trace(trace) do { } while (0) # define save_stack_trace_tsk(tsk, trace) do { } while (0) # define save_stack_trace_user(trace) do { } while (0) # define print_stack_trace(trace, spaces) do { } while (0) # define snprint_stack_trace(buf, size, trace, spaces) do { } while (0) -#endif +# define save_stack_trace_tsk_reliable(tsk, trace) ({ -ENOSYS; }) +#endif /* CONFIG_STACKTRACE */ -#endif +#endif /* __LINUX_STACKTRACE_H */ diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 9c15a9124e83..f8edee9c792d 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -54,8 +54,8 @@ int snprint_stack_trace(char *buf, size_t size, EXPORT_SYMBOL_GPL(snprint_stack_trace); /* - * Architectures that do not implement save_stack_trace_tsk or - * save_stack_trace_regs get this weak alias and a once-per-bootup warning + * Architectures that do not implement save_stack_trace_*() + * get these weak aliases and once-per-bootup warnings * (whenever this facility is utilized - for example by procfs): */ __weak void @@ -69,3 +69,11 @@ save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) { WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n"); } + +__weak int +save_stack_trace_tsk_reliable(struct task_struct *tsk, + struct stack_trace *trace) +{ + WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n"); + return -ENOSYS; +} -- cgit v1.2.3 From 46c5a0113f843be5c55b1c40dd486538891156d4 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 13 Feb 2017 19:42:30 -0600 Subject: livepatch: create temporary klp_update_patch_state() stub Create temporary stubs for klp_update_patch_state() so we can add TIF_PATCH_PENDING to different architectures in separate patches without breaking build 
bisectability. Signed-off-by: Josh Poimboeuf Reviewed-by: Petr Mladek Signed-off-by: Jiri Kosina --- include/linux/livepatch.h | 5 ++++- kernel/livepatch/core.c | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 9072f04db616..5cc20e588a22 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -123,10 +123,13 @@ void arch_klp_init_object_loaded(struct klp_patch *patch, int klp_module_coming(struct module *mod); void klp_module_going(struct module *mod); +void klp_update_patch_state(struct task_struct *task); + #else /* !CONFIG_LIVEPATCH */ static inline int klp_module_coming(struct module *mod) { return 0; } -static inline void klp_module_going(struct module *mod) { } +static inline void klp_module_going(struct module *mod) {} +static inline void klp_update_patch_state(struct task_struct *task) {} #endif /* CONFIG_LIVEPATCH */ diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index af4643873e71..217b39d71176 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -64,6 +64,9 @@ static LIST_HEAD(klp_ops); static struct kobject *klp_root_kobj; +/* TODO: temporary stub */ +void klp_update_patch_state(struct task_struct *task) {} + static struct klp_ops *klp_find_ops(unsigned long old_addr) { struct klp_ops *ops; -- cgit v1.2.3 From 0dade9f374f1c15f9b43ab01ab75a3b459bba5f6 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 13 Feb 2017 19:42:35 -0600 Subject: livepatch: separate enabled and patched states Once we have a consistency model, patches and their objects will be enabled and disabled at different times. For example, when a patch is disabled, its loaded objects' funcs can remain registered with ftrace indefinitely until the unpatching operation is complete and they're no longer in use. It's less confusing if we give them different names: patches can be enabled or disabled; objects (and their funcs) can be patched or unpatched: - Enabled means that a patch is logically enabled (but not necessarily fully applied). - Patched means that an object's funcs are registered with ftrace and added to the klp_ops func stack. Also, since these states are binary, represent them with booleans instead of ints. 
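Condensed from the core.c hunks that follow (no new code), call sites now read:

    /* patch level: logically enabled, though application may be incomplete */
    if (!patch->enabled)
            return -EINVAL;

    /* object level: the funcs are registered with ftrace, on the klp_ops stacks */
    klp_for_each_object(patch, obj)
            if (obj->patched)
                    klp_unpatch_object(obj);

The sysfs 'enabled' handling likewise reduces to a plain boolean range check (val > 1 is rejected) instead of comparing against enum values.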
Signed-off-by: Josh Poimboeuf Acked-by: Miroslav Benes Reviewed-by: Petr Mladek Reviewed-by: Kamalesh Babulal Signed-off-by: Jiri Kosina --- include/linux/livepatch.h | 17 ++++------- kernel/livepatch/core.c | 72 +++++++++++++++++++++++------------------------ 2 files changed, 42 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 5cc20e588a22..9787a63b57ac 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -28,11 +28,6 @@ #include -enum klp_state { - KLP_DISABLED, - KLP_ENABLED -}; - /** * struct klp_func - function structure for live patching * @old_name: name of the function to be patched @@ -41,8 +36,8 @@ enum klp_state { * can be found (optional) * @old_addr: the address of the function being patched * @kobj: kobject for sysfs resources - * @state: tracks function-level patch application state * @stack_node: list node for klp_ops func_stack list + * @patched: the func has been added to the klp_ops list */ struct klp_func { /* external */ @@ -60,8 +55,8 @@ struct klp_func { /* internal */ unsigned long old_addr; struct kobject kobj; - enum klp_state state; struct list_head stack_node; + bool patched; }; /** @@ -71,7 +66,7 @@ struct klp_func { * @kobj: kobject for sysfs resources * @mod: kernel module associated with the patched object * (NULL for vmlinux) - * @state: tracks object-level patch application state + * @patched: the object's funcs have been added to the klp_ops list */ struct klp_object { /* external */ @@ -81,7 +76,7 @@ struct klp_object { /* internal */ struct kobject kobj; struct module *mod; - enum klp_state state; + bool patched; }; /** @@ -90,7 +85,7 @@ struct klp_object { * @objs: object entries for kernel objects to be patched * @list: list node for global list of registered patches * @kobj: kobject for sysfs resources - * @state: tracks patch-level application state + * @enabled: the patch is enabled (but operation may be incomplete) */ struct klp_patch { /* external */ @@ -100,7 +95,7 @@ struct klp_patch { /* internal */ struct list_head list; struct kobject kobj; - enum klp_state state; + bool enabled; }; #define klp_for_each_object(patch, obj) \ diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 217b39d71176..2dbd355cee07 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -348,11 +348,11 @@ static unsigned long klp_get_ftrace_location(unsigned long faddr) } #endif -static void klp_disable_func(struct klp_func *func) +static void klp_unpatch_func(struct klp_func *func) { struct klp_ops *ops; - if (WARN_ON(func->state != KLP_ENABLED)) + if (WARN_ON(!func->patched)) return; if (WARN_ON(!func->old_addr)) return; @@ -378,10 +378,10 @@ static void klp_disable_func(struct klp_func *func) list_del_rcu(&func->stack_node); } - func->state = KLP_DISABLED; + func->patched = false; } -static int klp_enable_func(struct klp_func *func) +static int klp_patch_func(struct klp_func *func) { struct klp_ops *ops; int ret; @@ -389,7 +389,7 @@ static int klp_enable_func(struct klp_func *func) if (WARN_ON(!func->old_addr)) return -EINVAL; - if (WARN_ON(func->state != KLP_DISABLED)) + if (WARN_ON(func->patched)) return -EINVAL; ops = klp_find_ops(func->old_addr); @@ -437,7 +437,7 @@ static int klp_enable_func(struct klp_func *func) list_add_rcu(&func->stack_node, &ops->func_stack); } - func->state = KLP_ENABLED; + func->patched = true; return 0; @@ -448,36 +448,36 @@ err: return ret; } -static void klp_disable_object(struct klp_object *obj) +static 
void klp_unpatch_object(struct klp_object *obj) { struct klp_func *func; klp_for_each_func(obj, func) - if (func->state == KLP_ENABLED) - klp_disable_func(func); + if (func->patched) + klp_unpatch_func(func); - obj->state = KLP_DISABLED; + obj->patched = false; } -static int klp_enable_object(struct klp_object *obj) +static int klp_patch_object(struct klp_object *obj) { struct klp_func *func; int ret; - if (WARN_ON(obj->state != KLP_DISABLED)) + if (WARN_ON(obj->patched)) return -EINVAL; if (WARN_ON(!klp_is_object_loaded(obj))) return -EINVAL; klp_for_each_func(obj, func) { - ret = klp_enable_func(func); + ret = klp_patch_func(func); if (ret) { - klp_disable_object(obj); + klp_unpatch_object(obj); return ret; } } - obj->state = KLP_ENABLED; + obj->patched = true; return 0; } @@ -488,17 +488,17 @@ static int __klp_disable_patch(struct klp_patch *patch) /* enforce stacking: only the last enabled patch can be disabled */ if (!list_is_last(&patch->list, &klp_patches) && - list_next_entry(patch, list)->state == KLP_ENABLED) + list_next_entry(patch, list)->enabled) return -EBUSY; pr_notice("disabling patch '%s'\n", patch->mod->name); klp_for_each_object(patch, obj) { - if (obj->state == KLP_ENABLED) - klp_disable_object(obj); + if (obj->patched) + klp_unpatch_object(obj); } - patch->state = KLP_DISABLED; + patch->enabled = false; return 0; } @@ -522,7 +522,7 @@ int klp_disable_patch(struct klp_patch *patch) goto err; } - if (patch->state == KLP_DISABLED) { + if (!patch->enabled) { ret = -EINVAL; goto err; } @@ -540,12 +540,12 @@ static int __klp_enable_patch(struct klp_patch *patch) struct klp_object *obj; int ret; - if (WARN_ON(patch->state != KLP_DISABLED)) + if (WARN_ON(patch->enabled)) return -EINVAL; /* enforce stacking: only the first disabled patch can be enabled */ if (patch->list.prev != &klp_patches && - list_prev_entry(patch, list)->state == KLP_DISABLED) + !list_prev_entry(patch, list)->enabled) return -EBUSY; pr_notice("enabling patch '%s'\n", patch->mod->name); @@ -554,12 +554,12 @@ static int __klp_enable_patch(struct klp_patch *patch) if (!klp_is_object_loaded(obj)) continue; - ret = klp_enable_object(obj); + ret = klp_patch_object(obj); if (ret) goto unregister; } - patch->state = KLP_ENABLED; + patch->enabled = true; return 0; @@ -617,20 +617,20 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, if (ret) return -EINVAL; - if (val != KLP_DISABLED && val != KLP_ENABLED) + if (val > 1) return -EINVAL; patch = container_of(kobj, struct klp_patch, kobj); mutex_lock(&klp_mutex); - if (val == patch->state) { + if (patch->enabled == val) { /* already in requested state */ ret = -EINVAL; goto err; } - if (val == KLP_ENABLED) { + if (val) { ret = __klp_enable_patch(patch); if (ret) goto err; @@ -655,7 +655,7 @@ static ssize_t enabled_show(struct kobject *kobj, struct klp_patch *patch; patch = container_of(kobj, struct klp_patch, kobj); - return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state); + return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->enabled); } static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); @@ -749,7 +749,7 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) return -EINVAL; INIT_LIST_HEAD(&func->stack_node); - func->state = KLP_DISABLED; + func->patched = false; /* The format for the sysfs directory is where sympos * is the nth occurrence of this symbol in kallsyms for the patched @@ -804,7 +804,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) if (!obj->funcs) return 
-EINVAL; - obj->state = KLP_DISABLED; + obj->patched = false; obj->mod = NULL; klp_find_object_module(obj); @@ -845,7 +845,7 @@ static int klp_init_patch(struct klp_patch *patch) mutex_lock(&klp_mutex); - patch->state = KLP_DISABLED; + patch->enabled = false; ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, klp_root_kobj, "%s", patch->mod->name); @@ -891,7 +891,7 @@ int klp_unregister_patch(struct klp_patch *patch) goto out; } - if (patch->state == KLP_ENABLED) { + if (patch->enabled) { ret = -EBUSY; goto out; } @@ -978,13 +978,13 @@ int klp_module_coming(struct module *mod) goto err; } - if (patch->state == KLP_DISABLED) + if (!patch->enabled) break; pr_notice("applying patch '%s' to loading module '%s'\n", patch->mod->name, obj->mod->name); - ret = klp_enable_object(obj); + ret = klp_patch_object(obj); if (ret) { pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", patch->mod->name, obj->mod->name, ret); @@ -1035,10 +1035,10 @@ void klp_module_going(struct module *mod) if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) continue; - if (patch->state != KLP_DISABLED) { + if (patch->enabled) { pr_notice("reverting patch '%s' on unloading module '%s'\n", patch->mod->name, obj->mod->name); - klp_disable_object(obj); + klp_unpatch_object(obj); } klp_free_object_loaded(obj); -- cgit v1.2.3 From f5e547f4ac785c65a39211f0b8e4ffc4fe09112d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 13 Feb 2017 19:42:39 -0600 Subject: livepatch: store function sizes For the consistency model we'll need to know the sizes of the old and new functions to determine if they're on the stacks of any tasks. Signed-off-by: Josh Poimboeuf Acked-by: Miroslav Benes Reviewed-by: Petr Mladek Reviewed-by: Kamalesh Babulal Signed-off-by: Jiri Kosina --- include/linux/livepatch.h | 3 +++ kernel/livepatch/core.c | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 9787a63b57ac..6602b34bed2b 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -37,6 +37,8 @@ * @old_addr: the address of the function being patched * @kobj: kobject for sysfs resources * @stack_node: list node for klp_ops func_stack list + * @old_size: size of the old function + * @new_size: size of the new function * @patched: the func has been added to the klp_ops list */ struct klp_func { @@ -56,6 +58,7 @@ struct klp_func { unsigned long old_addr; struct kobject kobj; struct list_head stack_node; + unsigned long old_size, new_size; bool patched; }; diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 83c4949862b4..10ba3a1578bd 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -584,6 +584,22 @@ static int klp_init_object_loaded(struct klp_patch *patch, &func->old_addr); if (ret) return ret; + + ret = kallsyms_lookup_size_offset(func->old_addr, + &func->old_size, NULL); + if (!ret) { + pr_err("kallsyms size lookup failed for '%s'\n", + func->old_name); + return -ENOENT; + } + + ret = kallsyms_lookup_size_offset((unsigned long)func->new_func, + &func->new_size, NULL); + if (!ret) { + pr_err("kallsyms size lookup failed for '%s' replacement\n", + func->old_name); + return -ENOENT; + } } return 0; -- cgit v1.2.3 From d83a7cb375eec21f04c83542395d08b2f6641da2 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 13 Feb 2017 19:42:40 -0600 Subject: livepatch: change to a per-task consistency model Change livepatch to use a basic per-task consistency model. 
This is the foundation which will eventually enable us to patch those ~10% of security patches which change function or data semantics. This is the biggest remaining piece needed to make livepatch more generally useful. This code stems from the design proposal made by Vojtech [1] in November 2014. It's a hybrid of kGraft and kpatch: it uses kGraft's per-task consistency and syscall barrier switching combined with kpatch's stack trace switching. There are also a number of fallback options which make it quite flexible. Patches are applied on a per-task basis, when the task is deemed safe to switch over. When a patch is enabled, livepatch enters into a transition state where tasks are converging to the patched state. Usually this transition state can complete in a few seconds. The same sequence occurs when a patch is disabled, except the tasks converge from the patched state to the unpatched state. An interrupt handler inherits the patched state of the task it interrupts. The same is true for forked tasks: the child inherits the patched state of the parent. Livepatch uses several complementary approaches to determine when it's safe to patch tasks: 1. The first and most effective approach is stack checking of sleeping tasks. If no affected functions are on the stack of a given task, the task is patched. In most cases this will patch most or all of the tasks on the first try. Otherwise it'll keep trying periodically. This option is only available if the architecture has reliable stacks (HAVE_RELIABLE_STACKTRACE). 2. The second approach, if needed, is kernel exit switching. A task is switched when it returns to user space from a system call, a user space IRQ, or a signal. It's useful in the following cases: a) Patching I/O-bound user tasks which are sleeping on an affected function. In this case you have to send SIGSTOP and SIGCONT to force it to exit the kernel and be patched. b) Patching CPU-bound user tasks. If the task is highly CPU-bound then it will get patched the next time it gets interrupted by an IRQ. c) In the future it could be useful for applying patches for architectures which don't yet have HAVE_RELIABLE_STACKTRACE. In this case you would have to signal most of the tasks on the system. However this isn't supported yet because there's currently no way to patch kthreads without HAVE_RELIABLE_STACKTRACE. 3. For idle "swapper" tasks, since they don't ever exit the kernel, they instead have a klp_update_patch_state() call in the idle loop which allows them to be patched before the CPU enters the idle state. (Note there's not yet such an approach for kthreads.) All the above approaches may be skipped by setting the 'immediate' flag in the 'klp_patch' struct, which will disable per-task consistency and patch all tasks immediately. This can be useful if the patch doesn't change any function or data semantics. Note that, even with this flag set, it's possible that some tasks may still be running with an old version of the function, until that function returns. There's also an 'immediate' flag in the 'klp_func' struct which allows you to specify that certain functions in the patch can be applied without per-task consistency. This might be useful if you want to patch a common function like schedule(), and the function change doesn't need consistency but the rest of the patch does. For architectures which don't have HAVE_RELIABLE_STACKTRACE, the user must set patch->immediate which causes all tasks to be patched immediately. 
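For a concrete feel of the flag, here is a sketch of a patch module that opts out of the consistency model, patterned on samples/livepatch/livepatch-sample.c (which this patch also extends); the replacement function body is illustrative:

    #include <linux/module.h>
    #include <linux/kernel.h>
    #include <linux/seq_file.h>
    #include <linux/livepatch.h>

    static int livepatch_cmdline_proc_show(struct seq_file *m, void *v)
    {
            seq_printf(m, "%s\n", "this has been live patched");
            return 0;
    }

    static struct klp_func funcs[] = {
            {
                    .old_name = "cmdline_proc_show",
                    .new_func = livepatch_cmdline_proc_show,
            }, { }
    };

    static struct klp_object objs[] = {
            {
                    /* NULL name means vmlinux */
                    .funcs = funcs,
            }, { }
    };

    static struct klp_patch patch = {
            .mod = THIS_MODULE,
            .objs = objs,
            /* skip per-task transitions; patch all tasks at once */
            .immediate = true,
    };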
This option should be used with care, only when the patch doesn't change any function or data semantics. In the future, architectures which don't have HAVE_RELIABLE_STACKTRACE may be allowed to use per-task consistency if we can come up with another way to patch kthreads. The /sys/kernel/livepatch//transition file shows whether a patch is in transition. Only a single patch (the topmost patch on the stack) can be in transition at a given time. A patch can remain in transition indefinitely, if any of the tasks are stuck in the initial patch state. A transition can be reversed and effectively canceled by writing the opposite value to the /sys/kernel/livepatch//enabled file while the transition is in progress. Then all the tasks will attempt to converge back to the original patch state. [1] https://lkml.kernel.org/r/20141107140458.GA21774@suse.cz Signed-off-by: Josh Poimboeuf Acked-by: Miroslav Benes Acked-by: Ingo Molnar # for the scheduler changes Signed-off-by: Jiri Kosina --- Documentation/ABI/testing/sysfs-kernel-livepatch | 8 + Documentation/livepatch/livepatch.txt | 186 +++++++- include/linux/init_task.h | 9 + include/linux/livepatch.h | 42 +- include/linux/sched.h | 3 + kernel/fork.c | 3 + kernel/livepatch/Makefile | 2 +- kernel/livepatch/core.c | 105 ++++- kernel/livepatch/patch.c | 59 +++ kernel/livepatch/patch.h | 1 + kernel/livepatch/transition.c | 543 +++++++++++++++++++++++ kernel/livepatch/transition.h | 14 + kernel/sched/idle.c | 4 + samples/livepatch/livepatch-sample.c | 17 + 14 files changed, 947 insertions(+), 49 deletions(-) create mode 100644 kernel/livepatch/transition.c create mode 100644 kernel/livepatch/transition.h (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-kernel-livepatch b/Documentation/ABI/testing/sysfs-kernel-livepatch index da87f43aec58..d5d39748382f 100644 --- a/Documentation/ABI/testing/sysfs-kernel-livepatch +++ b/Documentation/ABI/testing/sysfs-kernel-livepatch @@ -25,6 +25,14 @@ Description: code is currently applied. Writing 0 will disable the patch while writing 1 will re-enable the patch. +What: /sys/kernel/livepatch//transition +Date: Feb 2017 +KernelVersion: 4.12.0 +Contact: live-patching@vger.kernel.org +Description: + An attribute which indicates whether the patch is currently in + transition. + What: /sys/kernel/livepatch// Date: Nov 2014 KernelVersion: 3.19.0 diff --git a/Documentation/livepatch/livepatch.txt b/Documentation/livepatch/livepatch.txt index 9d2096c7160d..4f2aec8d4c12 100644 --- a/Documentation/livepatch/livepatch.txt +++ b/Documentation/livepatch/livepatch.txt @@ -72,7 +72,8 @@ example, they add a NULL pointer or a boundary check, fix a race by adding a missing memory barrier, or add some locking around a critical section. Most of these changes are self contained and the function presents itself the same way to the rest of the system. In this case, the functions might -be updated independently one by one. +be updated independently one by one. (This can be done by setting the +'immediate' flag in the klp_patch struct.) But there are more complex fixes. For example, a patch might change ordering of locking in multiple functions at the same time. Or a patch @@ -86,20 +87,141 @@ or no data are stored in the modified structures at the moment. The theory about how to apply functions a safe way is rather complex. The aim is to define a so-called consistency model. It attempts to define conditions when the new implementation could be used so that the system -stays consistent. The theory is not yet finished. 
See the discussion at -https://lkml.kernel.org/r/20141107140458.GA21774@suse.cz - -The current consistency model is very simple. It guarantees that either -the old or the new function is called. But various functions get redirected -one by one without any synchronization. - -In other words, the current implementation _never_ modifies the behavior -in the middle of the call. It is because it does _not_ rewrite the entire -function in the memory. Instead, the function gets redirected at the -very beginning. But this redirection is used immediately even when -some other functions from the same patch have not been redirected yet. - -See also the section "Limitations" below. +stays consistent. + +Livepatch has a consistency model which is a hybrid of kGraft and +kpatch: it uses kGraft's per-task consistency and syscall barrier +switching combined with kpatch's stack trace switching. There are also +a number of fallback options which make it quite flexible. + +Patches are applied on a per-task basis, when the task is deemed safe to +switch over. When a patch is enabled, livepatch enters into a +transition state where tasks are converging to the patched state. +Usually this transition state can complete in a few seconds. The same +sequence occurs when a patch is disabled, except the tasks converge from +the patched state to the unpatched state. + +An interrupt handler inherits the patched state of the task it +interrupts. The same is true for forked tasks: the child inherits the +patched state of the parent. + +Livepatch uses several complementary approaches to determine when it's +safe to patch tasks: + +1. The first and most effective approach is stack checking of sleeping + tasks. If no affected functions are on the stack of a given task, + the task is patched. In most cases this will patch most or all of + the tasks on the first try. Otherwise it'll keep trying + periodically. This option is only available if the architecture has + reliable stacks (HAVE_RELIABLE_STACKTRACE). + +2. The second approach, if needed, is kernel exit switching. A + task is switched when it returns to user space from a system call, a + user space IRQ, or a signal. It's useful in the following cases: + + a) Patching I/O-bound user tasks which are sleeping on an affected + function. In this case you have to send SIGSTOP and SIGCONT to + force it to exit the kernel and be patched. + b) Patching CPU-bound user tasks. If the task is highly CPU-bound + then it will get patched the next time it gets interrupted by an + IRQ. + c) In the future it could be useful for applying patches for + architectures which don't yet have HAVE_RELIABLE_STACKTRACE. In + this case you would have to signal most of the tasks on the + system. However this isn't supported yet because there's + currently no way to patch kthreads without + HAVE_RELIABLE_STACKTRACE. + +3. For idle "swapper" tasks, since they don't ever exit the kernel, they + instead have a klp_update_patch_state() call in the idle loop which + allows them to be patched before the CPU enters the idle state. + + (Note there's not yet such an approach for kthreads.) + +All the above approaches may be skipped by setting the 'immediate' flag +in the 'klp_patch' struct, which will disable per-task consistency and +patch all tasks immediately. This can be useful if the patch doesn't +change any function or data semantics. Note that, even with this flag +set, it's possible that some tasks may still be running with an old +version of the function, until that function returns. 
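(Tying approach 3 above to code: the idle-loop hook added by this patch amounts to roughly the following. The loop is abridged and the exact hook placement in kernel/sched/idle.c is a sketch; klp_patch_pending() and klp_update_patch_state() are the interfaces introduced here.)

    while (1) {
            /* arch idle / cpuidle entry elided */

            /*
             * Idle tasks never cross the kernel/user boundary, so this
             * is their one designated patch point.
             */
            if (unlikely(klp_patch_pending(current)))
                    klp_update_patch_state(current);

            schedule_preempt_disabled();
    }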
+ +There's also an 'immediate' flag in the 'klp_func' struct which allows +you to specify that certain functions in the patch can be applied +without per-task consistency. This might be useful if you want to patch +a common function like schedule(), and the function change doesn't need +consistency but the rest of the patch does. + +For architectures which don't have HAVE_RELIABLE_STACKTRACE, the user +must set patch->immediate which causes all tasks to be patched +immediately. This option should be used with care, only when the patch +doesn't change any function or data semantics. + +In the future, architectures which don't have HAVE_RELIABLE_STACKTRACE +may be allowed to use per-task consistency if we can come up with +another way to patch kthreads. + +The /sys/kernel/livepatch//transition file shows whether a patch +is in transition. Only a single patch (the topmost patch on the stack) +can be in transition at a given time. A patch can remain in transition +indefinitely, if any of the tasks are stuck in the initial patch state. + +A transition can be reversed and effectively canceled by writing the +opposite value to the /sys/kernel/livepatch//enabled file while +the transition is in progress. Then all the tasks will attempt to +converge back to the original patch state. + +There's also a /proc//patch_state file which can be used to +determine which tasks are blocking completion of a patching operation. +If a patch is in transition, this file shows 0 to indicate the task is +unpatched and 1 to indicate it's patched. Otherwise, if no patch is in +transition, it shows -1. Any tasks which are blocking the transition +can be signaled with SIGSTOP and SIGCONT to force them to change their +patched state. + + +3.1 Adding consistency model support to new architectures +--------------------------------------------------------- + +For adding consistency model support to new architectures, there are a +few options: + +1) Add CONFIG_HAVE_RELIABLE_STACKTRACE. This means porting objtool, and + for non-DWARF unwinders, also making sure there's a way for the stack + tracing code to detect interrupts on the stack. + +2) Alternatively, ensure that every kthread has a call to + klp_update_patch_state() in a safe location. Kthreads are typically + in an infinite loop which does some action repeatedly. The safe + location to switch the kthread's patch state would be at a designated + point in the loop where there are no locks taken and all data + structures are in a well-defined state. + + The location is clear when using workqueues or the kthread worker + API. These kthreads process independent actions in a generic loop. + + It's much more complicated with kthreads which have a custom loop. + There the safe location must be carefully selected on a case-by-case + basis. + + In that case, arches without HAVE_RELIABLE_STACKTRACE would still be + able to use the non-stack-checking parts of the consistency model: + + a) patching user tasks when they cross the kernel/user space + boundary; and + + b) patching kthreads and idle tasks at their designated patch points. + + This option isn't as good as option 1 because it requires signaling + user tasks and waking kthreads to patch them. But it could still be + a good backup option for those architectures which don't have + reliable stack traces yet. + +In the meantime, patches for such architectures can bypass the +consistency model by setting klp_patch.immediate to true. 
This option +is perfectly fine for patches which don't change the semantics of the +patched functions. In practice, this is usable for ~90% of security +fixes. Use of this option also means the patch can't be unloaded after +it has been disabled. 4. Livepatch module @@ -134,7 +256,7 @@ Documentation/livepatch/module-elf-format.txt for more details. 4.2. Metadata ------------- +------------- The patch is described by several structures that split the information into three levels: @@ -156,6 +278,9 @@ into three levels: only for a particular object ( vmlinux or a kernel module ). Note that kallsyms allows for searching symbols according to the object name. + There's also an 'immediate' flag which, when set, patches the + function immediately, bypassing the consistency model safety checks. + + struct klp_object defines an array of patched functions (struct klp_func) in the same object. Where the object is either vmlinux (NULL) or a module name. @@ -172,10 +297,13 @@ into three levels: This structure handles all patched functions consistently and eventually, synchronously. The whole patch is applied only when all patched symbols are found. The only exception are symbols from objects - (kernel modules) that have not been loaded yet. Also if a more complex - consistency model is supported then a selected unit (thread, - kernel as a whole) will see the new code from the entire patch - only when it is in a safe state. + (kernel modules) that have not been loaded yet. + + Setting the 'immediate' flag applies the patch to all tasks + immediately, bypassing the consistency model safety checks. + + For more details on how the patch is applied on a per-task basis, + see the "Consistency model" section. 4.3. Livepatch module handling @@ -239,9 +367,15 @@ Registered patches might be enabled either by calling klp_enable_patch() or by writing '1' to /sys/kernel/livepatch//enabled. The system will start using the new implementation of the patched functions at this stage. -In particular, if an original function is patched for the first time, a -function specific struct klp_ops is created and an universal ftrace handler -is registered. +When a patch is enabled, livepatch enters into a transition state where +tasks are converging to the patched state. This is indicated by a value +of '1' in /sys/kernel/livepatch//transition. Once all tasks have +been patched, the 'transition' value changes to '0'. For more +information about this process, see the "Consistency model" section. + +If an original function is patched for the first time, a function +specific struct klp_ops is created and an universal ftrace handler is +registered. Functions might be patched multiple times. The ftrace handler is registered only once for the given function. Further patches just add an entry to the @@ -261,6 +395,12 @@ by writing '0' to /sys/kernel/livepatch//enabled. At this stage either the code from the previously enabled patch or even the original code gets used. +When a patch is disabled, livepatch enters into a transition state where +tasks are converging to the unpatched state. This is indicated by a +value of '1' in /sys/kernel/livepatch//transition. Once all tasks +have been unpatched, the 'transition' value changes to '0'. For more +information about this process, see the "Consistency model" section. + Here all the functions (struct klp_func) associated with the to-be-disabled patch are removed from the corresponding struct klp_ops. 
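(For reference, the bookkeeping structure behind this text, from kernel/livepatch/patch.h with comments added for this sketch: one klp_ops exists per patched function address, and its func_stack holds the klp_funcs stacked on that function, newest patch first.)

    struct klp_ops {
            struct list_head node;          /* entry in the global klp_ops list */
            struct list_head func_stack;    /* stacked klp_funcs, newest first */
            struct ftrace_ops fops;         /* the one ftrace hookup per function */
    };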
The ftrace handler is unregistered and the struct klp_ops is freed when the func_stack list diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 91d9049f0039..5a791055b176 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -202,6 +203,13 @@ extern struct cred init_cred; # define INIT_KASAN(tsk) #endif +#ifdef CONFIG_LIVEPATCH +# define INIT_LIVEPATCH(tsk) \ + .patch_state = KLP_UNDEFINED, +#else +# define INIT_LIVEPATCH(tsk) +#endif + #ifdef CONFIG_THREAD_INFO_IN_TASK # define INIT_TASK_TI(tsk) \ .thread_info = INIT_THREAD_INFO(tsk), \ @@ -288,6 +296,7 @@ extern struct cred init_cred; INIT_VTIME(tsk) \ INIT_NUMA_BALANCING(tsk) \ INIT_KASAN(tsk) \ + INIT_LIVEPATCH(tsk) \ } diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 6602b34bed2b..ed90ad1605c1 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -28,18 +28,40 @@ #include +/* task patch states */ +#define KLP_UNDEFINED -1 +#define KLP_UNPATCHED 0 +#define KLP_PATCHED 1 + /** * struct klp_func - function structure for live patching * @old_name: name of the function to be patched * @new_func: pointer to the patched function code * @old_sympos: a hint indicating which symbol position the old function * can be found (optional) + * @immediate: patch the func immediately, bypassing safety mechanisms * @old_addr: the address of the function being patched * @kobj: kobject for sysfs resources * @stack_node: list node for klp_ops func_stack list * @old_size: size of the old function * @new_size: size of the new function * @patched: the func has been added to the klp_ops list + * @transition: the func is currently being applied or reverted + * + * The patched and transition variables define the func's patching state. When + * patching, a func is always in one of the following states: + * + * patched=0 transition=0: unpatched + * patched=0 transition=1: unpatched, temporary starting state + * patched=1 transition=1: patched, may be visible to some tasks + * patched=1 transition=0: patched, visible to all tasks + * + * And when unpatching, it goes in the reverse order: + * + * patched=1 transition=0: patched, visible to all tasks + * patched=1 transition=1: patched, may be visible to some tasks + * patched=0 transition=1: unpatched, temporary ending state + * patched=0 transition=0: unpatched */ struct klp_func { /* external */ @@ -53,6 +75,7 @@ struct klp_func { * in kallsyms for the given object is used. 
*/ unsigned long old_sympos; + bool immediate; /* internal */ unsigned long old_addr; @@ -60,6 +83,7 @@ struct klp_func { struct list_head stack_node; unsigned long old_size, new_size; bool patched; + bool transition; }; /** @@ -68,7 +92,7 @@ struct klp_func { * @funcs: function entries for functions to be patched in the object * @kobj: kobject for sysfs resources * @mod: kernel module associated with the patched object - * (NULL for vmlinux) + * (NULL for vmlinux) * @patched: the object's funcs have been added to the klp_ops list */ struct klp_object { @@ -86,6 +110,7 @@ struct klp_object { * struct klp_patch - patch structure for live patching * @mod: reference to the live patch module * @objs: object entries for kernel objects to be patched + * @immediate: patch all funcs immediately, bypassing safety mechanisms * @list: list node for global list of registered patches * @kobj: kobject for sysfs resources * @enabled: the patch is enabled (but operation may be incomplete) @@ -94,6 +119,7 @@ struct klp_patch { /* external */ struct module *mod; struct klp_object *objs; + bool immediate; /* internal */ struct list_head list; @@ -121,13 +147,27 @@ void arch_klp_init_object_loaded(struct klp_patch *patch, int klp_module_coming(struct module *mod); void klp_module_going(struct module *mod); +void klp_copy_process(struct task_struct *child); void klp_update_patch_state(struct task_struct *task); +static inline bool klp_patch_pending(struct task_struct *task) +{ + return test_tsk_thread_flag(task, TIF_PATCH_PENDING); +} + +static inline bool klp_have_reliable_stack(void) +{ + return IS_ENABLED(CONFIG_STACKTRACE) && + IS_ENABLED(CONFIG_HAVE_RELIABLE_STACKTRACE); +} + #else /* !CONFIG_LIVEPATCH */ static inline int klp_module_coming(struct module *mod) { return 0; } static inline void klp_module_going(struct module *mod) {} +static inline bool klp_patch_pending(struct task_struct *task) { return false; } static inline void klp_update_patch_state(struct task_struct *task) {} +static inline void klp_copy_process(struct task_struct *child) {} #endif /* CONFIG_LIVEPATCH */ diff --git a/include/linux/sched.h b/include/linux/sched.h index d67eee84fd43..e11032010318 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1037,6 +1037,9 @@ struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* A live task holds one reference: */ atomic_t stack_refcount; +#endif +#ifdef CONFIG_LIVEPATCH + int patch_state; #endif /* CPU-specific state of this task: */ struct thread_struct thread; diff --git a/kernel/fork.c b/kernel/fork.c index 6c463c80e93d..942cbcd07c18 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -87,6 +87,7 @@ #include #include #include +#include #include #include @@ -1797,6 +1798,8 @@ static __latent_entropy struct task_struct *copy_process( p->parent_exec_id = current->self_exec_id; } + klp_copy_process(p); + spin_lock(¤t->sighand->siglock); /* diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index e136dad8ff7e..2b8bdb1925da 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_LIVEPATCH) += livepatch.o -livepatch-objs := core.o patch.o +livepatch-objs := core.o patch.o transition.o diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 10ba3a1578bd..3dc3c9049690 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -31,22 +31,22 @@ #include #include #include "patch.h" +#include "transition.h" /* - * The klp_mutex protects the global lists and state transitions of any - * structure reachable 
from them. References to any structure must be obtained - * under mutex protection (except in klp_ftrace_handler(), which uses RCU to - * ensure it gets consistent data). + * klp_mutex is a coarse lock which serializes access to klp data. All + * accesses to klp-related variables and structures must have mutex protection, + * except within the following functions which carefully avoid the need for it: + * + * - klp_ftrace_handler() + * - klp_update_patch_state() */ -static DEFINE_MUTEX(klp_mutex); +DEFINE_MUTEX(klp_mutex); static LIST_HEAD(klp_patches); static struct kobject *klp_root_kobj; -/* TODO: temporary stub */ -void klp_update_patch_state(struct task_struct *task) {} - static bool klp_is_module(struct klp_object *obj) { return obj->name; @@ -85,7 +85,6 @@ static void klp_find_object_module(struct klp_object *obj) mutex_unlock(&module_mutex); } -/* klp_mutex must be held by caller */ static bool klp_is_patch_registered(struct klp_patch *patch) { struct klp_patch *mypatch; @@ -281,20 +280,27 @@ static int klp_write_object_relocations(struct module *pmod, static int __klp_disable_patch(struct klp_patch *patch) { - struct klp_object *obj; + if (klp_transition_patch) + return -EBUSY; /* enforce stacking: only the last enabled patch can be disabled */ if (!list_is_last(&patch->list, &klp_patches) && list_next_entry(patch, list)->enabled) return -EBUSY; - pr_notice("disabling patch '%s'\n", patch->mod->name); + klp_init_transition(patch, KLP_UNPATCHED); - klp_for_each_object(patch, obj) { - if (obj->patched) - klp_unpatch_object(obj); - } + /* + * Enforce the order of the func->transition writes in + * klp_init_transition() and the TIF_PATCH_PENDING writes in + * klp_start_transition(). In the rare case where klp_ftrace_handler() + * is called shortly after klp_update_patch_state() switches the task, + * this ensures the handler sees that func->transition is set. + */ + smp_wmb(); + klp_start_transition(); + klp_try_complete_transition(); patch->enabled = false; return 0; @@ -337,6 +343,9 @@ static int __klp_enable_patch(struct klp_patch *patch) struct klp_object *obj; int ret; + if (klp_transition_patch) + return -EBUSY; + if (WARN_ON(patch->enabled)) return -EINVAL; @@ -347,22 +356,36 @@ static int __klp_enable_patch(struct klp_patch *patch) pr_notice("enabling patch '%s'\n", patch->mod->name); + klp_init_transition(patch, KLP_PATCHED); + + /* + * Enforce the order of the func->transition writes in + * klp_init_transition() and the ops->func_stack writes in + * klp_patch_object(), so that klp_ftrace_handler() will see the + * func->transition updates before the handler is registered and the + * new funcs become visible to the handler. 
+ */ + smp_wmb(); + klp_for_each_object(patch, obj) { if (!klp_is_object_loaded(obj)) continue; ret = klp_patch_object(obj); - if (ret) - goto unregister; + if (ret) { + pr_warn("failed to enable patch '%s'\n", + patch->mod->name); + + klp_cancel_transition(); + return ret; + } } + klp_start_transition(); + klp_try_complete_transition(); patch->enabled = true; return 0; - -unregister: - WARN_ON(__klp_disable_patch(patch)); - return ret; } /** @@ -399,6 +422,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * /sys/kernel/livepatch * /sys/kernel/livepatch/ * /sys/kernel/livepatch//enabled + * /sys/kernel/livepatch//transition * /sys/kernel/livepatch// * /sys/kernel/livepatch/// */ @@ -424,7 +448,9 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, goto err; } - if (enabled) { + if (patch == klp_transition_patch) { + klp_reverse_transition(); + } else if (enabled) { ret = __klp_enable_patch(patch); if (ret) goto err; @@ -452,9 +478,21 @@ static ssize_t enabled_show(struct kobject *kobj, return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->enabled); } +static ssize_t transition_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct klp_patch *patch; + + patch = container_of(kobj, struct klp_patch, kobj); + return snprintf(buf, PAGE_SIZE-1, "%d\n", + patch == klp_transition_patch); +} + static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); +static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition); static struct attribute *klp_patch_attrs[] = { &enabled_kobj_attr.attr, + &transition_kobj_attr.attr, NULL }; @@ -544,6 +582,7 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) INIT_LIST_HEAD(&func->stack_node); func->patched = false; + func->transition = false; /* The format for the sysfs directory is where sympos * is the nth occurrence of this symbol in kallsyms for the patched @@ -739,6 +778,16 @@ int klp_register_patch(struct klp_patch *patch) if (!klp_initialized()) return -ENODEV; + /* + * Architectures without reliable stack traces have to set + * patch->immediate because there's currently no way to patch kthreads + * with the consistency model. + */ + if (!klp_have_reliable_stack() && !patch->immediate) { + pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); + return -ENOSYS; + } + /* * A reference is taken on the patch module to prevent it from being * unloaded. Right now, we don't allow patch modules to unload since @@ -788,7 +837,11 @@ int klp_module_coming(struct module *mod) goto err; } - if (!patch->enabled) + /* + * Only patch the module if the patch is enabled or is + * in transition. + */ + if (!patch->enabled && patch != klp_transition_patch) break; pr_notice("applying patch '%s' to loading module '%s'\n", @@ -845,7 +898,11 @@ void klp_module_going(struct module *mod) if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) continue; - if (patch->enabled) { + /* + * Only unpatch the module if the patch is enabled or + * is in transition. 
+ */ + if (patch->enabled || patch == klp_transition_patch) { pr_notice("reverting patch '%s' on unloading module '%s'\n", patch->mod->name, obj->mod->name); klp_unpatch_object(obj); diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 5efa2620851a..f8269036bf0b 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -29,6 +29,7 @@ #include #include #include "patch.h" +#include "transition.h" static LIST_HEAD(klp_ops); @@ -54,15 +55,64 @@ static void notrace klp_ftrace_handler(unsigned long ip, { struct klp_ops *ops; struct klp_func *func; + int patch_state; ops = container_of(fops, struct klp_ops, fops); rcu_read_lock(); + func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, stack_node); + + /* + * func should never be NULL because preemption should be disabled here + * and unregister_ftrace_function() does the equivalent of a + * synchronize_sched() before the func_stack removal. + */ if (WARN_ON_ONCE(!func)) goto unlock; + /* + * In the enable path, enforce the order of the ops->func_stack and + * func->transition reads. The corresponding write barrier is in + * __klp_enable_patch(). + * + * (Note that this barrier technically isn't needed in the disable + * path. In the rare case where klp_update_patch_state() runs before + * this handler, its TIF_PATCH_PENDING read and this func->transition + * read need to be ordered. But klp_update_patch_state() already + * enforces that.) + */ + smp_rmb(); + + if (unlikely(func->transition)) { + + /* + * Enforce the order of the func->transition and + * current->patch_state reads. Otherwise we could read an + * out-of-date task state and pick the wrong function. The + * corresponding write barrier is in klp_init_transition(). + */ + smp_rmb(); + + patch_state = current->patch_state; + + WARN_ON_ONCE(patch_state == KLP_UNDEFINED); + + if (patch_state == KLP_UNPATCHED) { + /* + * Use the previously patched version of the function. + * If no previous patches exist, continue with the + * original function. + */ + func = list_entry_rcu(func->stack_node.next, + struct klp_func, stack_node); + + if (&func->stack_node == &ops->func_stack) + goto unlock; + } + } + klp_arch_set_pc(regs, (unsigned long)func->new_func); unlock: rcu_read_unlock(); @@ -211,3 +261,12 @@ int klp_patch_object(struct klp_object *obj) return 0; } + +void klp_unpatch_objects(struct klp_patch *patch) +{ + struct klp_object *obj; + + klp_for_each_object(patch, obj) + if (obj->patched) + klp_unpatch_object(obj); +} diff --git a/kernel/livepatch/patch.h b/kernel/livepatch/patch.h index 2d0cce02dade..0db227170c36 100644 --- a/kernel/livepatch/patch.h +++ b/kernel/livepatch/patch.h @@ -28,5 +28,6 @@ struct klp_ops *klp_find_ops(unsigned long old_addr); int klp_patch_object(struct klp_object *obj); void klp_unpatch_object(struct klp_object *obj); +void klp_unpatch_objects(struct klp_patch *patch); #endif /* _LIVEPATCH_PATCH_H */ diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c new file mode 100644 index 000000000000..428533ec51b5 --- /dev/null +++ b/kernel/livepatch/transition.c @@ -0,0 +1,543 @@ +/* + * transition.c - Kernel Live Patching transition functions + * + * Copyright (C) 2015-2016 Josh Poimboeuf + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include "patch.h" +#include "transition.h" +#include "../sched/sched.h" + +#define MAX_STACK_ENTRIES 100 +#define STACK_ERR_BUF_SIZE 128 + +extern struct mutex klp_mutex; + +struct klp_patch *klp_transition_patch; + +static int klp_target_state = KLP_UNDEFINED; + +/* + * This work can be performed periodically to finish patching or unpatching any + * "straggler" tasks which failed to transition in the first attempt. + */ +static void klp_transition_work_fn(struct work_struct *work) +{ + mutex_lock(&klp_mutex); + + if (klp_transition_patch) + klp_try_complete_transition(); + + mutex_unlock(&klp_mutex); +} +static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn); + +/* + * The transition to the target patch state is complete. Clean up the data + * structures. + */ +static void klp_complete_transition(void) +{ + struct klp_object *obj; + struct klp_func *func; + struct task_struct *g, *task; + unsigned int cpu; + + if (klp_target_state == KLP_UNPATCHED) { + /* + * All tasks have transitioned to KLP_UNPATCHED so we can now + * remove the new functions from the func_stack. + */ + klp_unpatch_objects(klp_transition_patch); + + /* + * Make sure klp_ftrace_handler() can no longer see functions + * from this patch on the ops->func_stack. Otherwise, after + * func->transition gets cleared, the handler may choose a + * removed function. + */ + synchronize_rcu(); + } + + if (klp_transition_patch->immediate) + goto done; + + klp_for_each_object(klp_transition_patch, obj) + klp_for_each_func(obj, func) + func->transition = false; + + /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ + if (klp_target_state == KLP_PATCHED) + synchronize_rcu(); + + read_lock(&tasklist_lock); + for_each_process_thread(g, task) { + WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING)); + task->patch_state = KLP_UNDEFINED; + } + read_unlock(&tasklist_lock); + + for_each_possible_cpu(cpu) { + task = idle_task(cpu); + WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING)); + task->patch_state = KLP_UNDEFINED; + } + +done: + klp_target_state = KLP_UNDEFINED; + klp_transition_patch = NULL; +} + +/* + * This is called in the error path, to cancel a transition before it has + * started, i.e. klp_init_transition() has been called but + * klp_start_transition() hasn't. If the transition *has* been started, + * klp_reverse_transition() should be used instead. + */ +void klp_cancel_transition(void) +{ + klp_target_state = !klp_target_state; + klp_complete_transition(); +} + +/* + * Switch the patched state of the task to the set of functions in the target + * patch state. + * + * NOTE: If task is not 'current', the caller must ensure the task is inactive. + * Otherwise klp_ftrace_handler() might read the wrong 'patch_state' value. + */ +void klp_update_patch_state(struct task_struct *task) +{ + rcu_read_lock(); + + /* + * This test_and_clear_tsk_thread_flag() call also serves as a read + * barrier (smp_rmb) for two cases: + * + * 1) Enforce the order of the TIF_PATCH_PENDING read and the + * klp_target_state read. 
The corresponding write barrier is in + * klp_init_transition(). + * + * 2) Enforce the order of the TIF_PATCH_PENDING read and a future read + * of func->transition, if klp_ftrace_handler() is called later on + * the same CPU. See __klp_disable_patch(). + */ + if (test_and_clear_tsk_thread_flag(task, TIF_PATCH_PENDING)) + task->patch_state = READ_ONCE(klp_target_state); + + rcu_read_unlock(); +} + +/* + * Determine whether the given stack trace includes any references to a + * to-be-patched or to-be-unpatched function. + */ +static int klp_check_stack_func(struct klp_func *func, + struct stack_trace *trace) +{ + unsigned long func_addr, func_size, address; + struct klp_ops *ops; + int i; + + if (func->immediate) + return 0; + + for (i = 0; i < trace->nr_entries; i++) { + address = trace->entries[i]; + + if (klp_target_state == KLP_UNPATCHED) { + /* + * Check for the to-be-unpatched function + * (the func itself). + */ + func_addr = (unsigned long)func->new_func; + func_size = func->new_size; + } else { + /* + * Check for the to-be-patched function + * (the previous func). + */ + ops = klp_find_ops(func->old_addr); + + if (list_is_singular(&ops->func_stack)) { + /* original function */ + func_addr = func->old_addr; + func_size = func->old_size; + } else { + /* previously patched function */ + struct klp_func *prev; + + prev = list_next_entry(func, stack_node); + func_addr = (unsigned long)prev->new_func; + func_size = prev->new_size; + } + } + + if (address >= func_addr && address < func_addr + func_size) + return -EAGAIN; + } + + return 0; +} + +/* + * Determine whether it's safe to transition the task to the target patch state + * by looking for any to-be-patched or to-be-unpatched functions on its stack. + */ +static int klp_check_stack(struct task_struct *task, char *err_buf) +{ + static unsigned long entries[MAX_STACK_ENTRIES]; + struct stack_trace trace; + struct klp_object *obj; + struct klp_func *func; + int ret; + + trace.skip = 0; + trace.nr_entries = 0; + trace.max_entries = MAX_STACK_ENTRIES; + trace.entries = entries; + ret = save_stack_trace_tsk_reliable(task, &trace); + WARN_ON_ONCE(ret == -ENOSYS); + if (ret) { + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d has an unreliable stack\n", + __func__, task->comm, task->pid); + return ret; + } + + klp_for_each_object(klp_transition_patch, obj) { + if (!obj->patched) + continue; + klp_for_each_func(obj, func) { + ret = klp_check_stack_func(func, &trace); + if (ret) { + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d is sleeping on function %s\n", + __func__, task->comm, task->pid, + func->old_name); + return ret; + } + } + } + + return 0; +} + +/* + * Try to safely switch a task to the target patch state. If it's currently + * running, or it's sleeping on a to-be-patched or to-be-unpatched function, or + * if the stack is unreliable, return false. + */ +static bool klp_try_switch_task(struct task_struct *task) +{ + struct rq *rq; + struct rq_flags flags; + int ret; + bool success = false; + char err_buf[STACK_ERR_BUF_SIZE]; + + err_buf[0] = '\0'; + + /* check if this task has already switched over */ + if (task->patch_state == klp_target_state) + return true; + + /* + * For arches which don't have reliable stack traces, we have to rely + * on other methods (e.g., switching tasks at kernel exit). + */ + if (!klp_have_reliable_stack()) + return false; + + /* + * Now try to check the stack for any to-be-patched or to-be-unpatched + * functions. If all goes well, switch the task to the target patch + * state. 
+ */ + rq = task_rq_lock(task, &flags); + + if (task_running(rq, task) && task != current) { + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d is running\n", __func__, task->comm, + task->pid); + goto done; + } + + ret = klp_check_stack(task, err_buf); + if (ret) + goto done; + + success = true; + + clear_tsk_thread_flag(task, TIF_PATCH_PENDING); + task->patch_state = klp_target_state; + +done: + task_rq_unlock(rq, task, &flags); + + /* + * Due to console deadlock issues, pr_debug() can't be used while + * holding the task rq lock. Instead we have to use a temporary buffer + * and print the debug message after releasing the lock. + */ + if (err_buf[0] != '\0') + pr_debug("%s", err_buf); + + return success; + +} + +/* + * Try to switch all remaining tasks to the target patch state by walking the + * stacks of sleeping tasks and looking for any to-be-patched or + * to-be-unpatched functions. If such functions are found, the task can't be + * switched yet. + * + * If any tasks are still stuck in the initial patch state, schedule a retry. + */ +void klp_try_complete_transition(void) +{ + unsigned int cpu; + struct task_struct *g, *task; + bool complete = true; + + WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); + + /* + * If the patch can be applied or reverted immediately, skip the + * per-task transitions. + */ + if (klp_transition_patch->immediate) + goto success; + + /* + * Try to switch the tasks to the target patch state by walking their + * stacks and looking for any to-be-patched or to-be-unpatched + * functions. If such functions are found on a stack, or if the stack + * is deemed unreliable, the task can't be switched yet. + * + * Usually this will transition most (or all) of the tasks on a system + * unless the patch includes changes to a very common function. + */ + read_lock(&tasklist_lock); + for_each_process_thread(g, task) + if (!klp_try_switch_task(task)) + complete = false; + read_unlock(&tasklist_lock); + + /* + * Ditto for the idle "swapper" tasks. + */ + get_online_cpus(); + for_each_possible_cpu(cpu) { + task = idle_task(cpu); + if (cpu_online(cpu)) { + if (!klp_try_switch_task(task)) + complete = false; + } else if (task->patch_state != klp_target_state) { + /* offline idle tasks can be switched immediately */ + clear_tsk_thread_flag(task, TIF_PATCH_PENDING); + task->patch_state = klp_target_state; + } + } + put_online_cpus(); + + if (!complete) { + /* + * Some tasks weren't able to be switched over. Try again + * later and/or wait for other methods like kernel exit + * switching. + */ + schedule_delayed_work(&klp_transition_work, + round_jiffies_relative(HZ)); + return; + } + +success: + pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name, + klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + + /* we're done, now cleanup the data structures */ + klp_complete_transition(); +} + +/* + * Start the transition to the specified target patch state so tasks can begin + * switching to it. + */ +void klp_start_transition(void) +{ + struct task_struct *g, *task; + unsigned int cpu; + + WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); + + pr_notice("'%s': %s...\n", klp_transition_patch->mod->name, + klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + + /* + * If the patch can be applied or reverted immediately, skip the + * per-task transitions. + */ + if (klp_transition_patch->immediate) + return; + + /* + * Mark all normal tasks as needing a patch state update. 
They'll + * switch either in klp_try_complete_transition() or as they exit the + * kernel. + */ + read_lock(&tasklist_lock); + for_each_process_thread(g, task) + if (task->patch_state != klp_target_state) + set_tsk_thread_flag(task, TIF_PATCH_PENDING); + read_unlock(&tasklist_lock); + + /* + * Mark all idle tasks as needing a patch state update. They'll switch + * either in klp_try_complete_transition() or at the idle loop switch + * point. + */ + for_each_possible_cpu(cpu) { + task = idle_task(cpu); + if (task->patch_state != klp_target_state) + set_tsk_thread_flag(task, TIF_PATCH_PENDING); + } +} + +/* + * Initialize the global target patch state and all tasks to the initial patch + * state, and initialize all function transition states to true in preparation + * for patching or unpatching. + */ +void klp_init_transition(struct klp_patch *patch, int state) +{ + struct task_struct *g, *task; + unsigned int cpu; + struct klp_object *obj; + struct klp_func *func; + int initial_state = !state; + + WARN_ON_ONCE(klp_target_state != KLP_UNDEFINED); + + klp_transition_patch = patch; + + /* + * Set the global target patch state which tasks will switch to. This + * has no effect until the TIF_PATCH_PENDING flags get set later. + */ + klp_target_state = state; + + /* + * If the patch can be applied or reverted immediately, skip the + * per-task transitions. + */ + if (patch->immediate) + return; + + /* + * Initialize all tasks to the initial patch state to prepare them for + * switching to the target state. + */ + read_lock(&tasklist_lock); + for_each_process_thread(g, task) { + WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED); + task->patch_state = initial_state; + } + read_unlock(&tasklist_lock); + + /* + * Ditto for the idle "swapper" tasks. + */ + for_each_possible_cpu(cpu) { + task = idle_task(cpu); + WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED); + task->patch_state = initial_state; + } + + /* + * Enforce the order of the task->patch_state initializations and the + * func->transition updates to ensure that klp_ftrace_handler() doesn't + * see a func in transition with a task->patch_state of KLP_UNDEFINED. + * + * Also enforce the order of the klp_target_state write and future + * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() doesn't + * set a task->patch_state to KLP_UNDEFINED. + */ + smp_wmb(); + + /* + * Set the func transition states so klp_ftrace_handler() will know to + * switch to the transition logic. + * + * When patching, the funcs aren't yet in the func_stack and will be + * made visible to the ftrace handler shortly by the calls to + * klp_patch_object(). + * + * When unpatching, the funcs are already in the func_stack and so are + * already visible to the ftrace handler. + */ + klp_for_each_object(patch, obj) + klp_for_each_func(obj, func) + func->transition = true; +} + +/* + * This function can be called in the middle of an existing transition to + * reverse the direction of the target patch state. This can be done to + * effectively cancel an existing enable or disable operation if there are any + * tasks which are stuck in the initial patch state. + */ +void klp_reverse_transition(void) +{ + unsigned int cpu; + struct task_struct *g, *task; + + klp_transition_patch->enabled = !klp_transition_patch->enabled; + + klp_target_state = !klp_target_state; + + /* + * Clear all TIF_PATCH_PENDING flags to prevent races caused by + * klp_update_patch_state() running in parallel with + * klp_start_transition(). 
+ */
+	read_lock(&tasklist_lock);
+	for_each_process_thread(g, task)
+		clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
+	read_unlock(&tasklist_lock);
+
+	for_each_possible_cpu(cpu)
+		clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING);
+
+	/* Let any remaining calls to klp_update_patch_state() complete */
+	synchronize_rcu();
+
+	klp_start_transition();
+}
+
+/* Called from copy_process() during fork */
+void klp_copy_process(struct task_struct *child)
+{
+	child->patch_state = current->patch_state;
+
+	/* TIF_PATCH_PENDING gets copied in setup_thread_stack() */
+}
diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h
new file mode 100644
index 000000000000..ce09b326546c
--- /dev/null
+++ b/kernel/livepatch/transition.h
@@ -0,0 +1,14 @@
+#ifndef _LIVEPATCH_TRANSITION_H
+#define _LIVEPATCH_TRANSITION_H
+
+#include
+
+extern struct klp_patch *klp_transition_patch;
+
+void klp_init_transition(struct klp_patch *patch, int state);
+void klp_cancel_transition(void);
+void klp_start_transition(void);
+void klp_try_complete_transition(void);
+void klp_reverse_transition(void);
+
+#endif /* _LIVEPATCH_TRANSITION_H */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index ac6d5176463d..2a25a9ec2c6e 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include
 
 #include
 
@@ -265,6 +266,9 @@ static void do_idle(void)
 		sched_ttwu_pending();
 		schedule_preempt_disabled();
+
+		if (unlikely(klp_patch_pending(current)))
+			klp_update_patch_state(current);
 	}
 
 bool cpu_in_idle(unsigned long pc)
diff --git a/samples/livepatch/livepatch-sample.c b/samples/livepatch/livepatch-sample.c
index e34f871e69b1..629e0dca0887 100644
--- a/samples/livepatch/livepatch-sample.c
+++ b/samples/livepatch/livepatch-sample.c
@@ -17,6 +17,8 @@
  * along with this program; if not, see .
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include
 #include
 #include
@@ -69,6 +71,21 @@ static int livepatch_init(void)
 {
 	int ret;
 
+	if (!klp_have_reliable_stack() && !patch.immediate) {
+		/*
+		 * WARNING: Be very careful when using 'patch.immediate' in
+		 * your patches.  It's ok to use it for simple patches like
+		 * this, but for more complex patches which change function
+		 * semantics, locking semantics, or data structures, it may not
+		 * be safe.  Use of this option will also prevent removal of
+		 * the patch.
+		 *
+		 * See Documentation/livepatch/livepatch.txt for more details.
+		 */
+		patch.immediate = true;
+		pr_notice("The consistency model isn't supported for your architecture.  Bypassing safety mechanisms and applying the patch immediately.\n");
+	}
+
 	ret = klp_register_patch(&patch);
 	if (ret)
 		return ret;
--
cgit v1.2.3

From 3ec24776bfd09668079df7dca0c0136d80820ab4 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf
Date: Mon, 6 Mar 2017 11:20:29 -0600
Subject: livepatch: allow removal of a disabled patch

Currently we do not allow a patch module to unload, since there is no method to determine whether a task is still running in the patched code.

The consistency model gives us a way: when unpatching finishes, we know that all tasks were marked as safe to call the original functions. Thus every new call to a function runs the original code, and at the same time no task can be anywhere in the patched code, because it had to leave that code in order to be marked as safe. We can safely let the patch module go after that.
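To make the resulting reference-counting discipline concrete, here is a minimal sketch in plain C. This is an illustrative userspace mock, not the kernel code from this patch: module_refcount, the helper functions and main() below are invented for the example; only the pairing of try_module_get() on enable with module_put() on completed unpatching mirrors the actual change.

#include <assert.h>
#include <stdio.h>

#define KLP_UNPATCHED	0
#define KLP_PATCHED	1

static int module_refcount;	/* stands in for the patch module's refcount */
static int patch_enabled;

static int try_module_get(void) { module_refcount++; return 1; }
static void module_put(void) { module_refcount--; }

static int klp_enable_patch(void)
{
	if (!try_module_get())	/* reference taken on enable ... */
		return -1;
	patch_enabled = 1;
	return 0;
}

static void klp_disable_patch(void)
{
	patch_enabled = 0;	/* starts the transition to KLP_UNPATCHED */
}

static void klp_complete_transition(int target_state)
{
	if (target_state == KLP_UNPATCHED) {
		assert(!patch_enabled);
		module_put();	/* ... dropped once unpatching has completed */
	}
}

int main(void)
{
	klp_enable_patch();
	assert(module_refcount == 1);	/* module pinned while possibly in use */

	klp_disable_patch();
	klp_complete_transition(KLP_UNPATCHED);
	assert(module_refcount == 0);	/* nothing blocks unloading now */

	printf("refcount balanced: %d\n", module_refcount);
	return 0;
}

The point of the symmetry is that a patch which has been disabled and fully unpatched always ends with a zero reference count, so module removal is never blocked by a stale reference.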
Completion is used for synchronization between module removal and the sysfs infrastructure, in a similar way to commit 942e443127e9 ("module: Fix mod->mkobj.kobj potentially freed too early").

Note that we still do not allow the removal for the immediate model, i.e. when no consistency model is used. The module refcount may increase in this case if somebody disables and enables the patch several times. This should not cause any harm.

With this change, the call to try_module_get() is moved from klp_register_patch() to __klp_enable_patch() to make the module reference counting symmetric (module_put() is in the patch disable path) and to allow taking a new reference to a disabled module when it is being enabled.

Finally, we need to be very careful about possible races between klp_unregister_patch(), the kobject_put() functions and operations on the related sysfs files. kobject_put(&patch->kobj) must be called without klp_mutex. Otherwise, it might be blocked by enabled_store(), which needs the mutex as well. In addition, enabled_store() must check whether the patch was unregistered in the meantime. There is no need to do the same for the other kobject_put() callsites at the moment. Their sysfs operations neither take the lock nor access any data that might be freed in the meantime.

There was an attempt to use kobjects the right way and prevent these races by design. But it made the patch definition more complicated and opened another can of worms. See https://lkml.kernel.org/r/1464018848-4303-1-git-send-email-pmladek@suse.com

[Thanks to Petr Mladek for improving the commit message.]

Signed-off-by: Miroslav Benes
Signed-off-by: Josh Poimboeuf
Reviewed-by: Petr Mladek
Acked-by: Miroslav Benes
Signed-off-by: Jiri Kosina
---
 Documentation/livepatch/livepatch.txt | 28 ++++--------
 include/linux/livepatch.h | 3 ++
 kernel/livepatch/core.c | 80 ++++++++++++++++++++++-------------
 kernel/livepatch/transition.c | 37 ++++++++++++++--
 samples/livepatch/livepatch-sample.c | 1 -
 5 files changed, 96 insertions(+), 53 deletions(-)
(limited to 'include/linux')

diff --git a/Documentation/livepatch/livepatch.txt b/Documentation/livepatch/livepatch.txt
index 4f2aec8d4c12..ecdb18104ab0 100644
--- a/Documentation/livepatch/livepatch.txt
+++ b/Documentation/livepatch/livepatch.txt
@@ -316,8 +316,15 @@ section "Livepatch life-cycle" below for more details about these two
 operations.
 
 Module removal is only safe when there are no users of the underlying
-functions. The immediate consistency model is not able to detect this;
-therefore livepatch modules cannot be removed. See "Limitations" below.
+functions. The immediate consistency model is not able to detect this. The
+code just redirects the functions at the very beginning and it does not
+check if the functions are in use. In other words, it knows when the
+functions get called but it does not know when the functions return.
+Therefore it cannot be decided when the livepatch module can be safely
+removed. This is solved by a hybrid consistency model. When the system is
+transitioned to a new patch state (patched/unpatched) it is guaranteed that
+no task sleeps or runs in the old code.
+
 5. Livepatch life-cycle
 =======================
@@ -469,23 +476,6 @@ The current Livepatch implementation has several limitations:
     by "notrace".
 
-  + Livepatch modules can not be removed.
-
-    The current implementation just redirects the functions at the very
-    beginning. It does not check if the functions are in use. In other
-    words, it knows when the functions get called but it does not
-    know when the functions return.
Therefore it can not decide when - the livepatch module can be safely removed. - - This will get most likely solved once a more complex consistency model - is supported. The idea is that a safe state for patching should also - mean a safe state for removing the patch. - - Note that the patch itself might get disabled by writing zero - to /sys/kernel/livepatch//enabled. It causes that the new - code will not longer get called. But it does not guarantee - that anyone is not sleeping anywhere in the new code. - + Livepatch works reliably only when the dynamic ftrace is located at the very beginning of the function. diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index ed90ad1605c1..194991ef9347 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -23,6 +23,7 @@ #include #include +#include #if IS_ENABLED(CONFIG_LIVEPATCH) @@ -114,6 +115,7 @@ struct klp_object { * @list: list node for global list of registered patches * @kobj: kobject for sysfs resources * @enabled: the patch is enabled (but operation may be incomplete) + * @finish: for waiting till it is safe to remove the patch module */ struct klp_patch { /* external */ @@ -125,6 +127,7 @@ struct klp_patch { struct list_head list; struct kobject kobj; bool enabled; + struct completion finish; }; #define klp_for_each_object(patch, obj) \ diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 3dc3c9049690..6844c1213df8 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "patch.h" #include "transition.h" @@ -354,6 +355,18 @@ static int __klp_enable_patch(struct klp_patch *patch) !list_prev_entry(patch, list)->enabled) return -EBUSY; + /* + * A reference is taken on the patch module to prevent it from being + * unloaded. + * + * Note: For immediate (no consistency model) patches we don't allow + * patch modules to unload since there is no safe/sane method to + * determine if a thread is still running in the patched code contained + * in the patch module once the ftrace registration is successful. + */ + if (!try_module_get(patch->mod)) + return -ENODEV; + pr_notice("enabling patch '%s'\n", patch->mod->name); klp_init_transition(patch, KLP_PATCHED); @@ -442,6 +455,15 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, mutex_lock(&klp_mutex); + if (!klp_is_patch_registered(patch)) { + /* + * Module with the patch could either disappear meanwhile or is + * not properly initialized yet. + */ + ret = -EINVAL; + goto err; + } + if (patch->enabled == enabled) { /* already in requested state */ ret = -EINVAL; @@ -498,10 +520,10 @@ static struct attribute *klp_patch_attrs[] = { static void klp_kobj_release_patch(struct kobject *kobj) { - /* - * Once we have a consistency model we'll need to module_put() the - * patch module here. See klp_register_patch() for more details. 
- */ + struct klp_patch *patch; + + patch = container_of(kobj, struct klp_patch, kobj); + complete(&patch->finish); } static struct kobj_type klp_ktype_patch = { @@ -572,7 +594,6 @@ static void klp_free_patch(struct klp_patch *patch) klp_free_objects_limited(patch, NULL); if (!list_empty(&patch->list)) list_del(&patch->list); - kobject_put(&patch->kobj); } static int klp_init_func(struct klp_object *obj, struct klp_func *func) @@ -695,11 +716,14 @@ static int klp_init_patch(struct klp_patch *patch) mutex_lock(&klp_mutex); patch->enabled = false; + init_completion(&patch->finish); ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, klp_root_kobj, "%s", patch->mod->name); - if (ret) - goto unlock; + if (ret) { + mutex_unlock(&klp_mutex); + return ret; + } klp_for_each_object(patch, obj) { ret = klp_init_object(patch, obj); @@ -715,9 +739,12 @@ static int klp_init_patch(struct klp_patch *patch) free: klp_free_objects_limited(patch, obj); - kobject_put(&patch->kobj); -unlock: + mutex_unlock(&klp_mutex); + + kobject_put(&patch->kobj); + wait_for_completion(&patch->finish); + return ret; } @@ -731,23 +758,29 @@ unlock: */ int klp_unregister_patch(struct klp_patch *patch) { - int ret = 0; + int ret; mutex_lock(&klp_mutex); if (!klp_is_patch_registered(patch)) { ret = -EINVAL; - goto out; + goto err; } if (patch->enabled) { ret = -EBUSY; - goto out; + goto err; } klp_free_patch(patch); -out: + mutex_unlock(&klp_mutex); + + kobject_put(&patch->kobj); + wait_for_completion(&patch->finish); + + return 0; +err: mutex_unlock(&klp_mutex); return ret; } @@ -760,12 +793,13 @@ EXPORT_SYMBOL_GPL(klp_unregister_patch); * Initializes the data structure associated with the patch and * creates the sysfs interface. * + * There is no need to take the reference on the patch module here. It is done + * later when the patch is enabled. + * * Return: 0 on success, otherwise error */ int klp_register_patch(struct klp_patch *patch) { - int ret; - if (!patch || !patch->mod) return -EINVAL; @@ -788,21 +822,7 @@ int klp_register_patch(struct klp_patch *patch) return -ENOSYS; } - /* - * A reference is taken on the patch module to prevent it from being - * unloaded. Right now, we don't allow patch modules to unload since - * there is currently no method to determine if a thread is still - * running in the patched code contained in the patch module once - * the ftrace registration is successful. 
- */ - if (!try_module_get(patch->mod)) - return -ENODEV; - - ret = klp_init_patch(patch); - if (ret) - module_put(patch->mod); - - return ret; + return klp_init_patch(patch); } EXPORT_SYMBOL_GPL(klp_register_patch); diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 428533ec51b5..0ab7abd53b0b 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -59,6 +59,7 @@ static void klp_complete_transition(void) struct klp_func *func; struct task_struct *g, *task; unsigned int cpu; + bool immediate_func = false; if (klp_target_state == KLP_UNPATCHED) { /* @@ -79,9 +80,16 @@ static void klp_complete_transition(void) if (klp_transition_patch->immediate) goto done; - klp_for_each_object(klp_transition_patch, obj) - klp_for_each_func(obj, func) + klp_for_each_object(klp_transition_patch, obj) { + klp_for_each_func(obj, func) { func->transition = false; + if (func->immediate) + immediate_func = true; + } + } + + if (klp_target_state == KLP_UNPATCHED && !immediate_func) + module_put(klp_transition_patch->mod); /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ if (klp_target_state == KLP_PATCHED) @@ -113,8 +121,31 @@ done: */ void klp_cancel_transition(void) { - klp_target_state = !klp_target_state; + struct klp_patch *patch = klp_transition_patch; + struct klp_object *obj; + struct klp_func *func; + bool immediate_func = false; + + if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED)) + return; + + klp_target_state = KLP_UNPATCHED; klp_complete_transition(); + + /* + * In the enable error path, even immediate patches can be safely + * removed because the transition hasn't been started yet. + * + * klp_complete_transition() doesn't have a module_put() for immediate + * patches, so do it here. + */ + klp_for_each_object(patch, obj) + klp_for_each_func(obj, func) + if (func->immediate) + immediate_func = true; + + if (patch->immediate || immediate_func) + module_put(patch->mod); } /* diff --git a/samples/livepatch/livepatch-sample.c b/samples/livepatch/livepatch-sample.c index 629e0dca0887..84795223f15f 100644 --- a/samples/livepatch/livepatch-sample.c +++ b/samples/livepatch/livepatch-sample.c @@ -99,7 +99,6 @@ static int livepatch_init(void) static void livepatch_exit(void) { - WARN_ON(klp_disable_patch(&patch)); WARN_ON(klp_unregister_patch(&patch)); } -- cgit v1.2.3 From 923386f761f5ff35da2ac778839876fe4a2f165f Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Fri, 18 Dec 2015 15:36:57 +0200 Subject: clk: ti: remove un-used definitions from public clk_hw_omap struct Clksel support has been deprecated a while back, so remove these from the struct also. 
Signed-off-by: Tero Kristo Acked-by: Tony Lindgren --- include/linux/clk/ti.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index 6110fe09ed18..07308db5a15d 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -129,8 +129,6 @@ struct clk_hw_omap_ops { * @enable_bit: bitshift to write to enable/disable the clock (see @enable_reg) * @flags: see "struct clk.flags possibilities" above * @clksel_reg: for clksel clks, register va containing src/divisor select - * @clksel_mask: bitmask in @clksel_reg for the src/divisor selector - * @clksel: for clksel clks, pointer to struct clksel for this clock * @dpll_data: for DPLLs, pointer to struct dpll_data for this clock * @clkdm_name: clockdomain name that this clock is contained in * @clkdm: pointer to struct clockdomain, resolved from @clkdm_name at runtime @@ -145,8 +143,6 @@ struct clk_hw_omap { u8 enable_bit; u8 flags; void __iomem *clksel_reg; - u32 clksel_mask; - const struct clksel *clksel; struct dpll_data *dpll_data; const char *clkdm_name; struct clockdomain *clkdm; -- cgit v1.2.3 From b6f27b2db2df395d65b02a758861c7fc54edbec1 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Fri, 30 Sep 2016 14:10:11 +0300 Subject: clk: ti: add clkdm_lookup to the exported functions This will be needed to move some additional clockdomain functionality under clock driver. Signed-off-by: Tero Kristo Acked-by: Tony Lindgren --- arch/arm/mach-omap2/clock.c | 1 + include/linux/clk/ti.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/arch/arm/mach-omap2/clock.c b/arch/arm/mach-omap2/clock.c index 1270afdcacdf..6fac82609c96 100644 --- a/arch/arm/mach-omap2/clock.c +++ b/arch/arm/mach-omap2/clock.c @@ -57,6 +57,7 @@ u16 cpu_mask; static struct ti_clk_ll_ops omap_clk_ll_ops = { .clkdm_clk_enable = clkdm_clk_enable, .clkdm_clk_disable = clkdm_clk_disable, + .clkdm_lookup = clkdm_lookup, .cm_wait_module_ready = omap_cm_wait_module_ready, .cm_split_idlest_reg = cm_split_idlest_reg, }; diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index 07308db5a15d..bc7fd8f0fb5d 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -213,6 +213,7 @@ struct clk_omap_reg { * @clk_writel: pointer to register write function * @clkdm_clk_enable: pointer to clockdomain enable function * @clkdm_clk_disable: pointer to clockdomain disable function + * @clkdm_lookup: pointer to clockdomain lookup function * @cm_wait_module_ready: pointer to CM module wait ready function * @cm_split_idlest_reg: pointer to CM module function to split idlest reg * @@ -228,6 +229,7 @@ struct ti_clk_ll_ops { int (*clkdm_clk_enable)(struct clockdomain *clkdm, struct clk *clk); int (*clkdm_clk_disable)(struct clockdomain *clkdm, struct clk *clk); + struct clockdomain * (*clkdm_lookup)(const char *name); int (*cm_wait_module_ready)(u8 part, s16 prcm_mod, u16 idlest_reg, u8 idlest_shift); int (*cm_split_idlest_reg)(void __iomem *idlest_reg, s16 *prcm_inst, -- cgit v1.2.3 From 2e1a294c0f2273a6d3537c91965ca46a6483bd8c Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Fri, 30 Sep 2016 14:13:38 +0300 Subject: clk: ti: move omap2_init_clk_clkdm under TI clock driver This is not needed outside the driver, so move it inside it and remove the prototype from the public header also. 
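The pattern behind this commit and the preceding clkdm_lookup one — the platform code hands the clock driver a table of low-level callbacks, and the driver resolves clockdomains only through that table — can be sketched with a self-contained C mock. All types and data below are simplified stand-ins invented for illustration; the real struct ti_clk_ll_ops and the clockdomain code live in drivers/clk/ti and mach-omap2 respectively.

#include <stdio.h>
#include <string.h>

struct clockdomain { const char *name; };

struct ti_clk_ll_ops {
	struct clockdomain *(*clkdm_lookup)(const char *name);
};

/* the driver holds only this pointer, never a platform symbol */
static const struct ti_clk_ll_ops *ti_clk_ll_ops;

/* --- platform side (mach-omap2 in the real kernel) --- */
static struct clockdomain clkdms[] = { { "core_l3_clkdm" }, { "dpll_clkdm" } };

static struct clockdomain *omap_clkdm_lookup(const char *name)
{
	for (size_t i = 0; i < sizeof(clkdms) / sizeof(clkdms[0]); i++)
		if (!strcmp(clkdms[i].name, name))
			return &clkdms[i];
	return NULL;
}

static const struct ti_clk_ll_ops omap_clk_ll_ops = {
	.clkdm_lookup = omap_clkdm_lookup,
};

/* --- driver side (drivers/clk/ti in the real kernel) --- */
static void init_clk_clkdm(const char *clkdm_name)
{
	struct clockdomain *clkdm = ti_clk_ll_ops->clkdm_lookup(clkdm_name);

	if (clkdm)
		printf("associated clk to clkdm %s\n", clkdm->name);
	else
		printf("could not associate clk to clkdm %s\n", clkdm_name);
}

int main(void)
{
	ti_clk_ll_ops = &omap_clk_ll_ops;	/* platform registers its ops once */
	init_clk_clkdm("core_l3_clkdm");
	init_clk_clkdm("nonexistent_clkdm");
	return 0;
}

With the lookup routed through the ops table, omap2_init_clk_clkdm() no longer needs to be visible outside the driver, which is what lets this commit drop it from the public header.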
Signed-off-by: Tero Kristo Acked-by: Tony Lindgren --- arch/arm/mach-omap2/clock.c | 32 -------------------------------- drivers/clk/ti/clock.h | 1 + drivers/clk/ti/clockdomain.c | 30 ++++++++++++++++++++++++++++++ include/linux/clk/ti.h | 1 - 4 files changed, 31 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-omap2/clock.c b/arch/arm/mach-omap2/clock.c index 6fac82609c96..ae5b23c19b83 100644 --- a/arch/arm/mach-omap2/clock.c +++ b/arch/arm/mach-omap2/clock.c @@ -79,38 +79,6 @@ int __init omap2_clk_setup_ll_ops(void) * OMAP2+ specific clock functions */ -/* Public functions */ - -/** - * omap2_init_clk_clkdm - look up a clockdomain name, store pointer in clk - * @clk: OMAP clock struct ptr to use - * - * Convert a clockdomain name stored in a struct clk 'clk' into a - * clockdomain pointer, and save it into the struct clk. Intended to be - * called during clk_register(). No return value. - */ -void omap2_init_clk_clkdm(struct clk_hw *hw) -{ - struct clk_hw_omap *clk = to_clk_hw_omap(hw); - struct clockdomain *clkdm; - const char *clk_name; - - if (!clk->clkdm_name) - return; - - clk_name = __clk_get_name(hw->clk); - - clkdm = clkdm_lookup(clk->clkdm_name); - if (clkdm) { - pr_debug("clock: associated clk %s to clkdm %s\n", - clk_name, clk->clkdm_name); - clk->clkdm = clkdm; - } else { - pr_debug("clock: could not associate clk %s to clkdm %s\n", - clk_name, clk->clkdm_name); - } -} - /** * ti_clk_init_features - init clock features struct for the SoC * diff --git a/drivers/clk/ti/clock.h b/drivers/clk/ti/clock.h index ee6d22507a3d..ecf82d86118c 100644 --- a/drivers/clk/ti/clock.h +++ b/drivers/clk/ti/clock.h @@ -228,6 +228,7 @@ extern const struct clk_hw_omap_ops clkhwops_am35xx_ipss_wait; extern const struct clk_ops ti_clk_divider_ops; extern const struct clk_ops ti_clk_mux_ops; +void omap2_init_clk_clkdm(struct clk_hw *hw); int omap2_clkops_enable_clkdm(struct clk_hw *hw); void omap2_clkops_disable_clkdm(struct clk_hw *hw); diff --git a/drivers/clk/ti/clockdomain.c b/drivers/clk/ti/clockdomain.c index 6cf9dd189a92..704157d8c0b7 100644 --- a/drivers/clk/ti/clockdomain.c +++ b/drivers/clk/ti/clockdomain.c @@ -103,6 +103,36 @@ void omap2_clkops_disable_clkdm(struct clk_hw *hw) ti_clk_ll_ops->clkdm_clk_disable(clk->clkdm, hw->clk); } +/** + * omap2_init_clk_clkdm - look up a clockdomain name, store pointer in clk + * @clk: OMAP clock struct ptr to use + * + * Convert a clockdomain name stored in a struct clk 'clk' into a + * clockdomain pointer, and save it into the struct clk. Intended to be + * called during clk_register(). No return value. 
+ */ +void omap2_init_clk_clkdm(struct clk_hw *hw) +{ + struct clk_hw_omap *clk = to_clk_hw_omap(hw); + struct clockdomain *clkdm; + const char *clk_name; + + if (!clk->clkdm_name) + return; + + clk_name = __clk_get_name(hw->clk); + + clkdm = ti_clk_ll_ops->clkdm_lookup(clk->clkdm_name); + if (clkdm) { + pr_debug("clock: associated clk %s to clkdm %s\n", + clk_name, clk->clkdm_name); + clk->clkdm = clkdm; + } else { + pr_debug("clock: could not associate clk %s to clkdm %s\n", + clk_name, clk->clkdm_name); + } +} + static void __init of_ti_clockdomain_setup(struct device_node *node) { struct clk *clk; diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index bc7fd8f0fb5d..626ae94b7444 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -238,7 +238,6 @@ struct ti_clk_ll_ops { #define to_clk_hw_omap(_hw) container_of(_hw, struct clk_hw_omap, hw) -void omap2_init_clk_clkdm(struct clk_hw *clk); int omap2_clk_disable_autoidle_all(void); int omap2_clk_enable_autoidle_all(void); int omap2_clk_allow_idle(struct clk *clk); -- cgit v1.2.3 From c91f07801f144920f8467486a1e36e42ed9d9ff2 Mon Sep 17 00:00:00 2001 From: Tero Kristo Date: Mon, 30 Jan 2017 16:01:36 +0200 Subject: clk: ti: drop unnecessary MEMMAP_ADDRESSING flag This has been superceded by the usage of ti_clk_ll_ops for now. Signed-off-by: Tero Kristo Acked-by: Tony Lindgren --- drivers/clk/ti/apll.c | 1 - drivers/clk/ti/dpll.c | 2 -- drivers/clk/ti/gate.c | 4 +--- drivers/clk/ti/interface.c | 1 - include/linux/clk/ti.h | 2 -- 5 files changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/ti/apll.c b/drivers/clk/ti/apll.c index 62b5db7b8fe9..5cba28c6ab35 100644 --- a/drivers/clk/ti/apll.c +++ b/drivers/clk/ti/apll.c @@ -194,7 +194,6 @@ static void __init of_dra7_apll_setup(struct device_node *node) clk_hw->dpll_data = ad; clk_hw->hw.init = init; - clk_hw->flags = MEMMAP_ADDRESSING; init->name = node->name; init->ops = &apll_ck_ops; diff --git a/drivers/clk/ti/dpll.c b/drivers/clk/ti/dpll.c index 37624e16cf04..c149bd169f43 100644 --- a/drivers/clk/ti/dpll.c +++ b/drivers/clk/ti/dpll.c @@ -248,7 +248,6 @@ struct clk *ti_clk_register_dpll(struct ti_clk *setup) clk_hw->dpll_data = dd; clk_hw->ops = &clkhwops_omap3_dpll; clk_hw->hw.init = &init; - clk_hw->flags = MEMMAP_ADDRESSING; init.name = setup->name; init.ops = ops; @@ -380,7 +379,6 @@ static void __init of_ti_dpll_setup(struct device_node *node, clk_hw->dpll_data = dd; clk_hw->ops = &clkhwops_omap3_dpll; clk_hw->hw.init = init; - clk_hw->flags = MEMMAP_ADDRESSING; init->name = node->name; init->ops = ops; diff --git a/drivers/clk/ti/gate.c b/drivers/clk/ti/gate.c index b3291dbca5ed..5ff62e2c63d0 100644 --- a/drivers/clk/ti/gate.c +++ b/drivers/clk/ti/gate.c @@ -113,7 +113,7 @@ static struct clk *_register_gate(struct device *dev, const char *name, clk_hw->enable_bit = bit_idx; clk_hw->ops = hw_ops; - clk_hw->flags = MEMMAP_ADDRESSING | clk_gate_flags; + clk_hw->flags = clk_gate_flags; init.parent_names = &parent_name; init.num_parents = 1; @@ -203,7 +203,6 @@ struct clk_hw *ti_clk_build_component_gate(struct ti_clk_gate *setup) ops = &clkhwops_iclk_wait; gate->ops = ops; - gate->flags = MEMMAP_ADDRESSING; return &gate->hw; } @@ -269,7 +268,6 @@ _of_ti_composite_gate_clk_setup(struct device_node *node, gate->enable_bit = val; gate->ops = hw_ops; - gate->flags = MEMMAP_ADDRESSING; if (!ti_clk_add_component(node, &gate->hw, CLK_COMPONENT_TYPE_GATE)) return; diff --git a/drivers/clk/ti/interface.c b/drivers/clk/ti/interface.c 
index 7927e1a2ba02..42d9fd4f5f6a 100644
--- a/drivers/clk/ti/interface.c
+++ b/drivers/clk/ti/interface.c
@@ -47,7 +47,6 @@ static struct clk *_register_interface(struct device *dev, const char *name,
 	clk_hw->hw.init = &init;
 	clk_hw->ops = ops;
-	clk_hw->flags = MEMMAP_ADDRESSING;
 	clk_hw->enable_reg = reg;
 	clk_hw->enable_bit = bit_idx;
diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h
index 626ae94b7444..affdabd0b6a1 100644
--- a/include/linux/clk/ti.h
+++ b/include/linux/clk/ti.h
@@ -168,7 +168,6 @@ struct clk_hw_omap {
  * should be used.  This is a temporary solution - a better approach
  * would be to associate clock type-specific data with the clock,
  * similar to the struct dpll_data approach.
- * MEMMAP_ADDRESSING: Use memmap addressing to access clock registers.
  */
 #define ENABLE_REG_32BIT	(1 << 0)	/* Use 32-bit access */
 #define CLOCK_IDLE_CONTROL	(1 << 1)
@@ -176,7 +175,6 @@ struct clk_hw_omap {
 #define ENABLE_ON_INIT	(1 << 3)	/* Enable upon framework init */
 #define INVERT_ENABLE	(1 << 4)	/* 0 enables, 1 disables */
 #define CLOCK_CLKOUTX2	(1 << 5)
-#define MEMMAP_ADDRESSING	(1 << 6)
 
 /* CM_CLKEN_PLL*.EN* bit values - not all are available for every DPLL */
 #define DPLL_LOW_POWER_STOP	0x1
--
cgit v1.2.3

From 6c0afb503937a12a8d20a805fcf263e31afa9871 Mon Sep 17 00:00:00 2001
From: Tero Kristo
Date: Thu, 9 Feb 2017 11:24:37 +0200
Subject: clk: ti: convert to use proper register definition for all accesses

Currently, the TI clock driver stores all register addresses in an encapsulated struct that is cast into a void pointer. This is rather nasty hackery, and it also prevents expanding the register address field.

Instead, replace all the code with a proper struct that contains all the previously used data. This patch is rather large because it touches multiple files, but it can't be split up, since we need to avoid any boot breakage.
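The shape of the conversion can be shown in isolation. The sketch below is a self-contained userspace mock, not the kernel code: the memory-mapped I/O is faked with plain arrays and the regmap path is omitted; only the idea — a real struct carrying index/offset plus an optional direct pointer, instead of coordinates cast into a void pointer — follows the patch.

#include <stdint.h>
#include <stdio.h>

#define CLK_MAX_MEMMAPS 8

struct clk_omap_reg {
	uint16_t offset;	/* byte offset within the provider's region */
	uint8_t  index;		/* which memmap/provider the register lives in */
	uint32_t *ptr;		/* optional direct pointer, bypasses the lookup */
};

/* stand-ins for the ioremapped clock provider regions */
static uint32_t fake_io[CLK_MAX_MEMMAPS][0x100];

static uint32_t clk_memmap_readl(const struct clk_omap_reg *reg)
{
	if (reg->ptr)		/* a direct pointer wins */
		return *reg->ptr;
	return fake_io[reg->index][reg->offset / 4];
}

static void clk_memmap_writel(uint32_t val, const struct clk_omap_reg *reg)
{
	if (reg->ptr)
		*reg->ptr = val;
	else
		fake_io[reg->index][reg->offset / 4] = val;
}

int main(void)
{
	struct clk_omap_reg enable_reg = { .index = 1, .offset = 0x20 };

	clk_memmap_writel(0x2, &enable_reg);
	printf("enable reg: 0x%x\n", clk_memmap_readl(&enable_reg));

	/* derived registers become plain struct copies plus offset math,
	 * mirroring the idlest_reg->offset manipulation in the diffs below */
	struct clk_omap_reg idlest_reg = enable_reg;
	idlest_reg.offset &= ~0xf0;
	idlest_reg.offset |= 0x20;
	printf("idlest offset: 0x%x\n", idlest_reg.offset);
	return 0;
}

Because the coordinates are an ordinary struct, new fields such as the direct ptr can be added without overflowing a pointer-sized encoding, which is exactly the limitation the commit message calls out.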
Signed-off-by: Tero Kristo Acked-by: Tony Lindgren --- arch/arm/mach-omap2/clkt2xxx_dpllcore.c | 3 +- arch/arm/mach-omap2/clock.c | 2 +- arch/arm/mach-omap2/clock.h | 2 ++ arch/arm/mach-omap2/cm.h | 5 +-- arch/arm/mach-omap2/cm2xxx.c | 9 ++--- arch/arm/mach-omap2/cm3xxx.c | 10 ++---- arch/arm/mach-omap2/cm_common.c | 2 +- drivers/clk/ti/apll.c | 47 ++++++++++++------------- drivers/clk/ti/autoidle.c | 18 +++++----- drivers/clk/ti/clk-3xxx.c | 55 +++++++++++++++-------------- drivers/clk/ti/clk.c | 47 ++++++++++++------------- drivers/clk/ti/clkt_dflt.c | 61 ++++++++++++--------------------- drivers/clk/ti/clkt_dpll.c | 6 ++-- drivers/clk/ti/clkt_iclk.c | 29 ++++++++-------- drivers/clk/ti/clock.h | 11 +++--- drivers/clk/ti/clockdomain.c | 8 ----- drivers/clk/ti/divider.c | 24 +++++++------ drivers/clk/ti/dpll.c | 48 ++++++++++---------------- drivers/clk/ti/dpll3xxx.c | 38 ++++++++++---------- drivers/clk/ti/dpll44xx.c | 14 ++++---- drivers/clk/ti/gate.c | 32 ++++++++--------- drivers/clk/ti/interface.c | 22 ++++++------ drivers/clk/ti/mux.c | 41 +++++++++------------- include/linux/clk/ti.h | 46 +++++++++++++------------ 24 files changed, 264 insertions(+), 316 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-omap2/clkt2xxx_dpllcore.c b/arch/arm/mach-omap2/clkt2xxx_dpllcore.c index 59cf310bc1e9..e8d417309f33 100644 --- a/arch/arm/mach-omap2/clkt2xxx_dpllcore.c +++ b/arch/arm/mach-omap2/clkt2xxx_dpllcore.c @@ -138,7 +138,8 @@ int omap2_reprogram_dpllcore(struct clk_hw *hw, unsigned long rate, if (!dd) return -EINVAL; - tmpset.cm_clksel1_pll = readl_relaxed(dd->mult_div1_reg); + tmpset.cm_clksel1_pll = + omap_clk_ll_ops.clk_readl(&dd->mult_div1_reg); tmpset.cm_clksel1_pll &= ~(dd->mult_mask | dd->div1_mask); div = ((curr_prcm_set->xtal_speed / 1000000) - 1); diff --git a/arch/arm/mach-omap2/clock.c b/arch/arm/mach-omap2/clock.c index ae5b23c19b83..42881f21cede 100644 --- a/arch/arm/mach-omap2/clock.c +++ b/arch/arm/mach-omap2/clock.c @@ -54,7 +54,7 @@ u16 cpu_mask; #define OMAP3PLUS_DPLL_FINT_MIN 32000 #define OMAP3PLUS_DPLL_FINT_MAX 52000000 -static struct ti_clk_ll_ops omap_clk_ll_ops = { +struct ti_clk_ll_ops omap_clk_ll_ops = { .clkdm_clk_enable = clkdm_clk_enable, .clkdm_clk_disable = clkdm_clk_disable, .clkdm_lookup = clkdm_lookup, diff --git a/arch/arm/mach-omap2/clock.h b/arch/arm/mach-omap2/clock.h index 4e66295dca25..cf45550197e6 100644 --- a/arch/arm/mach-omap2/clock.h +++ b/arch/arm/mach-omap2/clock.h @@ -64,6 +64,8 @@ #define OMAP4XXX_EN_DPLL_FRBYPASS 0x6 #define OMAP4XXX_EN_DPLL_LOCKED 0x7 +extern struct ti_clk_ll_ops omap_clk_ll_ops; + extern u16 cpu_mask; extern const struct clkops clkops_omap2_dflt_wait; diff --git a/arch/arm/mach-omap2/cm.h b/arch/arm/mach-omap2/cm.h index 1fe3e6b833d2..de75cbcdc9d1 100644 --- a/arch/arm/mach-omap2/cm.h +++ b/arch/arm/mach-omap2/cm.h @@ -23,6 +23,7 @@ #define MAX_MODULE_READY_TIME 2000 # ifndef __ASSEMBLER__ +#include extern void __iomem *cm_base; extern void __iomem *cm2_base; extern void omap2_set_globals_cm(void __iomem *cm, void __iomem *cm2); @@ -50,7 +51,7 @@ extern void omap2_set_globals_cm(void __iomem *cm, void __iomem *cm2); * @module_disable: ptr to the SoC CM-specific module_disable impl */ struct cm_ll_data { - int (*split_idlest_reg)(void __iomem *idlest_reg, s16 *prcm_inst, + int (*split_idlest_reg)(struct clk_omap_reg *idlest_reg, s16 *prcm_inst, u8 *idlest_reg_id); int (*wait_module_ready)(u8 part, s16 prcm_mod, u16 idlest_reg, u8 idlest_shift); @@ -60,7 +61,7 @@ struct cm_ll_data { void 
(*module_disable)(u8 part, u16 inst, u16 clkctrl_offs); }; -extern int cm_split_idlest_reg(void __iomem *idlest_reg, s16 *prcm_inst, +extern int cm_split_idlest_reg(struct clk_omap_reg *idlest_reg, s16 *prcm_inst, u8 *idlest_reg_id); int omap_cm_wait_module_ready(u8 part, s16 prcm_mod, u16 idlest_reg, u8 idlest_shift); diff --git a/arch/arm/mach-omap2/cm2xxx.c b/arch/arm/mach-omap2/cm2xxx.c index 3e5fd3587eb1..cd90b4c6a06b 100644 --- a/arch/arm/mach-omap2/cm2xxx.c +++ b/arch/arm/mach-omap2/cm2xxx.c @@ -204,7 +204,7 @@ void omap2xxx_cm_apll96_disable(void) * XXX This function is only needed until absolute register addresses are * removed from the OMAP struct clk records. */ -static int omap2xxx_cm_split_idlest_reg(void __iomem *idlest_reg, +static int omap2xxx_cm_split_idlest_reg(struct clk_omap_reg *idlest_reg, s16 *prcm_inst, u8 *idlest_reg_id) { @@ -212,10 +212,7 @@ static int omap2xxx_cm_split_idlest_reg(void __iomem *idlest_reg, u8 idlest_offs; int i; - if (idlest_reg < cm_base || idlest_reg > (cm_base + 0x0fff)) - return -EINVAL; - - idlest_offs = (unsigned long)idlest_reg & 0xff; + idlest_offs = idlest_reg->offset & 0xff; for (i = 0; i < ARRAY_SIZE(omap2xxx_cm_idlest_offs); i++) { if (idlest_offs == omap2xxx_cm_idlest_offs[i]) { *idlest_reg_id = i + 1; @@ -226,7 +223,7 @@ static int omap2xxx_cm_split_idlest_reg(void __iomem *idlest_reg, if (i == ARRAY_SIZE(omap2xxx_cm_idlest_offs)) return -EINVAL; - offs = idlest_reg - cm_base; + offs = idlest_reg->offset; offs &= 0xff00; *prcm_inst = offs; diff --git a/arch/arm/mach-omap2/cm3xxx.c b/arch/arm/mach-omap2/cm3xxx.c index d91ae8206d1e..55b046a719dc 100644 --- a/arch/arm/mach-omap2/cm3xxx.c +++ b/arch/arm/mach-omap2/cm3xxx.c @@ -118,7 +118,7 @@ static int omap3xxx_cm_wait_module_ready(u8 part, s16 prcm_mod, u16 idlest_id, * XXX This function is only needed until absolute register addresses are * removed from the OMAP struct clk records. */ -static int omap3xxx_cm_split_idlest_reg(void __iomem *idlest_reg, +static int omap3xxx_cm_split_idlest_reg(struct clk_omap_reg *idlest_reg, s16 *prcm_inst, u8 *idlest_reg_id) { @@ -126,11 +126,7 @@ static int omap3xxx_cm_split_idlest_reg(void __iomem *idlest_reg, u8 idlest_offs; int i; - if (idlest_reg < (cm_base + OMAP3430_IVA2_MOD) || - idlest_reg > (cm_base + 0x1ffff)) - return -EINVAL; - - idlest_offs = (unsigned long)idlest_reg & 0xff; + idlest_offs = idlest_reg->offset & 0xff; for (i = 0; i < ARRAY_SIZE(omap3xxx_cm_idlest_offs); i++) { if (idlest_offs == omap3xxx_cm_idlest_offs[i]) { *idlest_reg_id = i + 1; @@ -141,7 +137,7 @@ static int omap3xxx_cm_split_idlest_reg(void __iomem *idlest_reg, if (i == ARRAY_SIZE(omap3xxx_cm_idlest_offs)) return -EINVAL; - offs = idlest_reg - cm_base; + offs = idlest_reg->offset; offs &= 0xff00; *prcm_inst = offs; diff --git a/arch/arm/mach-omap2/cm_common.c b/arch/arm/mach-omap2/cm_common.c index 23e8bcec34e3..bbe41f4c9dc8 100644 --- a/arch/arm/mach-omap2/cm_common.c +++ b/arch/arm/mach-omap2/cm_common.c @@ -65,7 +65,7 @@ void __init omap2_set_globals_cm(void __iomem *cm, void __iomem *cm2) * or 0 upon success. XXX This function is only needed until absolute * register addresses are removed from the OMAP struct clk records. 
*/ -int cm_split_idlest_reg(void __iomem *idlest_reg, s16 *prcm_inst, +int cm_split_idlest_reg(struct clk_omap_reg *idlest_reg, s16 *prcm_inst, u8 *idlest_reg_id) { if (!cm_ll_data->split_idlest_reg) { diff --git a/drivers/clk/ti/apll.c b/drivers/clk/ti/apll.c index 5cba28c6ab35..06f486b3488c 100644 --- a/drivers/clk/ti/apll.c +++ b/drivers/clk/ti/apll.c @@ -55,20 +55,20 @@ static int dra7_apll_enable(struct clk_hw *hw) state <<= __ffs(ad->idlest_mask); /* Check is already locked */ - v = ti_clk_ll_ops->clk_readl(ad->idlest_reg); + v = ti_clk_ll_ops->clk_readl(&ad->idlest_reg); if ((v & ad->idlest_mask) == state) return r; - v = ti_clk_ll_ops->clk_readl(ad->control_reg); + v = ti_clk_ll_ops->clk_readl(&ad->control_reg); v &= ~ad->enable_mask; v |= APLL_FORCE_LOCK << __ffs(ad->enable_mask); - ti_clk_ll_ops->clk_writel(v, ad->control_reg); + ti_clk_ll_ops->clk_writel(v, &ad->control_reg); state <<= __ffs(ad->idlest_mask); while (1) { - v = ti_clk_ll_ops->clk_readl(ad->idlest_reg); + v = ti_clk_ll_ops->clk_readl(&ad->idlest_reg); if ((v & ad->idlest_mask) == state) break; if (i > MAX_APLL_WAIT_TRIES) @@ -99,10 +99,10 @@ static void dra7_apll_disable(struct clk_hw *hw) state <<= __ffs(ad->idlest_mask); - v = ti_clk_ll_ops->clk_readl(ad->control_reg); + v = ti_clk_ll_ops->clk_readl(&ad->control_reg); v &= ~ad->enable_mask; v |= APLL_AUTO_IDLE << __ffs(ad->enable_mask); - ti_clk_ll_ops->clk_writel(v, ad->control_reg); + ti_clk_ll_ops->clk_writel(v, &ad->control_reg); } static int dra7_apll_is_enabled(struct clk_hw *hw) @@ -113,7 +113,7 @@ static int dra7_apll_is_enabled(struct clk_hw *hw) ad = clk->dpll_data; - v = ti_clk_ll_ops->clk_readl(ad->control_reg); + v = ti_clk_ll_ops->clk_readl(&ad->control_reg); v &= ad->enable_mask; v >>= __ffs(ad->enable_mask); @@ -185,6 +185,7 @@ static void __init of_dra7_apll_setup(struct device_node *node) struct clk_hw_omap *clk_hw = NULL; struct clk_init_data *init = NULL; const char **parent_names = NULL; + int ret; ad = kzalloc(sizeof(*ad), GFP_KERNEL); clk_hw = kzalloc(sizeof(*clk_hw), GFP_KERNEL); @@ -212,10 +213,10 @@ static void __init of_dra7_apll_setup(struct device_node *node) init->parent_names = parent_names; - ad->control_reg = ti_clk_get_reg_addr(node, 0); - ad->idlest_reg = ti_clk_get_reg_addr(node, 1); + ret = ti_clk_get_reg_addr(node, 0, &ad->control_reg); + ret |= ti_clk_get_reg_addr(node, 1, &ad->idlest_reg); - if (IS_ERR(ad->control_reg) || IS_ERR(ad->idlest_reg)) + if (ret) goto cleanup; ad->idlest_mask = 0x1; @@ -241,7 +242,7 @@ static int omap2_apll_is_enabled(struct clk_hw *hw) struct dpll_data *ad = clk->dpll_data; u32 v; - v = ti_clk_ll_ops->clk_readl(ad->control_reg); + v = ti_clk_ll_ops->clk_readl(&ad->control_reg); v &= ad->enable_mask; v >>= __ffs(ad->enable_mask); @@ -267,13 +268,13 @@ static int omap2_apll_enable(struct clk_hw *hw) u32 v; int i = 0; - v = ti_clk_ll_ops->clk_readl(ad->control_reg); + v = ti_clk_ll_ops->clk_readl(&ad->control_reg); v &= ~ad->enable_mask; v |= OMAP2_EN_APLL_LOCKED << __ffs(ad->enable_mask); - ti_clk_ll_ops->clk_writel(v, ad->control_reg); + ti_clk_ll_ops->clk_writel(v, &ad->control_reg); while (1) { - v = ti_clk_ll_ops->clk_readl(ad->idlest_reg); + v = ti_clk_ll_ops->clk_readl(&ad->idlest_reg); if (v & ad->idlest_mask) break; if (i > MAX_APLL_WAIT_TRIES) @@ -297,10 +298,10 @@ static void omap2_apll_disable(struct clk_hw *hw) struct dpll_data *ad = clk->dpll_data; u32 v; - v = ti_clk_ll_ops->clk_readl(ad->control_reg); + v = ti_clk_ll_ops->clk_readl(&ad->control_reg); v &= ~ad->enable_mask; v |= 
OMAP2_EN_APLL_STOPPED << __ffs(ad->enable_mask); - ti_clk_ll_ops->clk_writel(v, ad->control_reg); + ti_clk_ll_ops->clk_writel(v, &ad->control_reg); } static struct clk_ops omap2_apll_ops = { @@ -315,10 +316,10 @@ static void omap2_apll_set_autoidle(struct clk_hw_omap *clk, u32 val) struct dpll_data *ad = clk->dpll_data; u32 v; - v = ti_clk_ll_ops->clk_readl(ad->autoidle_reg); + v = ti_clk_ll_ops->clk_readl(&ad->autoidle_reg); v &= ~ad->autoidle_mask; v |= val << __ffs(ad->autoidle_mask); - ti_clk_ll_ops->clk_writel(v, ad->control_reg); + ti_clk_ll_ops->clk_writel(v, &ad->control_reg); } #define OMAP2_APLL_AUTOIDLE_LOW_POWER_STOP 0x3 @@ -347,6 +348,7 @@ static void __init of_omap2_apll_setup(struct device_node *node) struct clk *clk; const char *parent_name; u32 val; + int ret; ad = kzalloc(sizeof(*ad), GFP_KERNEL); clk_hw = kzalloc(sizeof(*clk_hw), GFP_KERNEL); @@ -392,12 +394,11 @@ static void __init of_omap2_apll_setup(struct device_node *node) ad->idlest_mask = 1 << val; - ad->control_reg = ti_clk_get_reg_addr(node, 0); - ad->autoidle_reg = ti_clk_get_reg_addr(node, 1); - ad->idlest_reg = ti_clk_get_reg_addr(node, 2); + ret = ti_clk_get_reg_addr(node, 0, &ad->control_reg); + ret |= ti_clk_get_reg_addr(node, 1, &ad->autoidle_reg); + ret |= ti_clk_get_reg_addr(node, 2, &ad->idlest_reg); - if (IS_ERR(ad->control_reg) || IS_ERR(ad->autoidle_reg) || - IS_ERR(ad->idlest_reg)) + if (ret) goto cleanup; clk = clk_register(NULL, &clk_hw->hw); diff --git a/drivers/clk/ti/autoidle.c b/drivers/clk/ti/autoidle.c index 345af43465f0..7bb9afbe4058 100644 --- a/drivers/clk/ti/autoidle.c +++ b/drivers/clk/ti/autoidle.c @@ -25,7 +25,7 @@ #include "clock.h" struct clk_ti_autoidle { - void __iomem *reg; + struct clk_omap_reg reg; u8 shift; u8 flags; const char *name; @@ -73,28 +73,28 @@ static void _allow_autoidle(struct clk_ti_autoidle *clk) { u32 val; - val = ti_clk_ll_ops->clk_readl(clk->reg); + val = ti_clk_ll_ops->clk_readl(&clk->reg); if (clk->flags & AUTOIDLE_LOW) val &= ~(1 << clk->shift); else val |= (1 << clk->shift); - ti_clk_ll_ops->clk_writel(val, clk->reg); + ti_clk_ll_ops->clk_writel(val, &clk->reg); } static void _deny_autoidle(struct clk_ti_autoidle *clk) { u32 val; - val = ti_clk_ll_ops->clk_readl(clk->reg); + val = ti_clk_ll_ops->clk_readl(&clk->reg); if (clk->flags & AUTOIDLE_LOW) val |= (1 << clk->shift); else val &= ~(1 << clk->shift); - ti_clk_ll_ops->clk_writel(val, clk->reg); + ti_clk_ll_ops->clk_writel(val, &clk->reg); } /** @@ -140,6 +140,7 @@ int __init of_ti_clk_autoidle_setup(struct device_node *node) { u32 shift; struct clk_ti_autoidle *clk; + int ret; /* Check if this clock has autoidle support or not */ if (of_property_read_u32(node, "ti,autoidle-shift", &shift)) @@ -152,11 +153,10 @@ int __init of_ti_clk_autoidle_setup(struct device_node *node) clk->shift = shift; clk->name = node->name; - clk->reg = ti_clk_get_reg_addr(node, 0); - - if (IS_ERR(clk->reg)) { + ret = ti_clk_get_reg_addr(node, 0, &clk->reg); + if (ret) { kfree(clk); - return -EINVAL; + return ret; } if (of_property_read_bool(node, "ti,invert-autoidle-bit")) diff --git a/drivers/clk/ti/clk-3xxx.c b/drivers/clk/ti/clk-3xxx.c index 11d8aa3ec186..b1251cae98b8 100644 --- a/drivers/clk/ti/clk-3xxx.c +++ b/drivers/clk/ti/clk-3xxx.c @@ -52,14 +52,13 @@ * @idlest_reg and @idlest_bit. No return value. 
*/ static void omap3430es2_clk_ssi_find_idlest(struct clk_hw_omap *clk, - void __iomem **idlest_reg, + struct clk_omap_reg *idlest_reg, u8 *idlest_bit, u8 *idlest_val) { - u32 r; - - r = (((__force u32)clk->enable_reg & ~0xf0) | 0x20); - *idlest_reg = (__force void __iomem *)r; + memcpy(idlest_reg, &clk->enable_reg, sizeof(*idlest_reg)); + idlest_reg->offset &= ~0xf0; + idlest_reg->offset |= 0x20; *idlest_bit = OMAP3430ES2_ST_SSI_IDLE_SHIFT; *idlest_val = OMAP34XX_CM_IDLEST_VAL; } @@ -85,15 +84,15 @@ const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_ssi_wait = { * default find_idlest code assumes that they are at the same * position.) No return value. */ -static void omap3430es2_clk_dss_usbhost_find_idlest(struct clk_hw_omap *clk, - void __iomem **idlest_reg, - u8 *idlest_bit, - u8 *idlest_val) +static void +omap3430es2_clk_dss_usbhost_find_idlest(struct clk_hw_omap *clk, + struct clk_omap_reg *idlest_reg, + u8 *idlest_bit, u8 *idlest_val) { - u32 r; + memcpy(idlest_reg, &clk->enable_reg, sizeof(*idlest_reg)); - r = (((__force u32)clk->enable_reg & ~0xf0) | 0x20); - *idlest_reg = (__force void __iomem *)r; + idlest_reg->offset &= ~0xf0; + idlest_reg->offset |= 0x20; /* USBHOST_IDLE has same shift */ *idlest_bit = OMAP3430ES2_ST_DSS_IDLE_SHIFT; *idlest_val = OMAP34XX_CM_IDLEST_VAL; @@ -122,15 +121,15 @@ const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_dss_usbhost_wait = { * shift from the CM_{I,F}CLKEN bit. Pass back the correct info via * @idlest_reg and @idlest_bit. No return value. */ -static void omap3430es2_clk_hsotgusb_find_idlest(struct clk_hw_omap *clk, - void __iomem **idlest_reg, - u8 *idlest_bit, - u8 *idlest_val) +static void +omap3430es2_clk_hsotgusb_find_idlest(struct clk_hw_omap *clk, + struct clk_omap_reg *idlest_reg, + u8 *idlest_bit, + u8 *idlest_val) { - u32 r; - - r = (((__force u32)clk->enable_reg & ~0xf0) | 0x20); - *idlest_reg = (__force void __iomem *)r; + memcpy(idlest_reg, &clk->enable_reg, sizeof(*idlest_reg)); + idlest_reg->offset &= ~0xf0; + idlest_reg->offset |= 0x20; *idlest_bit = OMAP3430ES2_ST_HSOTGUSB_IDLE_SHIFT; *idlest_val = OMAP34XX_CM_IDLEST_VAL; } @@ -154,11 +153,11 @@ const struct clk_hw_omap_ops clkhwops_omap3430es2_iclk_hsotgusb_wait = { * bit. A value of 1 indicates that clock is enabled. */ static void am35xx_clk_find_idlest(struct clk_hw_omap *clk, - void __iomem **idlest_reg, + struct clk_omap_reg *idlest_reg, u8 *idlest_bit, u8 *idlest_val) { - *idlest_reg = (__force void __iomem *)(clk->enable_reg); + memcpy(idlest_reg, &clk->enable_reg, sizeof(*idlest_reg)); *idlest_bit = clk->enable_bit + AM35XX_IPSS_ICK_EN_ACK_OFFSET; *idlest_val = AM35XX_IPSS_CLK_IDLEST_VAL; } @@ -178,10 +177,10 @@ static void am35xx_clk_find_idlest(struct clk_hw_omap *clk, * avoid this issue, and remove the casts. No return value. */ static void am35xx_clk_find_companion(struct clk_hw_omap *clk, - void __iomem **other_reg, + struct clk_omap_reg *other_reg, u8 *other_bit) { - *other_reg = (__force void __iomem *)(clk->enable_reg); + memcpy(other_reg, &clk->enable_reg, sizeof(*other_reg)); if (clk->enable_bit & AM35XX_IPSS_ICK_MASK) *other_bit = clk->enable_bit + AM35XX_IPSS_ICK_FCK_OFFSET; else @@ -205,14 +204,14 @@ const struct clk_hw_omap_ops clkhwops_am35xx_ipss_module_wait = { * and @idlest_bit. No return value. 
*/ static void am35xx_clk_ipss_find_idlest(struct clk_hw_omap *clk, - void __iomem **idlest_reg, + struct clk_omap_reg *idlest_reg, u8 *idlest_bit, u8 *idlest_val) { - u32 r; + memcpy(idlest_reg, &clk->enable_reg, sizeof(*idlest_reg)); - r = (((__force u32)clk->enable_reg & ~0xf0) | 0x20); - *idlest_reg = (__force void __iomem *)r; + idlest_reg->offset &= ~0xf0; + idlest_reg->offset |= 0x20; *idlest_bit = AM35XX_ST_IPSS_SHIFT; *idlest_val = OMAP34XX_CM_IDLEST_VAL; } diff --git a/drivers/clk/ti/clk.c b/drivers/clk/ti/clk.c index 024123f421d6..ddbad7e8d7c9 100644 --- a/drivers/clk/ti/clk.c +++ b/drivers/clk/ti/clk.c @@ -43,27 +43,29 @@ struct clk_iomap { static struct clk_iomap *clk_memmaps[CLK_MAX_MEMMAPS]; -static void clk_memmap_writel(u32 val, void __iomem *reg) +static void clk_memmap_writel(u32 val, const struct clk_omap_reg *reg) { - struct clk_omap_reg *r = (struct clk_omap_reg *)® - struct clk_iomap *io = clk_memmaps[r->index]; + struct clk_iomap *io = clk_memmaps[reg->index]; - if (io->regmap) - regmap_write(io->regmap, r->offset, val); + if (reg->ptr) + writel_relaxed(val, reg->ptr); + else if (io->regmap) + regmap_write(io->regmap, reg->offset, val); else - writel_relaxed(val, io->mem + r->offset); + writel_relaxed(val, io->mem + reg->offset); } -static u32 clk_memmap_readl(void __iomem *reg) +static u32 clk_memmap_readl(const struct clk_omap_reg *reg) { u32 val; - struct clk_omap_reg *r = (struct clk_omap_reg *)® - struct clk_iomap *io = clk_memmaps[r->index]; + struct clk_iomap *io = clk_memmaps[reg->index]; - if (io->regmap) - regmap_read(io->regmap, r->offset, &val); + if (reg->ptr) + val = readl_relaxed(reg->ptr); + else if (io->regmap) + regmap_read(io->regmap, reg->offset, &val); else - val = readl_relaxed(io->mem + r->offset); + val = readl_relaxed(io->mem + reg->offset); return val; } @@ -162,20 +164,18 @@ int __init ti_clk_retry_init(struct device_node *node, struct clk_hw *hw, * ti_clk_get_reg_addr - get register address for a clock register * @node: device node for the clock * @index: register index from the clock node + * @reg: pointer to target register struct * - * Builds clock register address from device tree information. This - * is a struct of type clk_omap_reg. Returns a pointer to the register - * address, or a pointer error value in failure. + * Builds clock register address from device tree information, and returns + * the data via the provided output pointer @reg. Returns 0 on success, + * negative error value on failure. 
*/ -void __iomem *ti_clk_get_reg_addr(struct device_node *node, int index) +int ti_clk_get_reg_addr(struct device_node *node, int index, + struct clk_omap_reg *reg) { - struct clk_omap_reg *reg; u32 val; - u32 tmp; int i; - reg = (struct clk_omap_reg *)&tmp; - for (i = 0; i < CLK_MAX_MEMMAPS; i++) { if (clocks_node_ptr[i] == node->parent) break; @@ -183,19 +183,20 @@ void __iomem *ti_clk_get_reg_addr(struct device_node *node, int index) if (i == CLK_MAX_MEMMAPS) { pr_err("clk-provider not found for %s!\n", node->name); - return IOMEM_ERR_PTR(-ENOENT); + return -ENOENT; } reg->index = i; if (of_property_read_u32_index(node, "reg", index, &val)) { pr_err("%s must have reg[%d]!\n", node->name, index); - return IOMEM_ERR_PTR(-EINVAL); + return -EINVAL; } reg->offset = val; + reg->ptr = NULL; - return (__force void __iomem *)tmp; + return 0; } /** diff --git a/drivers/clk/ti/clkt_dflt.c b/drivers/clk/ti/clkt_dflt.c index c6ae563801d7..91751dd26b16 100644 --- a/drivers/clk/ti/clkt_dflt.c +++ b/drivers/clk/ti/clkt_dflt.c @@ -55,7 +55,8 @@ * elapsed. XXX Deprecated - should be moved into drivers for the * individual IP block that the IDLEST register exists in. */ -static int _wait_idlest_generic(struct clk_hw_omap *clk, void __iomem *reg, +static int _wait_idlest_generic(struct clk_hw_omap *clk, + struct clk_omap_reg *reg, u32 mask, u8 idlest, const char *name) { int i = 0, ena = 0; @@ -91,7 +92,7 @@ static int _wait_idlest_generic(struct clk_hw_omap *clk, void __iomem *reg, */ static void _omap2_module_wait_ready(struct clk_hw_omap *clk) { - void __iomem *companion_reg, *idlest_reg; + struct clk_omap_reg companion_reg, idlest_reg; u8 other_bit, idlest_bit, idlest_val, idlest_reg_id; s16 prcm_mod; int r; @@ -99,17 +100,17 @@ static void _omap2_module_wait_ready(struct clk_hw_omap *clk) /* Not all modules have multiple clocks that their IDLEST depends on */ if (clk->ops->find_companion) { clk->ops->find_companion(clk, &companion_reg, &other_bit); - if (!(ti_clk_ll_ops->clk_readl(companion_reg) & + if (!(ti_clk_ll_ops->clk_readl(&companion_reg) & (1 << other_bit))) return; } clk->ops->find_idlest(clk, &idlest_reg, &idlest_bit, &idlest_val); - r = ti_clk_ll_ops->cm_split_idlest_reg(idlest_reg, &prcm_mod, + r = ti_clk_ll_ops->cm_split_idlest_reg(&idlest_reg, &prcm_mod, &idlest_reg_id); if (r) { /* IDLEST register not in the CM module */ - _wait_idlest_generic(clk, idlest_reg, (1 << idlest_bit), + _wait_idlest_generic(clk, &idlest_reg, (1 << idlest_bit), idlest_val, clk_hw_get_name(&clk->hw)); } else { ti_clk_ll_ops->cm_wait_module_ready(0, prcm_mod, idlest_reg_id, @@ -139,17 +140,17 @@ static void _omap2_module_wait_ready(struct clk_hw_omap *clk) * avoid this issue, and remove the casts. No return value. */ void omap2_clk_dflt_find_companion(struct clk_hw_omap *clk, - void __iomem **other_reg, u8 *other_bit) + struct clk_omap_reg *other_reg, + u8 *other_bit) { - u32 r; + memcpy(other_reg, &clk->enable_reg, sizeof(*other_reg)); /* * Convert CM_ICLKEN* <-> CM_FCLKEN*. This conversion assumes * it's just a matter of XORing the bits. */ - r = ((__force u32)clk->enable_reg ^ (CM_FCLKEN ^ CM_ICLKEN)); + other_reg->offset ^= (CM_FCLKEN ^ CM_ICLKEN); - *other_reg = (__force void __iomem *)r; *other_bit = clk->enable_bit; } @@ -168,13 +169,14 @@ void omap2_clk_dflt_find_companion(struct clk_hw_omap *clk, * CM_IDLEST2). This is not true for all modules. No return value. 
*/ void omap2_clk_dflt_find_idlest(struct clk_hw_omap *clk, - void __iomem **idlest_reg, u8 *idlest_bit, + struct clk_omap_reg *idlest_reg, u8 *idlest_bit, u8 *idlest_val) { - u32 r; + memcpy(idlest_reg, &clk->enable_reg, sizeof(*idlest_reg)); + + idlest_reg->offset &= ~0xf0; + idlest_reg->offset |= 0x20; - r = (((__force u32)clk->enable_reg & ~0xf0) | 0x20); - *idlest_reg = (__force void __iomem *)r; *idlest_bit = clk->enable_bit; /* @@ -222,31 +224,19 @@ int omap2_dflt_clk_enable(struct clk_hw *hw) } } - if (IS_ERR(clk->enable_reg)) { - pr_err("%s: %s missing enable_reg\n", __func__, - clk_hw_get_name(hw)); - ret = -EINVAL; - goto err; - } - /* FIXME should not have INVERT_ENABLE bit here */ - v = ti_clk_ll_ops->clk_readl(clk->enable_reg); + v = ti_clk_ll_ops->clk_readl(&clk->enable_reg); if (clk->flags & INVERT_ENABLE) v &= ~(1 << clk->enable_bit); else v |= (1 << clk->enable_bit); - ti_clk_ll_ops->clk_writel(v, clk->enable_reg); - v = ti_clk_ll_ops->clk_readl(clk->enable_reg); /* OCP barrier */ + ti_clk_ll_ops->clk_writel(v, &clk->enable_reg); + v = ti_clk_ll_ops->clk_readl(&clk->enable_reg); /* OCP barrier */ if (clk->ops && clk->ops->find_idlest) _omap2_module_wait_ready(clk); return 0; - -err: - if (clkdm_control && clk->clkdm) - ti_clk_ll_ops->clkdm_clk_disable(clk->clkdm, hw->clk); - return ret; } /** @@ -264,22 +254,13 @@ void omap2_dflt_clk_disable(struct clk_hw *hw) u32 v; clk = to_clk_hw_omap(hw); - if (IS_ERR(clk->enable_reg)) { - /* - * 'independent' here refers to a clock which is not - * controlled by its parent. - */ - pr_err("%s: independent clock %s has no enable_reg\n", - __func__, clk_hw_get_name(hw)); - return; - } - v = ti_clk_ll_ops->clk_readl(clk->enable_reg); + v = ti_clk_ll_ops->clk_readl(&clk->enable_reg); if (clk->flags & INVERT_ENABLE) v |= (1 << clk->enable_bit); else v &= ~(1 << clk->enable_bit); - ti_clk_ll_ops->clk_writel(v, clk->enable_reg); + ti_clk_ll_ops->clk_writel(v, &clk->enable_reg); /* No OCP barrier needed here since it is a disable operation */ if (!(ti_clk_get_features()->flags & TI_CLK_DISABLE_CLKDM_CONTROL) && @@ -300,7 +281,7 @@ int omap2_dflt_clk_is_enabled(struct clk_hw *hw) struct clk_hw_omap *clk = to_clk_hw_omap(hw); u32 v; - v = ti_clk_ll_ops->clk_readl(clk->enable_reg); + v = ti_clk_ll_ops->clk_readl(&clk->enable_reg); if (clk->flags & INVERT_ENABLE) v ^= BIT(clk->enable_bit); diff --git a/drivers/clk/ti/clkt_dpll.c b/drivers/clk/ti/clkt_dpll.c index b919fdfe8256..ce98da2c10be 100644 --- a/drivers/clk/ti/clkt_dpll.c +++ b/drivers/clk/ti/clkt_dpll.c @@ -213,7 +213,7 @@ u8 omap2_init_dpll_parent(struct clk_hw *hw) if (!dd) return -EINVAL; - v = ti_clk_ll_ops->clk_readl(dd->control_reg); + v = ti_clk_ll_ops->clk_readl(&dd->control_reg); v &= dd->enable_mask; v >>= __ffs(dd->enable_mask); @@ -249,14 +249,14 @@ unsigned long omap2_get_dpll_rate(struct clk_hw_omap *clk) return 0; /* Return bypass rate if DPLL is bypassed */ - v = ti_clk_ll_ops->clk_readl(dd->control_reg); + v = ti_clk_ll_ops->clk_readl(&dd->control_reg); v &= dd->enable_mask; v >>= __ffs(dd->enable_mask); if (_omap2_dpll_is_in_bypass(v)) return clk_hw_get_rate(dd->clk_bypass); - v = ti_clk_ll_ops->clk_readl(dd->mult_div1_reg); + v = ti_clk_ll_ops->clk_readl(&dd->mult_div1_reg); dpll_mult = v & dd->mult_mask; dpll_mult >>= __ffs(dd->mult_mask); dpll_div = v & dd->div1_mask; diff --git a/drivers/clk/ti/clkt_iclk.c b/drivers/clk/ti/clkt_iclk.c index 38c36908cf88..60b583d7db33 100644 --- a/drivers/clk/ti/clkt_iclk.c +++ b/drivers/clk/ti/clkt_iclk.c @@ -31,28 +31,29 @@ void 
omap2_clkt_iclk_allow_idle(struct clk_hw_omap *clk) { u32 v; - void __iomem *r; + struct clk_omap_reg r; - r = (__force void __iomem *) - ((__force u32)clk->enable_reg ^ (CM_AUTOIDLE ^ CM_ICLKEN)); + memcpy(&r, &clk->enable_reg, sizeof(r)); + r.offset ^= (CM_AUTOIDLE ^ CM_ICLKEN); - v = ti_clk_ll_ops->clk_readl(r); + v = ti_clk_ll_ops->clk_readl(&r); v |= (1 << clk->enable_bit); - ti_clk_ll_ops->clk_writel(v, r); + ti_clk_ll_ops->clk_writel(v, &r); } /* XXX */ void omap2_clkt_iclk_deny_idle(struct clk_hw_omap *clk) { u32 v; - void __iomem *r; + struct clk_omap_reg r; - r = (__force void __iomem *) - ((__force u32)clk->enable_reg ^ (CM_AUTOIDLE ^ CM_ICLKEN)); + memcpy(&r, &clk->enable_reg, sizeof(r)); - v = ti_clk_ll_ops->clk_readl(r); + r.offset ^= (CM_AUTOIDLE ^ CM_ICLKEN); + + v = ti_clk_ll_ops->clk_readl(&r); v &= ~(1 << clk->enable_bit); - ti_clk_ll_ops->clk_writel(v, r); + ti_clk_ll_ops->clk_writel(v, &r); } /** @@ -68,14 +69,12 @@ void omap2_clkt_iclk_deny_idle(struct clk_hw_omap *clk) * modules. No return value. */ static void omap2430_clk_i2chs_find_idlest(struct clk_hw_omap *clk, - void __iomem **idlest_reg, + struct clk_omap_reg *idlest_reg, u8 *idlest_bit, u8 *idlest_val) { - u32 r; - - r = ((__force u32)clk->enable_reg ^ (OMAP24XX_CM_FCLKEN2 ^ CM_IDLEST)); - *idlest_reg = (__force void __iomem *)r; + memcpy(idlest_reg, &clk->enable_reg, sizeof(*idlest_reg)); + idlest_reg->offset ^= (OMAP24XX_CM_FCLKEN2 ^ CM_IDLEST); *idlest_bit = clk->enable_bit; *idlest_val = OMAP24XX_CM_IDLEST_VAL; } diff --git a/drivers/clk/ti/clock.h b/drivers/clk/ti/clock.h index 437ea768f837..3f7b26540be8 100644 --- a/drivers/clk/ti/clock.h +++ b/drivers/clk/ti/clock.h @@ -18,7 +18,7 @@ struct clk_omap_divider { struct clk_hw hw; - void __iomem *reg; + struct clk_omap_reg reg; u8 shift; u8 width; u8 flags; @@ -29,7 +29,7 @@ struct clk_omap_divider { struct clk_omap_mux { struct clk_hw hw; - void __iomem *reg; + struct clk_omap_reg reg; u32 *table; u32 mask; u8 shift; @@ -228,7 +228,8 @@ void ti_clk_patch_legacy_clks(struct ti_clk **patch); struct clk *ti_clk_register_clk(struct ti_clk *setup); int ti_clk_register_legacy_clks(struct ti_clk_alias *clks); -void __iomem *ti_clk_get_reg_addr(struct device_node *node, int index); +int ti_clk_get_reg_addr(struct device_node *node, int index, + struct clk_omap_reg *reg); void ti_dt_clocks_register(struct ti_dt_clk *oclks); int ti_clk_retry_init(struct device_node *node, struct clk_hw *hw, ti_of_clk_init_cb_t func); @@ -263,10 +264,10 @@ int omap2_dflt_clk_enable(struct clk_hw *hw); void omap2_dflt_clk_disable(struct clk_hw *hw); int omap2_dflt_clk_is_enabled(struct clk_hw *hw); void omap2_clk_dflt_find_companion(struct clk_hw_omap *clk, - void __iomem **other_reg, + struct clk_omap_reg *other_reg, u8 *other_bit); void omap2_clk_dflt_find_idlest(struct clk_hw_omap *clk, - void __iomem **idlest_reg, + struct clk_omap_reg *idlest_reg, u8 *idlest_bit, u8 *idlest_val); void omap2_clkt_iclk_allow_idle(struct clk_hw_omap *clk); diff --git a/drivers/clk/ti/clockdomain.c b/drivers/clk/ti/clockdomain.c index 704157d8c0b7..fbedc6a9fed0 100644 --- a/drivers/clk/ti/clockdomain.c +++ b/drivers/clk/ti/clockdomain.c @@ -52,10 +52,6 @@ int omap2_clkops_enable_clkdm(struct clk_hw *hw) return -EINVAL; } - if (unlikely(clk->enable_reg)) - pr_err("%s: %s: should use dflt_clk_enable ?!\n", __func__, - clk_hw_get_name(hw)); - if (ti_clk_get_features()->flags & TI_CLK_DISABLE_CLKDM_CONTROL) { pr_err("%s: %s: clkfw-based clockdomain control disabled ?!\n", __func__, clk_hw_get_name(hw)); 
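The hunks above all follow the one pattern this series introduces: a clock register is no longer named by a cooked __iomem pointer with the module index hidden in its low bits, but by a small (index, offset) descriptor that the low-level ops resolve at access time, so a related register can be derived with plain offset arithmetic. A standalone sketch of that idea in plain C (struct reg_desc mirrors clk_omap_reg; the offsets and names here are illustrative, not the kernel's):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    struct reg_desc {          /* mirrors clk_omap_reg */
        uint16_t offset;       /* offset within the IP module */
        uint8_t index;         /* which memory-mapped region */
    };

    #define CM_FCLKEN 0x0000   /* hypothetical register offsets */
    #define CM_ICLKEN 0x0010

    /* Derive the interface-clock register from the functional one by
     * copying the descriptor and XOR-ing the offset, as
     * omap2_clk_dflt_find_companion() above now does. */
    static void find_companion(const struct reg_desc *enable_reg,
                               struct reg_desc *other_reg)
    {
        memcpy(other_reg, enable_reg, sizeof(*other_reg));
        other_reg->offset ^= (CM_FCLKEN ^ CM_ICLKEN);
    }

    int main(void)
    {
        struct reg_desc en = { .offset = CM_FCLKEN, .index = 1 };
        struct reg_desc other;

        find_companion(&en, &other);
        printf("companion: index %u, offset 0x%04x\n",
               (unsigned int)other.index, (unsigned int)other.offset);
        return 0;
    }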
@@ -90,10 +86,6 @@ void omap2_clkops_disable_clkdm(struct clk_hw *hw) return; } - if (unlikely(clk->enable_reg)) - pr_err("%s: %s: should use dflt_clk_disable ?!\n", __func__, - clk_hw_get_name(hw)); - if (ti_clk_get_features()->flags & TI_CLK_DISABLE_CLKDM_CONTROL) { pr_err("%s: %s: clkfw-based clockdomain control disabled ?!\n", __func__, clk_hw_get_name(hw)); diff --git a/drivers/clk/ti/divider.c b/drivers/clk/ti/divider.c index 1cc0242d823f..d6dcb283b72b 100644 --- a/drivers/clk/ti/divider.c +++ b/drivers/clk/ti/divider.c @@ -100,7 +100,7 @@ static unsigned long ti_clk_divider_recalc_rate(struct clk_hw *hw, struct clk_omap_divider *divider = to_clk_omap_divider(hw); unsigned int div, val; - val = ti_clk_ll_ops->clk_readl(divider->reg) >> divider->shift; + val = ti_clk_ll_ops->clk_readl(&divider->reg) >> divider->shift; val &= div_mask(divider); div = _get_div(divider, val); @@ -257,11 +257,11 @@ static int ti_clk_divider_set_rate(struct clk_hw *hw, unsigned long rate, if (divider->flags & CLK_DIVIDER_HIWORD_MASK) { val = div_mask(divider) << (divider->shift + 16); } else { - val = ti_clk_ll_ops->clk_readl(divider->reg); + val = ti_clk_ll_ops->clk_readl(&divider->reg); val &= ~(div_mask(divider) << divider->shift); } val |= value << divider->shift; - ti_clk_ll_ops->clk_writel(val, divider->reg); + ti_clk_ll_ops->clk_writel(val, &divider->reg); return 0; } @@ -274,7 +274,8 @@ const struct clk_ops ti_clk_divider_ops = { static struct clk *_register_divider(struct device *dev, const char *name, const char *parent_name, - unsigned long flags, void __iomem *reg, + unsigned long flags, + struct clk_omap_reg *reg, u8 shift, u8 width, u8 clk_divider_flags, const struct clk_div_table *table) { @@ -303,7 +304,7 @@ static struct clk *_register_divider(struct device *dev, const char *name, init.num_parents = (parent_name ?
1 : 0); /* struct clk_divider assignments */ - div->reg = reg; + memcpy(&div->reg, reg, sizeof(*reg)); div->shift = shift; div->width = width; div->flags = clk_divider_flags; @@ -561,14 +562,15 @@ static int _get_divider_width(struct device_node *node, } static int __init ti_clk_divider_populate(struct device_node *node, - void __iomem **reg, const struct clk_div_table **table, + struct clk_omap_reg *reg, const struct clk_div_table **table, u32 *flags, u8 *div_flags, u8 *width, u8 *shift) { u32 val; + int ret; - *reg = ti_clk_get_reg_addr(node, 0); - if (IS_ERR(*reg)) - return PTR_ERR(*reg); + ret = ti_clk_get_reg_addr(node, 0, reg); + if (ret) + return ret; if (!of_property_read_u32(node, "ti,bit-shift", &val)) *shift = val; @@ -607,7 +609,7 @@ static void __init of_ti_divider_clk_setup(struct device_node *node) { struct clk *clk; const char *parent_name; - void __iomem *reg; + struct clk_omap_reg reg; u8 clk_divider_flags = 0; u8 width = 0; u8 shift = 0; @@ -620,7 +622,7 @@ static void __init of_ti_divider_clk_setup(struct device_node *node) &clk_divider_flags, &width, &shift)) goto cleanup; - clk = _register_divider(NULL, node->name, parent_name, flags, reg, + clk = _register_divider(NULL, node->name, parent_name, flags, &reg, shift, width, clk_divider_flags, table); if (!IS_ERR(clk)) { diff --git a/drivers/clk/ti/dpll.c b/drivers/clk/ti/dpll.c index 778bc90955b9..96d84888c6c5 100644 --- a/drivers/clk/ti/dpll.c +++ b/drivers/clk/ti/dpll.c @@ -203,17 +203,10 @@ cleanup: } #if defined(CONFIG_ARCH_OMAP3) && defined(CONFIG_ATAGS) -static void __iomem *_get_reg(u8 module, u16 offset) +void _get_reg(u8 module, u16 offset, struct clk_omap_reg *reg) { - u32 reg; - struct clk_omap_reg *reg_setup; - - reg_setup = (struct clk_omap_reg *)&reg; - - reg_setup->index = module; - reg_setup->offset = offset; - - return (void __iomem *)reg; + reg->index = module; + reg->offset = offset; } struct clk *ti_clk_register_dpll(struct ti_clk *setup) @@ -255,10 +248,10 @@ struct clk *ti_clk_register_dpll(struct ti_clk *setup) init.num_parents = dpll->num_parents; init.parent_names = dpll->parents; - dd->control_reg = _get_reg(dpll->module, dpll->control_reg); - dd->idlest_reg = _get_reg(dpll->module, dpll->idlest_reg); - dd->mult_div1_reg = _get_reg(dpll->module, dpll->mult_div1_reg); - dd->autoidle_reg = _get_reg(dpll->module, dpll->autoidle_reg); + _get_reg(dpll->module, dpll->control_reg, &dd->control_reg); + _get_reg(dpll->module, dpll->idlest_reg, &dd->idlest_reg); + _get_reg(dpll->module, dpll->mult_div1_reg, &dd->mult_div1_reg); + _get_reg(dpll->module, dpll->autoidle_reg, &dd->autoidle_reg); dd->modes = dpll->modes; dd->div1_mask = dpll->div1_mask; @@ -344,12 +337,9 @@ static void _register_dpll_x2(struct device_node *node, ret = of_property_count_elems_of_size(node, "reg", 1); if (ret <= 0) { hw_ops = NULL; - } else { - clk_hw->clksel_reg = ti_clk_get_reg_addr(node, 0); - if (IS_ERR(clk_hw->clksel_reg)) { - kfree(clk_hw); - return; - } + } else if (ti_clk_get_reg_addr(node, 0, &clk_hw->clksel_reg)) { + kfree(clk_hw); + return; } } @@ -412,7 +402,8 @@ static void __init of_ti_dpll_setup(struct device_node *node, init->parent_names = parent_names; - dd->control_reg = ti_clk_get_reg_addr(node, 0); + if (ti_clk_get_reg_addr(node, 0, &dd->control_reg)) + goto cleanup; /* * Special case for OMAP2 DPLL, register order is different due to * missing idlest_mask.
*/ if (!dd->idlest_mask) { - dd->mult_div1_reg = ti_clk_get_reg_addr(node, 1); + if (ti_clk_get_reg_addr(node, 1, &dd->mult_div1_reg)) + goto cleanup; #ifdef CONFIG_ARCH_OMAP2 clk_hw->ops = &clkhwops_omap2xxx_dpll; omap2xxx_clkt_dpllcore_init(&clk_hw->hw); #endif } else { - dd->idlest_reg = ti_clk_get_reg_addr(node, 1); - if (IS_ERR(dd->idlest_reg)) + if (ti_clk_get_reg_addr(node, 1, &dd->idlest_reg)) goto cleanup; - dd->mult_div1_reg = ti_clk_get_reg_addr(node, 2); + if (ti_clk_get_reg_addr(node, 2, &dd->mult_div1_reg)) + goto cleanup; } - if (IS_ERR(dd->control_reg) || IS_ERR(dd->mult_div1_reg)) - goto cleanup; - if (dd->autoidle_mask) { - dd->autoidle_reg = ti_clk_get_reg_addr(node, 3); - if (IS_ERR(dd->autoidle_reg)) + if (ti_clk_get_reg_addr(node, 3, &dd->autoidle_reg)) goto cleanup; } diff --git a/drivers/clk/ti/dpll3xxx.c b/drivers/clk/ti/dpll3xxx.c index 4cdd28a25584..4534de2ef455 100644 --- a/drivers/clk/ti/dpll3xxx.c +++ b/drivers/clk/ti/dpll3xxx.c @@ -54,10 +54,10 @@ static void _omap3_dpll_write_clken(struct clk_hw_omap *clk, u8 clken_bits) dd = clk->dpll_data; - v = ti_clk_ll_ops->clk_readl(dd->control_reg); + v = ti_clk_ll_ops->clk_readl(&dd->control_reg); v &= ~dd->enable_mask; v |= clken_bits << __ffs(dd->enable_mask); - ti_clk_ll_ops->clk_writel(v, dd->control_reg); + ti_clk_ll_ops->clk_writel(v, &dd->control_reg); } /* _omap3_wait_dpll_status: wait for a DPLL to enter a specific state */ @@ -73,7 +73,7 @@ static int _omap3_wait_dpll_status(struct clk_hw_omap *clk, u8 state) state <<= __ffs(dd->idlest_mask); - while (((ti_clk_ll_ops->clk_readl(dd->idlest_reg) & dd->idlest_mask) + while (((ti_clk_ll_ops->clk_readl(&dd->idlest_reg) & dd->idlest_mask) != state) && i < MAX_DPLL_WAIT_TRIES) { i++; udelay(1); @@ -151,7 +151,7 @@ static int _omap3_noncore_dpll_lock(struct clk_hw_omap *clk) state <<= __ffs(dd->idlest_mask); /* Check if already locked */ - if ((ti_clk_ll_ops->clk_readl(dd->idlest_reg) & dd->idlest_mask) == + if ((ti_clk_ll_ops->clk_readl(&dd->idlest_reg) & dd->idlest_mask) == state) goto done; @@ -317,14 +317,14 @@ static int omap3_noncore_dpll_program(struct clk_hw_omap *clk, u16 freqsel) * only since freqsel field is no longer present on other devices. 
*/ if (ti_clk_get_features()->flags & TI_CLK_DPLL_HAS_FREQSEL) { - v = ti_clk_ll_ops->clk_readl(dd->control_reg); + v = ti_clk_ll_ops->clk_readl(&dd->control_reg); v &= ~dd->freqsel_mask; v |= freqsel << __ffs(dd->freqsel_mask); - ti_clk_ll_ops->clk_writel(v, dd->control_reg); + ti_clk_ll_ops->clk_writel(v, &dd->control_reg); } /* Set DPLL multiplier, divider */ - v = ti_clk_ll_ops->clk_readl(dd->mult_div1_reg); + v = ti_clk_ll_ops->clk_readl(&dd->mult_div1_reg); /* Handle Duty Cycle Correction */ if (dd->dcc_mask) { @@ -370,11 +370,11 @@ static int omap3_noncore_dpll_program(struct clk_hw_omap *clk, u16 freqsel) } } - ti_clk_ll_ops->clk_writel(v, dd->mult_div1_reg); + ti_clk_ll_ops->clk_writel(v, &dd->mult_div1_reg); /* Set 4X multiplier and low-power mode */ if (dd->m4xen_mask || dd->lpmode_mask) { - v = ti_clk_ll_ops->clk_readl(dd->control_reg); + v = ti_clk_ll_ops->clk_readl(&dd->control_reg); if (dd->m4xen_mask) { if (dd->last_rounded_m4xen) @@ -390,7 +390,7 @@ static int omap3_noncore_dpll_program(struct clk_hw_omap *clk, u16 freqsel) v &= ~dd->lpmode_mask; } - ti_clk_ll_ops->clk_writel(v, dd->control_reg); + ti_clk_ll_ops->clk_writel(v, &dd->control_reg); } /* We let the clock framework set the other output dividers later */ @@ -652,10 +652,10 @@ static u32 omap3_dpll_autoidle_read(struct clk_hw_omap *clk) dd = clk->dpll_data; - if (!dd->autoidle_reg) + if (!dd->autoidle_mask) return -EINVAL; - v = ti_clk_ll_ops->clk_readl(dd->autoidle_reg); + v = ti_clk_ll_ops->clk_readl(&dd->autoidle_reg); v &= dd->autoidle_mask; v >>= __ffs(dd->autoidle_mask); @@ -681,7 +681,7 @@ static void omap3_dpll_allow_idle(struct clk_hw_omap *clk) dd = clk->dpll_data; - if (!dd->autoidle_reg) + if (!dd->autoidle_mask) return; /* @@ -689,10 +689,10 @@ static void omap3_dpll_allow_idle(struct clk_hw_omap *clk) * by writing 0x5 instead of 0x1. Add some mechanism to * optionally enter this mode. */ - v = ti_clk_ll_ops->clk_readl(dd->autoidle_reg); + v = ti_clk_ll_ops->clk_readl(&dd->autoidle_reg); v &= ~dd->autoidle_mask; v |= DPLL_AUTOIDLE_LOW_POWER_STOP << __ffs(dd->autoidle_mask); - ti_clk_ll_ops->clk_writel(v, dd->autoidle_reg); + ti_clk_ll_ops->clk_writel(v, &dd->autoidle_reg); } /** @@ -711,13 +711,13 @@ static void omap3_dpll_deny_idle(struct clk_hw_omap *clk) dd = clk->dpll_data; - if (!dd->autoidle_reg) + if (!dd->autoidle_mask) return; - v = ti_clk_ll_ops->clk_readl(dd->autoidle_reg); + v = ti_clk_ll_ops->clk_readl(&dd->autoidle_reg); v &= ~dd->autoidle_mask; v |= DPLL_AUTOIDLE_DISABLE << __ffs(dd->autoidle_mask); - ti_clk_ll_ops->clk_writel(v, dd->autoidle_reg); + ti_clk_ll_ops->clk_writel(v, &dd->autoidle_reg); } /* Clock control for DPLL outputs */ @@ -773,7 +773,7 @@ unsigned long omap3_clkoutx2_recalc(struct clk_hw *hw, WARN_ON(!dd->enable_mask); - v = ti_clk_ll_ops->clk_readl(dd->control_reg) & dd->enable_mask; + v = ti_clk_ll_ops->clk_readl(&dd->control_reg) & dd->enable_mask; v >>= __ffs(dd->enable_mask); if ((v != OMAP3XXX_EN_DPLL_LOCKED) || (dd->flags & DPLL_J_TYPE)) rate = parent_rate; diff --git a/drivers/clk/ti/dpll44xx.c b/drivers/clk/ti/dpll44xx.c index 82c05b55a7be..d7a3f7ec8d77 100644 --- a/drivers/clk/ti/dpll44xx.c +++ b/drivers/clk/ti/dpll44xx.c @@ -42,17 +42,17 @@ static void omap4_dpllmx_allow_gatectrl(struct clk_hw_omap *clk) u32 v; u32 mask; - if (!clk || !clk->clksel_reg) + if (!clk) return; mask = clk->flags & CLOCK_CLKOUTX2 ? 
OMAP4430_DPLL_CLKOUTX2_GATE_CTRL_MASK : OMAP4430_DPLL_CLKOUT_GATE_CTRL_MASK; - v = ti_clk_ll_ops->clk_readl(clk->clksel_reg); + v = ti_clk_ll_ops->clk_readl(&clk->clksel_reg); /* Clear the bit to allow gatectrl */ v &= ~mask; - ti_clk_ll_ops->clk_writel(v, clk->clksel_reg); + ti_clk_ll_ops->clk_writel(v, &clk->clksel_reg); } static void omap4_dpllmx_deny_gatectrl(struct clk_hw_omap *clk) @@ -60,17 +60,17 @@ static void omap4_dpllmx_deny_gatectrl(struct clk_hw_omap *clk) u32 v; u32 mask; - if (!clk || !clk->clksel_reg) + if (!clk) return; mask = clk->flags & CLOCK_CLKOUTX2 ? OMAP4430_DPLL_CLKOUTX2_GATE_CTRL_MASK : OMAP4430_DPLL_CLKOUT_GATE_CTRL_MASK; - v = ti_clk_ll_ops->clk_readl(clk->clksel_reg); + v = ti_clk_ll_ops->clk_readl(&clk->clksel_reg); /* Set the bit to deny gatectrl */ v |= mask; - ti_clk_ll_ops->clk_writel(v, clk->clksel_reg); + ti_clk_ll_ops->clk_writel(v, &clk->clksel_reg); } const struct clk_hw_omap_ops clkhwops_omap4_dpllmx = { @@ -128,7 +128,7 @@ unsigned long omap4_dpll_regm4xen_recalc(struct clk_hw *hw, rate = omap2_get_dpll_rate(clk); /* regm4xen adds a multiplier of 4 to DPLL calculations */ - v = ti_clk_ll_ops->clk_readl(dd->control_reg); + v = ti_clk_ll_ops->clk_readl(&dd->control_reg); if (v & OMAP4430_DPLL_REGM4XEN_MASK) rate *= OMAP4430_REGM4XEN_MULT; diff --git a/drivers/clk/ti/gate.c b/drivers/clk/ti/gate.c index 77f0920e12f1..7151ec3a1b07 100644 --- a/drivers/clk/ti/gate.c +++ b/drivers/clk/ti/gate.c @@ -76,15 +76,15 @@ static int omap36xx_gate_clk_enable_with_hsdiv_restore(struct clk_hw *hw) /* Restore the dividers */ if (!ret) { - orig_v = ti_clk_ll_ops->clk_readl(parent->reg); + orig_v = ti_clk_ll_ops->clk_readl(&parent->reg); dummy_v = orig_v; /* Write any other value different from the Read value */ dummy_v ^= (1 << parent->shift); - ti_clk_ll_ops->clk_writel(dummy_v, parent->reg); + ti_clk_ll_ops->clk_writel(dummy_v, &parent->reg); /* Write the original divider */ - ti_clk_ll_ops->clk_writel(orig_v, parent->reg); + ti_clk_ll_ops->clk_writel(orig_v, &parent->reg); } return ret; @@ -92,7 +92,7 @@ static int omap36xx_gate_clk_enable_with_hsdiv_restore(struct clk_hw *hw) static struct clk *_register_gate(struct device *dev, const char *name, const char *parent_name, unsigned long flags, - void __iomem *reg, u8 bit_idx, + struct clk_omap_reg *reg, u8 bit_idx, u8 clk_gate_flags, const struct clk_ops *ops, const struct clk_hw_omap_ops *hw_ops) { @@ -109,7 +109,7 @@ static struct clk *_register_gate(struct device *dev, const char *name, init.name = name; init.ops = ops; - clk_hw->enable_reg = reg; + memcpy(&clk_hw->enable_reg, reg, sizeof(*reg)); clk_hw->enable_bit = bit_idx; clk_hw->ops = hw_ops; @@ -133,8 +133,7 @@ struct clk *ti_clk_register_gate(struct ti_clk *setup) { const struct clk_ops *ops = &omap_gate_clk_ops; const struct clk_hw_omap_ops *hw_ops = NULL; - u32 reg; - struct clk_omap_reg *reg_setup; + struct clk_omap_reg reg; u32 flags = 0; u8 clk_gate_flags = 0; struct ti_clk_gate *gate; @@ -144,8 +143,6 @@ struct clk *ti_clk_register_gate(struct ti_clk *setup) if (gate->flags & CLKF_INTERFACE) return ti_clk_register_interface(setup); - reg_setup = (struct clk_omap_reg *)&reg; - if (gate->flags & CLKF_SET_RATE_PARENT) flags |= CLK_SET_RATE_PARENT; @@ -169,11 +166,12 @@ struct clk *ti_clk_register_gate(struct ti_clk *setup) if (gate->flags & CLKF_AM35XX) hw_ops = &clkhwops_am35xx_ipss_module_wait; - reg_setup->index = gate->module; - reg_setup->offset = gate->reg; + reg.index = gate->module; + reg.offset = gate->reg; + reg.ptr = NULL; return
_register_gate(NULL, setup->name, gate->parent, flags, - (void __iomem *)reg, gate->bit_shift, + &reg, gate->bit_shift, clk_gate_flags, ops, hw_ops); } @@ -214,15 +212,14 @@ static void __init _of_ti_gate_clk_setup(struct device_node *node, { struct clk *clk; const char *parent_name; - void __iomem *reg = NULL; + struct clk_omap_reg reg; u8 enable_bit = 0; u32 val; u32 flags = 0; u8 clk_gate_flags = 0; if (ops != &omap_gate_clkdm_clk_ops) { - reg = ti_clk_get_reg_addr(node, 0); - if (IS_ERR(reg)) + if (ti_clk_get_reg_addr(node, 0, &reg)) return; if (!of_property_read_u32(node, "ti,bit-shift", &val)) @@ -242,7 +239,7 @@ static void __init _of_ti_gate_clk_setup(struct device_node *node, if (of_property_read_bool(node, "ti,set-bit-to-disable")) clk_gate_flags |= INVERT_ENABLE; - clk = _register_gate(NULL, node->name, parent_name, flags, reg, + clk = _register_gate(NULL, node->name, parent_name, flags, &reg, enable_bit, clk_gate_flags, ops, hw_ops); if (!IS_ERR(clk)) @@ -260,8 +257,7 @@ _of_ti_composite_gate_clk_setup(struct device_node *node, if (!gate) return; - gate->enable_reg = ti_clk_get_reg_addr(node, 0); - if (IS_ERR(gate->enable_reg)) + if (ti_clk_get_reg_addr(node, 0, &gate->enable_reg)) goto cleanup; of_property_read_u32(node, "ti,bit-shift", &val); diff --git a/drivers/clk/ti/interface.c b/drivers/clk/ti/interface.c index 42d9fd4f5f6a..62cf50c1e1e3 100644 --- a/drivers/clk/ti/interface.c +++ b/drivers/clk/ti/interface.c @@ -34,7 +34,7 @@ static const struct clk_ops ti_interface_clk_ops = { static struct clk *_register_interface(struct device *dev, const char *name, const char *parent_name, - void __iomem *reg, u8 bit_idx, + struct clk_omap_reg *reg, u8 bit_idx, const struct clk_hw_omap_ops *ops) { struct clk_init_data init = { NULL }; @@ -47,7 +47,7 @@ static struct clk *_register_interface(struct device *dev, const char *name, clk_hw->hw.init = &init; clk_hw->ops = ops; - clk_hw->enable_reg = reg; + memcpy(&clk_hw->enable_reg, reg, sizeof(*reg)); clk_hw->enable_bit = bit_idx; init.name = name; @@ -71,14 +71,13 @@ static struct clk *_register_interface(struct device *dev, const char *name, struct clk *ti_clk_register_interface(struct ti_clk *setup) { const struct clk_hw_omap_ops *ops = &clkhwops_iclk_wait; - u32 reg; - struct clk_omap_reg *reg_setup; + struct clk_omap_reg reg; struct ti_clk_gate *gate; gate = setup->data; - reg_setup = (struct clk_omap_reg *)&reg; - reg_setup->index = gate->module; - reg_setup->offset = gate->reg; + reg.index = gate->module; + reg.offset = gate->reg; + reg.ptr = NULL; if (gate->flags & CLKF_NO_WAIT) ops = &clkhwops_iclk; @@ -96,7 +95,7 @@ struct clk *ti_clk_register_interface(struct ti_clk *setup) ops = &clkhwops_am35xx_ipss_wait; return _register_interface(NULL, setup->name, gate->parent, - (void __iomem *)reg, gate->bit_shift, ops); + &reg, gate->bit_shift, ops); } #endif @@ -105,12 +104,11 @@ static void __init _of_ti_interface_clk_setup(struct device_node *node, { struct clk *clk; const char *parent_name; - void __iomem *reg; + struct clk_omap_reg reg; u8 enable_bit = 0; u32 val; - reg = ti_clk_get_reg_addr(node, 0); - if (IS_ERR(reg)) + if (ti_clk_get_reg_addr(node, 0, &reg)) return; if (!of_property_read_u32(node, "ti,bit-shift", &val)) @@ -122,7 +120,7 @@ static void __init _of_ti_interface_clk_setup(struct device_node *node, return; } - clk = _register_interface(NULL, node->name, parent_name, reg, + clk = _register_interface(NULL, node->name, parent_name, &reg, enable_bit, ops); if (!IS_ERR(clk)) diff --git a/drivers/clk/ti/mux.c b/drivers/clk/ti/mux.c index
daa2dee6bafe..18c267b38461 100644 --- a/drivers/clk/ti/mux.c +++ b/drivers/clk/ti/mux.c @@ -39,7 +39,7 @@ static u8 ti_clk_mux_get_parent(struct clk_hw *hw) * OTOH, pmd_trace_clk_mux_ck uses a separate bit for each clock, so * val = 0x4 really means "bit 2, index starts at bit 0" */ - val = ti_clk_ll_ops->clk_readl(mux->reg) >> mux->shift; + val = ti_clk_ll_ops->clk_readl(&mux->reg) >> mux->shift; val &= mux->mask; if (mux->table) { @@ -81,11 +81,11 @@ static int ti_clk_mux_set_parent(struct clk_hw *hw, u8 index) if (mux->flags & CLK_MUX_HIWORD_MASK) { val = mux->mask << (mux->shift + 16); } else { - val = ti_clk_ll_ops->clk_readl(mux->reg); + val = ti_clk_ll_ops->clk_readl(&mux->reg); val &= ~(mux->mask << mux->shift); } val |= index << mux->shift; - ti_clk_ll_ops->clk_writel(val, mux->reg); + ti_clk_ll_ops->clk_writel(val, &mux->reg); return 0; } @@ -99,7 +99,7 @@ const struct clk_ops ti_clk_mux_ops = { static struct clk *_register_mux(struct device *dev, const char *name, const char * const *parent_names, u8 num_parents, unsigned long flags, - void __iomem *reg, u8 shift, u32 mask, + struct clk_omap_reg *reg, u8 shift, u32 mask, u8 clk_mux_flags, u32 *table) { struct clk_omap_mux *mux; @@ -120,7 +120,7 @@ static struct clk *_register_mux(struct device *dev, const char *name, init.num_parents = num_parents; /* struct clk_mux assignments */ - mux->reg = reg; + memcpy(&mux->reg, reg, sizeof(*reg)); mux->shift = shift; mux->mask = mask; mux->flags = clk_mux_flags; @@ -140,12 +140,9 @@ struct clk *ti_clk_register_mux(struct ti_clk *setup) struct ti_clk_mux *mux; u32 flags; u8 mux_flags = 0; - struct clk_omap_reg *reg_setup; - u32 reg; + struct clk_omap_reg reg; u32 mask; - reg_setup = (struct clk_omap_reg *)&reg; - mux = setup->data; flags = CLK_SET_RATE_NO_REPARENT; @@ -154,8 +151,9 @@ struct clk *ti_clk_register_mux(struct ti_clk *setup) mask--; mask = (1 << fls(mask)) - 1; - reg_setup->index = mux->module; - reg_setup->offset = mux->reg; + reg.index = mux->module; + reg.offset = mux->reg; + reg.ptr = NULL; if (mux->flags & CLKF_INDEX_STARTS_AT_ONE) mux_flags |= CLK_MUX_INDEX_ONE; @@ -164,7 +162,7 @@ struct clk *ti_clk_register_mux(struct ti_clk *setup) flags |= CLK_SET_RATE_PARENT; return _register_mux(NULL, setup->name, mux->parents, mux->num_parents, - flags, (void __iomem *)reg, mux->bit_shift, mask, + flags, &reg, mux->bit_shift, mask, mux_flags, NULL); } @@ -177,7 +175,7 @@ struct clk *ti_clk_register_mux(struct ti_clk *setup) static void of_mux_clk_setup(struct device_node *node) { struct clk *clk; - void __iomem *reg; + struct clk_omap_reg reg; unsigned int num_parents; const char **parent_names; u8 clk_mux_flags = 0; @@ -196,9 +194,7 @@ static void of_mux_clk_setup(struct device_node *node) of_clk_parent_fill(node, parent_names, num_parents); - reg = ti_clk_get_reg_addr(node, 0); - - if (IS_ERR(reg)) + if (ti_clk_get_reg_addr(node, 0, &reg)) goto cleanup; of_property_read_u32(node, "ti,bit-shift", &shift); @@ -217,7 +213,7 @@ static void of_mux_clk_setup(struct device_node *node) mask = (1 << fls(mask)) - 1; clk = _register_mux(NULL, node->name, parent_names, num_parents, - flags, reg, shift, mask, clk_mux_flags, NULL); + flags, &reg, shift, mask, clk_mux_flags, NULL); if (!IS_ERR(clk)) of_clk_add_provider(node, of_clk_src_simple_get, clk); @@ -230,7 +226,6 @@ CLK_OF_DECLARE(mux_clk, "ti,mux-clock", of_mux_clk_setup); struct clk_hw *ti_clk_build_component_mux(struct ti_clk_mux *setup) { struct clk_omap_mux *mux; - struct clk_omap_reg *reg; int num_parents; if (!setup) @@ -240,12 +235,10 @@
struct clk_hw *ti_clk_build_component_mux(struct ti_clk_mux *setup) if (!mux) return ERR_PTR(-ENOMEM); - reg = (struct clk_omap_reg *)&mux->reg; - mux->shift = setup->bit_shift; - reg->index = setup->module; - reg->offset = setup->reg; + mux->reg.index = setup->module; + mux->reg.offset = setup->reg; if (setup->flags & CLKF_INDEX_STARTS_AT_ONE) mux->flags |= CLK_MUX_INDEX_ONE; @@ -268,9 +261,7 @@ static void __init of_ti_composite_mux_clk_setup(struct device_node *node) if (!mux) return; - mux->reg = ti_clk_get_reg_addr(node, 0); - - if (IS_ERR(mux->reg)) + if (ti_clk_get_reg_addr(node, 0, &mux->reg)) goto cleanup; if (!of_property_read_u32(node, "ti,bit-shift", &val)) diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index affdabd0b6a1..d18da839b810 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -18,6 +18,18 @@ #include #include +/** + * struct clk_omap_reg - OMAP register declaration + * @offset: offset from the master IP module base address + * @index: index of the master IP module + */ +struct clk_omap_reg { + void __iomem *ptr; + u16 offset; + u8 index; + u8 flags; +}; + /** * struct dpll_data - DPLL registers and integration data * @mult_div1_reg: register containing the DPLL M and N bitfields @@ -67,12 +79,12 @@ * can be placed into read-only space. */ struct dpll_data { - void __iomem *mult_div1_reg; + struct clk_omap_reg mult_div1_reg; u32 mult_mask; u32 div1_mask; struct clk_hw *clk_bypass; struct clk_hw *clk_ref; - void __iomem *control_reg; + struct clk_omap_reg control_reg; u32 enable_mask; unsigned long last_rounded_rate; u16 last_rounded_m; @@ -84,8 +96,8 @@ struct dpll_data { u16 max_divider; unsigned long max_rate; u8 modes; - void __iomem *autoidle_reg; - void __iomem *idlest_reg; + struct clk_omap_reg autoidle_reg; + struct clk_omap_reg idlest_reg; u32 autoidle_mask; u32 freqsel_mask; u32 idlest_mask; @@ -113,10 +125,10 @@ struct clk_hw_omap; */ struct clk_hw_omap_ops { void (*find_idlest)(struct clk_hw_omap *oclk, - void __iomem **idlest_reg, + struct clk_omap_reg *idlest_reg, u8 *idlest_bit, u8 *idlest_val); void (*find_companion)(struct clk_hw_omap *oclk, - void __iomem **other_reg, + struct clk_omap_reg *other_reg, u8 *other_bit); void (*allow_idle)(struct clk_hw_omap *oclk); void (*deny_idle)(struct clk_hw_omap *oclk); @@ -139,10 +151,10 @@ struct clk_hw_omap { struct list_head node; unsigned long fixed_rate; u8 fixed_div; - void __iomem *enable_reg; + struct clk_omap_reg enable_reg; u8 enable_bit; u8 flags; - void __iomem *clksel_reg; + struct clk_omap_reg clksel_reg; struct dpll_data *dpll_data; const char *clkdm_name; struct clockdomain *clkdm; @@ -195,16 +207,6 @@ enum { CLK_MAX_MEMMAPS }; -/** - * struct clk_omap_reg - OMAP register declaration - * @offset: offset from the master IP module base address - * @index: index of the master IP module - */ -struct clk_omap_reg { - u16 offset; - u16 index; -}; - /** * struct ti_clk_ll_ops - low-level ops for clocks * @clk_readl: pointer to register read function @@ -222,16 +224,16 @@ struct clk_omap_reg { * operations not provided directly by clock drivers. 
*/ struct ti_clk_ll_ops { - u32 (*clk_readl)(void __iomem *reg); - void (*clk_writel)(u32 val, void __iomem *reg); + u32 (*clk_readl)(const struct clk_omap_reg *reg); + void (*clk_writel)(u32 val, const struct clk_omap_reg *reg); int (*clkdm_clk_enable)(struct clockdomain *clkdm, struct clk *clk); int (*clkdm_clk_disable)(struct clockdomain *clkdm, struct clk *clk); struct clockdomain * (*clkdm_lookup)(const char *name); int (*cm_wait_module_ready)(u8 part, s16 prcm_mod, u16 idlest_reg, u8 idlest_shift); - int (*cm_split_idlest_reg)(void __iomem *idlest_reg, s16 *prcm_inst, - u8 *idlest_reg_id); + int (*cm_split_idlest_reg)(struct clk_omap_reg *idlest_reg, + s16 *prcm_inst, u8 *idlest_reg_id); }; #define to_clk_hw_omap(_hw) container_of(_hw, struct clk_hw_omap, hw) -- cgit v1.2.3 From 442c609830e98919faa78b797e9b89c53bab9cbf Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Mon, 6 Mar 2017 06:19:44 +0100 Subject: leds: core: add OF variants of LED registering functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These new functions allow passing an additional device_node argument that will be internally set for the created LED device. Thanks to this, LED core code and triggers will be able to access the DT node for reading extra info. The easiest solution for achieving this was reworking the old functions into more generic ones & adding simple defines for API compatibility. Signed-off-by: Rafał Miłecki Acked-by: Pavel Machek Signed-off-by: Jacek Anaszewski --- drivers/leds/led-class.c | 26 ++++++++++++++++---------- include/linux/leds.h | 14 ++++++++++---- 2 files changed, 26 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index f2b0a80a62b4..b0e2d55acbd6 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -244,11 +244,14 @@ static int led_classdev_next_name(const char *init_name, char *name, } /** - * led_classdev_register - register a new object of led_classdev class. - * @parent: The device to register. + * of_led_classdev_register - register a new object of led_classdev class. + * + * @parent: parent of LED device * @led_cdev: the led_classdev structure for this device. + * @np: DT node describing this LED */ -int led_classdev_register(struct device *parent, struct led_classdev *led_cdev) +int of_led_classdev_register(struct device *parent, struct device_node *np, + struct led_classdev *led_cdev) { char name[LED_MAX_NAME_SIZE]; int ret; @@ -261,6 +264,7 @@ int led_classdev_register(struct device *parent, struct led_classdev *led_cdev) led_cdev, led_cdev->groups, "%s", name); if (IS_ERR(led_cdev->dev)) return PTR_ERR(led_cdev->dev); + led_cdev->dev->of_node = np; if (ret) dev_warn(parent, "Led %s renamed to %s due to name collision", @@ -303,7 +307,7 @@ int led_classdev_register(struct device *parent, struct led_classdev *led_cdev) return 0; } -EXPORT_SYMBOL_GPL(led_classdev_register); +EXPORT_SYMBOL_GPL(of_led_classdev_register); /** * led_classdev_unregister - unregisters a object of led_properties class. @@ -348,12 +352,14 @@ static void devm_led_classdev_release(struct device *dev, void *res) } /** - * devm_led_classdev_register - resource managed led_classdev_register() - * @parent: The device to register. + * devm_of_led_classdev_register - resource managed led_classdev_register() + * + * @parent: parent of LED device * @led_cdev: the led_classdev structure for this device.
*/ -int devm_led_classdev_register(struct device *parent, - struct led_classdev *led_cdev) +int devm_of_led_classdev_register(struct device *parent, + struct device_node *np, + struct led_classdev *led_cdev) { struct led_classdev **dr; int rc; @@ -362,7 +368,7 @@ int devm_led_classdev_register(struct device *parent, if (!dr) return -ENOMEM; - rc = led_classdev_register(parent, led_cdev); + rc = of_led_classdev_register(parent, np, led_cdev); if (rc) { devres_free(dr); return rc; } @@ -373,7 +379,7 @@ int devm_led_classdev_register(struct device *parent, return 0; } -EXPORT_SYMBOL_GPL(devm_led_classdev_register); +EXPORT_SYMBOL_GPL(devm_of_led_classdev_register); static int devm_led_classdev_match(struct device *dev, void *res, void *data) { diff --git a/include/linux/leds.h b/include/linux/leds.h index 38c0bd7ca107..64c56d454f7d 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -122,10 +122,16 @@ struct led_classdev { struct mutex led_access; }; -extern int led_classdev_register(struct device *parent, - struct led_classdev *led_cdev); -extern int devm_led_classdev_register(struct device *parent, - struct led_classdev *led_cdev); +extern int of_led_classdev_register(struct device *parent, + struct device_node *np, + struct led_classdev *led_cdev); +#define led_classdev_register(parent, led_cdev) \ + of_led_classdev_register(parent, NULL, led_cdev) +extern int devm_of_led_classdev_register(struct device *parent, + struct device_node *np, + struct led_classdev *led_cdev); +#define devm_led_classdev_register(parent, led_cdev) \ + devm_of_led_classdev_register(parent, NULL, led_cdev) extern void led_classdev_unregister(struct led_classdev *led_cdev); extern void devm_led_classdev_unregister(struct device *parent, struct led_classdev *led_cdev); -- cgit v1.2.3 From d1caa99055382c91b57244343020ea37c4fa4d09 Mon Sep 17 00:00:00 2001 From: Quentin Schulz Date: Tue, 13 Dec 2016 15:33:32 +0100 Subject: iio: adc: add support for Allwinner SoCs ADC The Allwinner SoCs all have an ADC that can also act as a touchscreen controller and a thermal sensor. This patch adds the ADC driver, which is based on the MFD driver for the same SoCs' ADC. This also registers the thermal ADC channel in the iio map array so iio_hwmon can use it without modifying the Device Tree, and registers the driver in the thermal framework. The thermal sensor requires the IP to be in touchscreen mode to return correct values. Therefore, if the user is continuously reading the ADC channel(s), the thermal framework in which the thermal sensor is registered will switch the IP into touchscreen mode to get a temperature value, which requires a delay of 100ms (because of the mode switching); the ADC will then switch back to ADC mode, which also requires a delay of 100ms. If the ADC readings are critical to the user and the SoC temperature is not, this driver can skip registering the thermal sensor in the thermal framework and thus speed up the ADC readings. This driver probes on three different platform_device_id entries to take into account slight differences (register bits and temperature computation) between the Allwinner SoCs' ADCs.
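For reference, the temperature reporting described above boils down to the linear mapping used by sun4i_gpadc_get_temp() in the driver below: millicelsius = (raw + temp_offset) * temp_scale, with per-SoC constants from the gpadc_data tables. A minimal sketch, using the sun4i constants from the patch and a made-up raw reading:

    #include <stdio.h>

    int main(void)
    {
        int raw = 2300;      /* hypothetical ADC reading */
        int offset = -1932;  /* sun4i_gpadc_data.temp_offset */
        int scale = 133;     /* sun4i_gpadc_data.temp_scale */

        /* millicelsius = (raw + offset) * scale */
        printf("temp = %d mC\n", (raw + offset) * scale);
        return 0;
    }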
Signed-off-by: Quentin Schulz Acked-by: Maxime Ripard Acked-by: Jonathan Cameron Acked-for-MFD-by: Lee Jones Signed-off-by: Jonathan Cameron --- drivers/iio/adc/Kconfig | 17 ++ drivers/iio/adc/Makefile | 1 + drivers/iio/adc/sun4i-gpadc-iio.c | 613 ++++++++++++++++++++++++++++++++++++++ include/linux/mfd/sun4i-gpadc.h | 2 + 4 files changed, 633 insertions(+) create mode 100644 drivers/iio/adc/sun4i-gpadc-iio.c (limited to 'include/linux') diff --git a/drivers/iio/adc/Kconfig b/drivers/iio/adc/Kconfig index d777a972586d..9f8b4b1d655b 100644 --- a/drivers/iio/adc/Kconfig +++ b/drivers/iio/adc/Kconfig @@ -559,6 +559,23 @@ config STX104 The base port addresses for the devices may be configured via the base array module parameter. +config SUN4I_GPADC + tristate "Support for the Allwinner SoCs GPADC" + depends on IIO + depends on MFD_SUN4I_GPADC + help + Say yes here to build support for Allwinner (A10, A13 and A31) SoCs + GPADC. This ADC provides 4 channels which can be used as an ADC or as + a touchscreen input and one channel for thermal sensor. + + The thermal sensor slows down ADC readings and can be disabled by + disabling CONFIG_THERMAL_OF. However, the thermal sensor should be + enabled by default since the SoC temperature is usually more critical + than ADC readings. + + To compile this driver as a module, choose M here: the module will be + called sun4i-gpadc-iio. + config TI_ADC081C tristate "Texas Instruments ADC081C/ADC101C/ADC121C family" depends on I2C diff --git a/drivers/iio/adc/Makefile b/drivers/iio/adc/Makefile index b11bb5767543..73dbe399f894 100644 --- a/drivers/iio/adc/Makefile +++ b/drivers/iio/adc/Makefile @@ -49,6 +49,7 @@ obj-$(CONFIG_RCAR_GYRO_ADC) += rcar-gyroadc.o obj-$(CONFIG_ROCKCHIP_SARADC) += rockchip_saradc.o obj-$(CONFIG_SPEAR_ADC) += spear_adc.o obj-$(CONFIG_STX104) += stx104.o +obj-$(CONFIG_SUN4I_GPADC) += sun4i-gpadc-iio.o obj-$(CONFIG_STM32_ADC_CORE) += stm32-adc-core.o obj-$(CONFIG_STM32_ADC) += stm32-adc.o obj-$(CONFIG_TI_ADC081C) += ti-adc081c.o diff --git a/drivers/iio/adc/sun4i-gpadc-iio.c b/drivers/iio/adc/sun4i-gpadc-iio.c new file mode 100644 index 000000000000..a8e134fa190d --- /dev/null +++ b/drivers/iio/adc/sun4i-gpadc-iio.c @@ -0,0 +1,613 @@ +/* ADC driver for sunxi platforms' (A10, A13 and A31) GPADC + * + * Copyright (c) 2016 Quentin Schulz + * + * This program is free software; you can redistribute it and/or modify it under + * the terms of the GNU General Public License version 2 as published by the + * Free Software Foundation. + * + * The Allwinner SoCs all have an ADC that can also act as a touchscreen + * controller and a thermal sensor. + * The thermal sensor works only when the ADC acts as a touchscreen controller + * and is configured to throw an interrupt every fixed periods of time (let say + * every X seconds). + * One would be tempted to disable the IP on the hardware side rather than + * disabling interrupts to save some power but that resets the internal clock of + * the IP, resulting in having to wait X seconds every time we want to read the + * value of the thermal sensor. + * This is also the reason of using autosuspend in pm_runtime. If there was no + * autosuspend, the thermal sensor would need X seconds after every + * pm_runtime_get_sync to get a value from the ADC. The autosuspend allows the + * thermal sensor to be requested again in a certain time span before it gets + * shutdown for not being used. 
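+ * For example (hypothetical numbers, not from this driver): with an + * autosuspend delay of 100 ms, two thermal reads issued 50 ms apart pay + * the X-second warm-up only once, whereas without autosuspend the second + * read would wait the full X seconds again.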
+ */ + +#include <linux/completion.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/of_device.h> +#include <linux/platform_device.h> +#include <linux/pm_runtime.h> +#include <linux/regmap.h> +#include <linux/thermal.h> +#include <linux/delay.h> + +#include <linux/iio/iio.h> +#include <linux/iio/driver.h> +#include <linux/iio/machine.h> +#include <linux/mfd/sun4i-gpadc.h> + +static unsigned int sun4i_gpadc_chan_select(unsigned int chan) +{ + return SUN4I_GPADC_CTRL1_ADC_CHAN_SELECT(chan); +} + +static unsigned int sun6i_gpadc_chan_select(unsigned int chan) +{ + return SUN6I_GPADC_CTRL1_ADC_CHAN_SELECT(chan); +} + +struct gpadc_data { + int temp_offset; + int temp_scale; + unsigned int tp_mode_en; + unsigned int tp_adc_select; + unsigned int (*adc_chan_select)(unsigned int chan); + unsigned int adc_chan_mask; +}; + +static const struct gpadc_data sun4i_gpadc_data = { + .temp_offset = -1932, + .temp_scale = 133, + .tp_mode_en = SUN4I_GPADC_CTRL1_TP_MODE_EN, + .tp_adc_select = SUN4I_GPADC_CTRL1_TP_ADC_SELECT, + .adc_chan_select = &sun4i_gpadc_chan_select, + .adc_chan_mask = SUN4I_GPADC_CTRL1_ADC_CHAN_MASK, +}; + +static const struct gpadc_data sun5i_gpadc_data = { + .temp_offset = -1447, + .temp_scale = 100, + .tp_mode_en = SUN4I_GPADC_CTRL1_TP_MODE_EN, + .tp_adc_select = SUN4I_GPADC_CTRL1_TP_ADC_SELECT, + .adc_chan_select = &sun4i_gpadc_chan_select, + .adc_chan_mask = SUN4I_GPADC_CTRL1_ADC_CHAN_MASK, +}; + +static const struct gpadc_data sun6i_gpadc_data = { + .temp_offset = -1623, + .temp_scale = 167, + .tp_mode_en = SUN6I_GPADC_CTRL1_TP_MODE_EN, + .tp_adc_select = SUN6I_GPADC_CTRL1_TP_ADC_SELECT, + .adc_chan_select = &sun6i_gpadc_chan_select, + .adc_chan_mask = SUN6I_GPADC_CTRL1_ADC_CHAN_MASK, +}; + +struct sun4i_gpadc_iio { + struct iio_dev *indio_dev; + struct completion completion; + int temp_data; + u32 adc_data; + struct regmap *regmap; + unsigned int fifo_data_irq; + atomic_t ignore_fifo_data_irq; + unsigned int temp_data_irq; + atomic_t ignore_temp_data_irq; + const struct gpadc_data *data; + /* prevents concurrent reads of temperature and ADC */ + struct mutex mutex; +}; + +#define SUN4I_GPADC_ADC_CHANNEL(_channel, _name) { \ + .type = IIO_VOLTAGE, \ + .indexed = 1, \ + .channel = _channel, \ + .info_mask_separate = BIT(IIO_CHAN_INFO_RAW), \ + .info_mask_shared_by_type = BIT(IIO_CHAN_INFO_SCALE), \ + .datasheet_name = _name, \ +} + +static struct iio_map sun4i_gpadc_hwmon_maps[] = { + { + .adc_channel_label = "temp_adc", + .consumer_dev_name = "iio_hwmon.0", + }, + { /* sentinel */ }, +}; + +static const struct iio_chan_spec sun4i_gpadc_channels[] = { + SUN4I_GPADC_ADC_CHANNEL(0, "adc_chan0"), + SUN4I_GPADC_ADC_CHANNEL(1, "adc_chan1"), + SUN4I_GPADC_ADC_CHANNEL(2, "adc_chan2"), + SUN4I_GPADC_ADC_CHANNEL(3, "adc_chan3"), + { + .type = IIO_TEMP, + .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | + BIT(IIO_CHAN_INFO_SCALE) | + BIT(IIO_CHAN_INFO_OFFSET), + .datasheet_name = "temp_adc", + }, +}; + +static const struct iio_chan_spec sun4i_gpadc_channels_no_temp[] = { + SUN4I_GPADC_ADC_CHANNEL(0, "adc_chan0"), + SUN4I_GPADC_ADC_CHANNEL(1, "adc_chan1"), + SUN4I_GPADC_ADC_CHANNEL(2, "adc_chan2"), + SUN4I_GPADC_ADC_CHANNEL(3, "adc_chan3"), +}; + +static int sun4i_prepare_for_irq(struct iio_dev *indio_dev, int channel, + unsigned int irq) +{ + struct sun4i_gpadc_iio *info = iio_priv(indio_dev); + int ret; + u32 reg; + + pm_runtime_get_sync(indio_dev->dev.parent); + + reinit_completion(&info->completion); + + ret = regmap_write(info->regmap, SUN4I_GPADC_INT_FIFOC, + SUN4I_GPADC_INT_FIFOC_TP_FIFO_TRIG_LEVEL(1) | + SUN4I_GPADC_INT_FIFOC_TP_FIFO_FLUSH); + if (ret) + return ret; + + ret = regmap_read(info->regmap, SUN4I_GPADC_CTRL1, &reg); + if (ret) + return ret; + + if (irq == info->fifo_data_irq) { + ret =
regmap_write(info->regmap, SUN4I_GPADC_CTRL1, + info->data->tp_mode_en | + info->data->tp_adc_select | + info->data->adc_chan_select(channel)); + /* + * When the IP changes channel, it needs a bit of time to get + * correct values. + */ + if ((reg & info->data->adc_chan_mask) != + info->data->adc_chan_select(channel)) + mdelay(10); + + } else { + /* + * The temperature sensor returns valid data only when the ADC + * operates in touchscreen mode. + */ + ret = regmap_write(info->regmap, SUN4I_GPADC_CTRL1, + info->data->tp_mode_en); + } + + if (ret) + return ret; + + /* + * When the IP changes mode between ADC or touchscreen, it + * needs a bit of time to get correct values. + */ + if ((reg & info->data->tp_adc_select) != info->data->tp_adc_select) + mdelay(100); + + return 0; +} + +static int sun4i_gpadc_read(struct iio_dev *indio_dev, int channel, int *val, + unsigned int irq) +{ + struct sun4i_gpadc_iio *info = iio_priv(indio_dev); + int ret; + + mutex_lock(&info->mutex); + + ret = sun4i_prepare_for_irq(indio_dev, channel, irq); + if (ret) + goto err; + + enable_irq(irq); + + /* + * The temperature sensor throws an interruption periodically (currently + * set at periods of ~0.6s in sun4i_gpadc_runtime_resume). A 1s delay + * makes sure an interruption occurs in normal conditions. If it doesn't + * occur, then there is a timeout. + */ + if (!wait_for_completion_timeout(&info->completion, + msecs_to_jiffies(1000))) { + ret = -ETIMEDOUT; + goto err; + } + + if (irq == info->fifo_data_irq) + *val = info->adc_data; + else + *val = info->temp_data; + + ret = 0; + pm_runtime_mark_last_busy(indio_dev->dev.parent); + +err: + pm_runtime_put_autosuspend(indio_dev->dev.parent); + mutex_unlock(&info->mutex); + + return ret; +} + +static int sun4i_gpadc_adc_read(struct iio_dev *indio_dev, int channel, + int *val) +{ + struct sun4i_gpadc_iio *info = iio_priv(indio_dev); + + return sun4i_gpadc_read(indio_dev, channel, val, info->fifo_data_irq); +} + +static int sun4i_gpadc_temp_read(struct iio_dev *indio_dev, int *val) +{ + struct sun4i_gpadc_iio *info = iio_priv(indio_dev); + + return sun4i_gpadc_read(indio_dev, 0, val, info->temp_data_irq); +} + +static int sun4i_gpadc_temp_offset(struct iio_dev *indio_dev, int *val) +{ + struct sun4i_gpadc_iio *info = iio_priv(indio_dev); + + *val = info->data->temp_offset; + + return 0; +} + +static int sun4i_gpadc_temp_scale(struct iio_dev *indio_dev, int *val) +{ + struct sun4i_gpadc_iio *info = iio_priv(indio_dev); + + *val = info->data->temp_scale; + + return 0; +} + +static int sun4i_gpadc_read_raw(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, int *val, + int *val2, long mask) +{ + int ret; + + switch (mask) { + case IIO_CHAN_INFO_OFFSET: + ret = sun4i_gpadc_temp_offset(indio_dev, val); + if (ret) + return ret; + + return IIO_VAL_INT; + case IIO_CHAN_INFO_RAW: + if (chan->type == IIO_VOLTAGE) + ret = sun4i_gpadc_adc_read(indio_dev, chan->channel, + val); + else + ret = sun4i_gpadc_temp_read(indio_dev, val); + + if (ret) + return ret; + + return IIO_VAL_INT; + case IIO_CHAN_INFO_SCALE: + if (chan->type == IIO_VOLTAGE) { + /* 3000mV / 4096 * raw */ + *val = 0; + *val2 = 732421875; + return IIO_VAL_INT_PLUS_NANO; + } + + ret = sun4i_gpadc_temp_scale(indio_dev, val); + if (ret) + return ret; + + return IIO_VAL_INT; + default: + return -EINVAL; + } + + return -EINVAL; +} + +static const struct iio_info sun4i_gpadc_iio_info = { + .read_raw = sun4i_gpadc_read_raw, + .driver_module = THIS_MODULE, +}; + +static irqreturn_t 
sun4i_gpadc_temp_data_irq_handler(int irq, void *dev_id) +{ + struct sun4i_gpadc_iio *info = dev_id; + + if (atomic_read(&info->ignore_temp_data_irq)) + goto out; + + if (!regmap_read(info->regmap, SUN4I_GPADC_TEMP_DATA, &info->temp_data)) + complete(&info->completion); + +out: + disable_irq_nosync(info->temp_data_irq); + return IRQ_HANDLED; +} + +static irqreturn_t sun4i_gpadc_fifo_data_irq_handler(int irq, void *dev_id) +{ + struct sun4i_gpadc_iio *info = dev_id; + + if (atomic_read(&info->ignore_fifo_data_irq)) + goto out; + + if (!regmap_read(info->regmap, SUN4I_GPADC_DATA, &info->adc_data)) + complete(&info->completion); + +out: + disable_irq_nosync(info->fifo_data_irq); + return IRQ_HANDLED; +} + +static int sun4i_gpadc_runtime_suspend(struct device *dev) +{ + struct sun4i_gpadc_iio *info = iio_priv(dev_get_drvdata(dev)); + + /* Disable the ADC on IP */ + regmap_write(info->regmap, SUN4I_GPADC_CTRL1, 0); + /* Disable temperature sensor on IP */ + regmap_write(info->regmap, SUN4I_GPADC_TPR, 0); + + return 0; +} + +static int sun4i_gpadc_runtime_resume(struct device *dev) +{ + struct sun4i_gpadc_iio *info = iio_priv(dev_get_drvdata(dev)); + + /* clkin = 6MHz */ + regmap_write(info->regmap, SUN4I_GPADC_CTRL0, + SUN4I_GPADC_CTRL0_ADC_CLK_DIVIDER(2) | + SUN4I_GPADC_CTRL0_FS_DIV(7) | + SUN4I_GPADC_CTRL0_T_ACQ(63)); + regmap_write(info->regmap, SUN4I_GPADC_CTRL1, info->data->tp_mode_en); + regmap_write(info->regmap, SUN4I_GPADC_CTRL3, + SUN4I_GPADC_CTRL3_FILTER_EN | + SUN4I_GPADC_CTRL3_FILTER_TYPE(1)); + /* period = SUN4I_GPADC_TPR_TEMP_PERIOD * 256 * 16 / clkin; ~0.6s */ + regmap_write(info->regmap, SUN4I_GPADC_TPR, + SUN4I_GPADC_TPR_TEMP_ENABLE | + SUN4I_GPADC_TPR_TEMP_PERIOD(800)); + + return 0; +} + +static int sun4i_gpadc_get_temp(void *data, int *temp) +{ + struct sun4i_gpadc_iio *info = (struct sun4i_gpadc_iio *)data; + int val, scale, offset; + + if (sun4i_gpadc_temp_read(info->indio_dev, &val)) + return -ETIMEDOUT; + + sun4i_gpadc_temp_scale(info->indio_dev, &scale); + sun4i_gpadc_temp_offset(info->indio_dev, &offset); + + *temp = (val + offset) * scale; + + return 0; +} + +static const struct thermal_zone_of_device_ops sun4i_ts_tz_ops = { + .get_temp = &sun4i_gpadc_get_temp, +}; + +static const struct dev_pm_ops sun4i_gpadc_pm_ops = { + .runtime_suspend = &sun4i_gpadc_runtime_suspend, + .runtime_resume = &sun4i_gpadc_runtime_resume, +}; + +static int sun4i_irq_init(struct platform_device *pdev, const char *name, + irq_handler_t handler, const char *devname, + unsigned int *irq, atomic_t *atomic) +{ + int ret; + struct sun4i_gpadc_dev *mfd_dev = dev_get_drvdata(pdev->dev.parent); + struct sun4i_gpadc_iio *info = iio_priv(dev_get_drvdata(&pdev->dev)); + + /* + * Once the interrupt is activated, the IP continuously performs + * conversions thus throws interrupts. The interrupt is activated right + * after being requested but we want to control when these interrupts + * occur thus we disable it right after being requested. However, an + * interrupt might occur between these two instructions and we have to + * make sure that does not happen, by using atomic flags. We set the + * flag before requesting the interrupt and unset it right after + * disabling the interrupt. When an interrupt occurs between these two + * instructions, reading the atomic flag will tell us to ignore the + * interrupt. 
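+ * Example timeline (sketch): probe sets the flag and requests the IRQ; + * if the hardware fires immediately, the handler sees the flag, skips the + * completion and only disables the IRQ; probe then disables the IRQ and + * clears the flag, and subsequent reads drive the line through matching + * enable_irq()/disable_irq_nosync() calls.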
+ */ + atomic_set(atomic, 1); + + ret = platform_get_irq_byname(pdev, name); + if (ret < 0) { + dev_err(&pdev->dev, "no %s interrupt registered\n", name); + return ret; + } + + ret = regmap_irq_get_virq(mfd_dev->regmap_irqc, ret); + if (ret < 0) { + dev_err(&pdev->dev, "failed to get virq for irq %s\n", name); + return ret; + } + + *irq = ret; + ret = devm_request_any_context_irq(&pdev->dev, *irq, handler, 0, + devname, info); + if (ret < 0) { + dev_err(&pdev->dev, "could not request %s interrupt: %d\n", + name, ret); + return ret; + } + + disable_irq(*irq); + atomic_set(atomic, 0); + + return 0; +} + +static int sun4i_gpadc_probe(struct platform_device *pdev) +{ + struct sun4i_gpadc_iio *info; + struct iio_dev *indio_dev; + int ret; + struct sun4i_gpadc_dev *sun4i_gpadc_dev; + + sun4i_gpadc_dev = dev_get_drvdata(pdev->dev.parent); + + indio_dev = devm_iio_device_alloc(&pdev->dev, sizeof(*info)); + if (!indio_dev) + return -ENOMEM; + + info = iio_priv(indio_dev); + platform_set_drvdata(pdev, indio_dev); + + mutex_init(&info->mutex); + info->regmap = sun4i_gpadc_dev->regmap; + info->indio_dev = indio_dev; + init_completion(&info->completion); + indio_dev->name = dev_name(&pdev->dev); + indio_dev->dev.parent = &pdev->dev; + indio_dev->dev.of_node = pdev->dev.of_node; + indio_dev->info = &sun4i_gpadc_iio_info; + indio_dev->modes = INDIO_DIRECT_MODE; + indio_dev->num_channels = ARRAY_SIZE(sun4i_gpadc_channels); + indio_dev->channels = sun4i_gpadc_channels; + + info->data = (struct gpadc_data *)platform_get_device_id(pdev)->driver_data; + + /* + * Since the controller needs to be in touchscreen mode for its thermal + * sensor to operate properly, and that switching between the two modes + * needs a delay, always registering in the thermal framework will + * significantly slow down the conversion rate of the ADCs. + * + * Therefore, instead of depending on THERMAL_OF in Kconfig, we only + * register the sensor if that option is enabled, eventually leaving + * that choice to the user. + */ + + if (IS_ENABLED(CONFIG_THERMAL_OF)) { + /* + * This driver is a child of an MFD which has a node in the DT + * but not its children, because of DT backward compatibility + * for A10, A13 and A31 SoCs. Therefore, the resulting devices + * of this driver do not have an of_node variable. + * However, its parent (the MFD driver) has an of_node variable + * and since devm_thermal_zone_of_sensor_register uses its first + * argument to match the phandle defined in the node of the + * thermal driver with the of_node of the device passed as first + * argument and the third argument to call ops from + * thermal_zone_of_device_ops, the solution is to use the parent + * device as first argument to match the phandle with its + * of_node, and the device from this driver as third argument to + * return the temperature. 
+ */ + struct thermal_zone_device *tzd; + tzd = devm_thermal_zone_of_sensor_register(pdev->dev.parent, 0, + info, + &sun4i_ts_tz_ops); + if (IS_ERR(tzd)) { + dev_err(&pdev->dev, + "could not register thermal sensor: %ld\n", + PTR_ERR(tzd)); + ret = PTR_ERR(tzd); + goto err; + } + } else { + indio_dev->num_channels = + ARRAY_SIZE(sun4i_gpadc_channels_no_temp); + indio_dev->channels = sun4i_gpadc_channels_no_temp; + } + + pm_runtime_set_autosuspend_delay(&pdev->dev, + SUN4I_GPADC_AUTOSUSPEND_DELAY); + pm_runtime_use_autosuspend(&pdev->dev); + pm_runtime_set_suspended(&pdev->dev); + pm_runtime_enable(&pdev->dev); + + if (IS_ENABLED(CONFIG_THERMAL_OF)) { + ret = sun4i_irq_init(pdev, "TEMP_DATA_PENDING", + sun4i_gpadc_temp_data_irq_handler, + "temp_data", &info->temp_data_irq, + &info->ignore_temp_data_irq); + if (ret < 0) + goto err; + } + + ret = sun4i_irq_init(pdev, "FIFO_DATA_PENDING", + sun4i_gpadc_fifo_data_irq_handler, "fifo_data", + &info->fifo_data_irq, &info->ignore_fifo_data_irq); + if (ret < 0) + goto err; + + if (IS_ENABLED(CONFIG_THERMAL_OF)) { + ret = iio_map_array_register(indio_dev, sun4i_gpadc_hwmon_maps); + if (ret < 0) { + dev_err(&pdev->dev, + "failed to register iio map array\n"); + goto err; + } + } + + ret = devm_iio_device_register(&pdev->dev, indio_dev); + if (ret < 0) { + dev_err(&pdev->dev, "could not register the device\n"); + goto err_map; + } + + return 0; + +err_map: + if (IS_ENABLED(CONFIG_THERMAL_OF)) + iio_map_array_unregister(indio_dev); + +err: + pm_runtime_put(&pdev->dev); + pm_runtime_disable(&pdev->dev); + + return ret; +} + +static int sun4i_gpadc_remove(struct platform_device *pdev) +{ + struct iio_dev *indio_dev = platform_get_drvdata(pdev); + + pm_runtime_put(&pdev->dev); + pm_runtime_disable(&pdev->dev); + if (IS_ENABLED(CONFIG_THERMAL_OF)) + iio_map_array_unregister(indio_dev); + + return 0; +} + +static const struct platform_device_id sun4i_gpadc_id[] = { + { "sun4i-a10-gpadc-iio", (kernel_ulong_t)&sun4i_gpadc_data }, + { "sun5i-a13-gpadc-iio", (kernel_ulong_t)&sun5i_gpadc_data }, + { "sun6i-a31-gpadc-iio", (kernel_ulong_t)&sun6i_gpadc_data }, + { /* sentinel */ }, +}; + +static struct platform_driver sun4i_gpadc_driver = { + .driver = { + .name = "sun4i-gpadc-iio", + .pm = &sun4i_gpadc_pm_ops, + }, + .id_table = sun4i_gpadc_id, + .probe = sun4i_gpadc_probe, + .remove = sun4i_gpadc_remove, +}; + +module_platform_driver(sun4i_gpadc_driver); + +MODULE_DESCRIPTION("ADC driver for sunxi platforms"); +MODULE_AUTHOR("Quentin Schulz "); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/mfd/sun4i-gpadc.h b/include/linux/mfd/sun4i-gpadc.h index d7a29f246d64..509e736d27fb 100644 --- a/include/linux/mfd/sun4i-gpadc.h +++ b/include/linux/mfd/sun4i-gpadc.h @@ -28,6 +28,7 @@ #define SUN4I_GPADC_CTRL1_TP_MODE_EN BIT(4) #define SUN4I_GPADC_CTRL1_TP_ADC_SELECT BIT(3) #define SUN4I_GPADC_CTRL1_ADC_CHAN_SELECT(x) (GENMASK(2, 0) & (x)) +#define SUN4I_GPADC_CTRL1_ADC_CHAN_MASK GENMASK(2, 0) /* TP_CTRL1 bits for sun6i SOCs */ #define SUN6I_GPADC_CTRL1_TOUCH_PAN_CALI_EN BIT(7) @@ -35,6 +36,7 @@ #define SUN6I_GPADC_CTRL1_TP_MODE_EN BIT(5) #define SUN6I_GPADC_CTRL1_TP_ADC_SELECT BIT(4) #define SUN6I_GPADC_CTRL1_ADC_CHAN_SELECT(x) (GENMASK(3, 0) & BIT(x)) +#define SUN6I_GPADC_CTRL1_ADC_CHAN_MASK GENMASK(3, 0) #define SUN4I_GPADC_CTRL2 0x08 -- cgit v1.2.3 From 7f501f0a72036dc29ad9a53811474c393634b401 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Tue, 24 May 2016 19:20:05 +0200 Subject: mtd: nand: Store nand ID in struct nand_chip Store the NAND ID in struct 
nand_chip to avoid passing id_data and id_len as function parameters. Signed-off-by: Boris Brezillon Acked-by: Richard Weinberger Reviewed-by: Marek Vasut --- drivers/mtd/nand/nand_base.c | 55 ++++++++++++++++++++++++-------------------- include/linux/mtd/nand.h | 13 +++++++++++ 2 files changed, 43 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index f7969e0d59fc..6f3ae626cabc 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -3821,18 +3821,16 @@ static int nand_get_bits_per_cell(u8 cellinfo) * chip. The rest of the parameters must be decoded according to generic or * manufacturer-specific "extended ID" decoding patterns. */ -static void nand_decode_ext_id(struct nand_chip *chip, u8 id_data[8], - int *busw) +static void nand_decode_ext_id(struct nand_chip *chip, int *busw) { struct mtd_info *mtd = nand_to_mtd(chip); - int extid, id_len; + int extid, id_len = chip->id.len; + u8 *id_data = chip->id.data; /* The 3rd id byte holds MLC / multichip data */ chip->bits_per_cell = nand_get_bits_per_cell(id_data[2]); /* The 4th id byte is the important one */ extid = id_data[3]; - id_len = nand_id_len(id_data, 8); - /* * Field definitions are in the following datasheets: * Old style (4,5 byte ID): Samsung K9GAG08U0M (p.32) @@ -3956,9 +3954,10 @@ static void nand_decode_ext_id(struct nand_chip *chip, u8 id_data[8], * the chip. */ static void nand_decode_id(struct nand_chip *chip, struct nand_flash_dev *type, - u8 id_data[8], int *busw) + int *busw) { struct mtd_info *mtd = nand_to_mtd(chip); + u8 *id_data = chip->id.data; int maf_id = id_data[0]; mtd->erasesize = type->erasesize; @@ -3988,9 +3987,10 @@ static void nand_decode_id(struct nand_chip *chip, struct nand_flash_dev *type, * heuristic patterns using various detected parameters (e.g., manufacturer, * page size, cell-type information). */ -static void nand_decode_bbm_options(struct nand_chip *chip, u8 id_data[8]) +static void nand_decode_bbm_options(struct nand_chip *chip) { struct mtd_info *mtd = nand_to_mtd(chip); + u8 *id_data = chip->id.data; int maf_id = id_data[0]; /* Set the bad block position */ @@ -4026,10 +4026,10 @@ static inline bool is_full_id_nand(struct nand_flash_dev *type) } static bool find_full_id_nand(struct nand_chip *chip, - struct nand_flash_dev *type, u8 *id_data, - int *busw) + struct nand_flash_dev *type, int *busw) { struct mtd_info *mtd = nand_to_mtd(chip); + u8 *id_data = chip->id.data; if (!strncmp(type->id, id_data, type->id_len)) { mtd->writesize = type->pagesize; @@ -4058,13 +4058,13 @@ static bool find_full_id_nand(struct nand_chip *chip, * Get the flash and manufacturer id and lookup if the type is supported. */ static int nand_get_flash_type(struct nand_chip *chip, - int *maf_id, int *dev_id, struct nand_flash_dev *type) { struct mtd_info *mtd = nand_to_mtd(chip); int busw; int i, maf_idx; - u8 id_data[8]; + u8 *id_data = chip->id.data; + u8 maf_id, dev_id; /* * Reset the chip, required by some chips (e.g. 
Micron MT29FxGxxxxx) @@ -4079,8 +4079,8 @@ static int nand_get_flash_type(struct nand_chip *chip, chip->cmdfunc(mtd, NAND_CMD_READID, 0x00, -1); /* Read manufacturer and device IDs */ - *maf_id = chip->read_byte(mtd); - *dev_id = chip->read_byte(mtd); + maf_id = chip->read_byte(mtd); + dev_id = chip->read_byte(mtd); /* * Try again to make sure, as some systems the bus-hold or other @@ -4095,20 +4095,22 @@ static int nand_get_flash_type(struct nand_chip *chip, for (i = 0; i < 8; i++) id_data[i] = chip->read_byte(mtd); - if (id_data[0] != *maf_id || id_data[1] != *dev_id) { + if (id_data[0] != maf_id || id_data[1] != dev_id) { pr_info("second ID read did not match %02x,%02x against %02x,%02x\n", - *maf_id, *dev_id, id_data[0], id_data[1]); + maf_id, dev_id, id_data[0], id_data[1]); return -ENODEV; } + chip->id.len = nand_id_len(id_data, 8); + if (!type) type = nand_flash_ids; for (; type->name != NULL; type++) { if (is_full_id_nand(type)) { - if (find_full_id_nand(chip, type, id_data, &busw)) + if (find_full_id_nand(chip, type, &busw)) goto ident_done; - } else if (*dev_id == type->dev_id) { + } else if (dev_id == type->dev_id) { break; } } @@ -4134,9 +4136,9 @@ static int nand_get_flash_type(struct nand_chip *chip, if (!type->pagesize) { /* Decode parameters from extended ID */ - nand_decode_ext_id(chip, id_data, &busw); + nand_decode_ext_id(chip, &busw); } else { - nand_decode_id(chip, type, id_data, &busw); + nand_decode_id(chip, type, &busw); } /* Get chip options */ chip->options |= type->options; @@ -4145,13 +4147,13 @@ static int nand_get_flash_type(struct nand_chip *chip, * Check if chip is not a Samsung device. Do not clear the * options for chips which do not have an extended id. */ - if (*maf_id != NAND_MFR_SAMSUNG && !type->pagesize) + if (maf_id != NAND_MFR_SAMSUNG && !type->pagesize) chip->options &= ~NAND_SAMSUNG_LP_OPTIONS; ident_done: /* Try to identify manufacturer */ for (maf_idx = 0; nand_manuf_ids[maf_idx].id != 0x0; maf_idx++) { - if (nand_manuf_ids[maf_idx].id == *maf_id) + if (nand_manuf_ids[maf_idx].id == maf_id) break; } @@ -4165,7 +4167,7 @@ ident_done: * chip correct! */ pr_info("device found, Manufacturer ID: 0x%02x, Chip ID: 0x%02x\n", - *maf_id, *dev_id); + maf_id, dev_id); pr_info("%s %s\n", nand_manuf_ids[maf_idx].name, mtd->name); pr_warn("bus width %d instead %d bit\n", (chip->options & NAND_BUSWIDTH_16) ? 
16 : 8, @@ -4173,7 +4175,7 @@ ident_done: return -EINVAL; } - nand_decode_bbm_options(chip, id_data); + nand_decode_bbm_options(chip); /* Calculate the address shift from the page size */ chip->page_shift = ffs(mtd->writesize) - 1; @@ -4197,7 +4199,7 @@ ident_done: chip->cmdfunc = nand_command_lp; pr_info("device found, Manufacturer ID: 0x%02x, Chip ID: 0x%02x\n", - *maf_id, *dev_id); + maf_id, dev_id); if (chip->onfi_version) pr_info("%s %s\n", nand_manuf_ids[maf_idx].name, @@ -4400,7 +4402,7 @@ int nand_scan_ident(struct mtd_info *mtd, int maxchips, nand_set_defaults(chip, chip->options & NAND_BUSWIDTH_16); /* Read the flash type */ - ret = nand_get_flash_type(chip, &nand_maf_id, &nand_dev_id, table); + ret = nand_get_flash_type(chip, table); if (ret) { if (!(chip->options & NAND_SCAN_SILENT_NODEV)) pr_warn("No NAND device found\n"); @@ -4425,6 +4427,9 @@ int nand_scan_ident(struct mtd_info *mtd, int maxchips, if (ret) return ret; + nand_maf_id = chip->id.data[0]; + nand_dev_id = chip->id.data[1]; + chip->select_chip(mtd, -1); /* Check for a chip array */ diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 9591e0fbe5bd..e2c11351b1bd 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -464,6 +464,17 @@ struct nand_jedec_params { __le16 crc; } __packed; +/** + * struct nand_id - NAND id structure + * @data: buffer containing the id bytes. Currently 8 bytes large, but can + * be extended if required. + * @len: ID length. + */ +struct nand_id { + u8 data[8]; + int len; +}; + /** * struct nand_hw_control - Control structure for hardware controller (e.g ECC generator) shared among independent devices * @lock: protection lock @@ -793,6 +804,7 @@ nand_get_sdr_timings(const struct nand_data_interface *conf) * @pagebuf_bitflips: [INTERN] holds the bitflip count for the page which is * currently in data_buf. * @subpagesize: [INTERN] holds the subpagesize + * @id: [INTERN] holds NAND ID * @onfi_version: [INTERN] holds the chip ONFI version (BCD encoded), * non 0 if ONFI supported. * @jedec_version: [INTERN] holds the chip JEDEC version (BCD encoded), @@ -881,6 +893,7 @@ struct nand_chip { int badblockpos; int badblockbits; + struct nand_id id; int onfi_version; int jedec_version; union { -- cgit v1.2.3 From 8cfb9ab68f90703d419870fce7ac21ac401399f2 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Sat, 7 Jan 2017 15:15:57 +0100 Subject: mtd: nand: Rename the nand_manufacturers struct Drop the 's' at the end of nand_manufacturers since the struct is actually describing a single manufacturer, not a manufacturer table. 
Signed-off-by: Boris Brezillon
---
 drivers/mtd/nand/nand_ids.c | 2 +-
 include/linux/mtd/nand.h | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c
index 4a2f75b0c200..3f80cfcb5e37 100644
--- a/drivers/mtd/nand/nand_ids.c
+++ b/drivers/mtd/nand/nand_ids.c
@@ -169,7 +169,7 @@ struct nand_flash_dev nand_flash_ids[] = {
 };
 /* Manufacturer IDs */
-struct nand_manufacturers nand_manuf_ids[] = {
+struct nand_manufacturer nand_manuf_ids[] = {
 {NAND_MFR_TOSHIBA, "Toshiba"},
 {NAND_MFR_ESMT, "ESMT"},
 {NAND_MFR_SAMSUNG, "Samsung"},
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index e2c11351b1bd..9c679e8bde42 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -1062,17 +1062,17 @@ struct nand_flash_dev {
 };
 /**
- * struct nand_manufacturers - NAND Flash Manufacturer ID Structure
+ * struct nand_manufacturer - NAND Flash Manufacturer structure
 * @name: Manufacturer name
 * @id: manufacturer ID code of device.
 */
-struct nand_manufacturers {
+struct nand_manufacturer {
 int id;
 char *name;
};
 extern struct nand_flash_dev nand_flash_ids[];
-extern struct nand_manufacturers nand_manuf_ids[];
+extern struct nand_manufacturer nand_manuf_ids[];
 int nand_default_bbt(struct mtd_info *mtd);
 int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs);
-- cgit v1.2.3

From bcc678c2d7a0e0af14cb3d858ebd367be378c172 Mon Sep 17 00:00:00 2001
From: Boris Brezillon
Date: Sat, 7 Jan 2017 15:48:25 +0100
Subject: mtd: nand: Do not expose the NAND manufacturer table directly

There is no reason to expose the NAND manufacturer table. Provide a helper function to find manufacturers by their id.

We also turn the nand_manufacturers table into a const array, since its members are not modified after the initial assignment.

Finally, we remove the sentinel manufacturer entry from the manufacturers table (we already have the array size information given by ARRAY_SIZE()), and add the nand_manufacturer_name() helper to handle the "Unknown" case properly.
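As an illustration of the resulting lookup API (a minimal sketch, not part of the diff below; it assumes a chip whose ID bytes have already been read, with the first byte being the manufacturer ID):

	const struct nand_manufacturer *manufacturer;
	u8 maf_id = chip->id.data[0];	/* first ID byte = manufacturer ID */

	manufacturer = nand_get_manufacturer(maf_id);
	/* a NULL result is handled gracefully: the name helper returns "Unknown" */
	pr_info("NAND manufacturer: %s\n", nand_manufacturer_name(manufacturer));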
Signed-off-by: Boris Brezillon
---
 drivers/mtd/nand/nand_base.c | 23 +++++++++++------------
 drivers/mtd/nand/nand_ids.c | 22 ++++++++++++++++++++--
 include/linux/mtd/nand.h | 9 ++++++++-
 3 files changed, 39 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index b1eb99e84044..0120252e0710 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -4052,9 +4052,10 @@ static bool find_full_id_nand(struct nand_chip *chip,
 */
static int nand_detect(struct nand_chip *chip, struct nand_flash_dev *type)
{
+ const struct nand_manufacturer *manufacturer;
 struct mtd_info *mtd = nand_to_mtd(chip);
 int busw;
- int i, maf_idx;
+ int i;
 u8 *id_data = chip->id.data;
 u8 maf_id, dev_id;
@@ -4159,10 +4160,7 @@ static int nand_detect(struct nand_chip *chip, struct nand_flash_dev *type)
ident_done:
 /* Try to identify manufacturer */
- for (maf_idx = 0; nand_manuf_ids[maf_idx].id != 0x0; maf_idx++) {
- if (nand_manuf_ids[maf_idx].id == maf_id)
- break;
- }
+ manufacturer = nand_get_manufacturer(maf_id);
 if (chip->options & NAND_BUSWIDTH_AUTO) {
 WARN_ON(busw & NAND_BUSWIDTH_16);
@@ -4174,7 +4172,8 @@ ident_done:
 */
 pr_info("device found, Manufacturer ID: 0x%02x, Chip ID: 0x%02x\n",
 maf_id, dev_id);
- pr_info("%s %s\n", nand_manuf_ids[maf_idx].name, mtd->name);
+ pr_info("%s %s\n", nand_manufacturer_name(manufacturer),
+ mtd->name);
 pr_warn("bus width %d instead of %d bits\n", busw ? 16 : 8,
 (chip->options & NAND_BUSWIDTH_16) ? 16 : 8);
 return -EINVAL;
@@ -4207,14 +4206,14 @@ ident_done:
 maf_id, dev_id);
 if (chip->onfi_version)
- pr_info("%s %s\n", nand_manuf_ids[maf_idx].name,
- chip->onfi_params.model);
+ pr_info("%s %s\n", nand_manufacturer_name(manufacturer),
+ chip->onfi_params.model);
 else if (chip->jedec_version)
- pr_info("%s %s\n", nand_manuf_ids[maf_idx].name,
- chip->jedec_params.model);
+ pr_info("%s %s\n", nand_manufacturer_name(manufacturer),
+ chip->jedec_params.model);
 else
- pr_info("%s %s\n", nand_manuf_ids[maf_idx].name,
- type->name);
+ pr_info("%s %s\n", nand_manufacturer_name(manufacturer),
+ type->name);
 pr_info("%d MiB, %s, erase size: %d KiB, page size: %d, OOB size: %d\n",
 (int)(chip->chipsize >> 20), nand_is_slc(chip) ? "SLC" : "MLC",
diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c
index bd267ade0742..06f59a6b74ff 100644
--- a/drivers/mtd/nand/nand_ids.c
+++ b/drivers/mtd/nand/nand_ids.c
@@ -169,7 +169,7 @@ struct nand_flash_dev nand_flash_ids[] = {
 };
 /* Manufacturer IDs */
-struct nand_manufacturer nand_manuf_ids[] = {
+static const struct nand_manufacturer nand_manufacturers[] = {
 {NAND_MFR_TOSHIBA, "Toshiba"},
 {NAND_MFR_ESMT, "ESMT"},
 {NAND_MFR_SAMSUNG, "Samsung"},
@@ -186,5 +186,23 @@ struct nand_manufacturer nand_manuf_ids[] = {
 {NAND_MFR_INTEL, "Intel"},
 {NAND_MFR_ATO, "ATO"},
 {NAND_MFR_WINBOND, "Winbond"},
- {0x0, "Unknown"}
 };
+
+/**
+ * nand_get_manufacturer - Get manufacturer information from the manufacturer
+ * ID
+ * @id: manufacturer ID
+ *
+ * Returns a pointer to a nand_manufacturer object if the manufacturer is defined
+ * in the NAND manufacturers database, NULL otherwise.
+ */
+const struct nand_manufacturer *nand_get_manufacturer(u8 id)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(nand_manufacturers); i++)
+ if (nand_manufacturers[i].id == id)
+ return &nand_manufacturers[i];
+
+ return NULL;
+}
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 9c679e8bde42..6415aa16043c 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -1071,8 +1071,15 @@ struct nand_manufacturer {
 char *name;
 };
+const struct nand_manufacturer *nand_get_manufacturer(u8 id);
+
+static inline const char *
+nand_manufacturer_name(const struct nand_manufacturer *manufacturer)
+{
+ return manufacturer ? manufacturer->name : "Unknown";
+}
+
 extern struct nand_flash_dev nand_flash_ids[];
-extern struct nand_manufacturer nand_manuf_ids[];
 int nand_default_bbt(struct mtd_info *mtd);
 int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs);
-- cgit v1.2.3

From abbe26d144ec22bb067fa414d717b9f7ca2e12bd Mon Sep 17 00:00:00 2001
From: Boris Brezillon
Date: Wed, 8 Jun 2016 09:32:55 +0200
Subject: mtd: nand: Add manufacturer specific initialization/detection steps

A lot of NANDs implement generic features in a non-generic way, or provide advanced auto-detection logic where the meaning of the NAND ID bytes changes with the NAND generation.

Providing this vendor specific initialization step will allow us to get rid of full-id entries in the nand_ids table and of all the vendor specific cases added over time to the generic NAND ID decoding logic.

Signed-off-by: Boris Brezillon
---
 drivers/mtd/nand/nand_base.c | 75 ++++++++++++++++++++++++++++++++++++--------
 include/linux/mtd/nand.h | 35 +++++++++++++++++++++
 2 files changed, 100 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 0120252e0710..92ff6adbd7c9 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -3819,7 +3819,7 @@ static int nand_get_bits_per_cell(u8 cellinfo)
 * chip. The rest of the parameters must be decoded according to generic or
 * manufacturer-specific "extended ID" decoding patterns.
 */
-static void nand_decode_ext_id(struct nand_chip *chip)
+void nand_decode_ext_id(struct nand_chip *chip)
{
 struct mtd_info *mtd = nand_to_mtd(chip);
 int extid, id_len = chip->id.len;
@@ -3944,6 +3944,7 @@ static void nand_decode_ext_id(struct nand_chip *chip)
 }
}
+EXPORT_SYMBOL_GPL(nand_decode_ext_id);
/*
 * Old devices have chip data hardcoded in the device ID table. nand_decode_id
@@ -4047,6 +4048,53 @@ static bool find_full_id_nand(struct nand_chip *chip,
 return false;
}
+/*
+ * Manufacturer detection. Only used when the NAND is not ONFI or JEDEC
+ * compliant and does not have a full-id or legacy-id entry in the nand_ids
+ * table.
+ */
+static void nand_manufacturer_detect(struct nand_chip *chip)
+{
+ /*
+ * Try manufacturer detection if available and use
+ * nand_decode_ext_id() otherwise.
+ */
+ if (chip->manufacturer.desc && chip->manufacturer.desc->ops &&
+ chip->manufacturer.desc->ops->detect)
+ chip->manufacturer.desc->ops->detect(chip);
+ else
+ nand_decode_ext_id(chip);
+}
+
+/*
+ * Manufacturer initialization. This function is called for all NANDs including
+ * ONFI and JEDEC compliant ones.
+ * Manufacturer drivers should put all their specific initialization code in
+ * their ->init() hook.
+ */ +static int nand_manufacturer_init(struct nand_chip *chip) +{ + if (!chip->manufacturer.desc || !chip->manufacturer.desc->ops || + !chip->manufacturer.desc->ops->init) + return 0; + + return chip->manufacturer.desc->ops->init(chip); +} + +/* + * Manufacturer cleanup. This function is called for all NANDs including + * ONFI and JEDEC compliant ones. + * Manufacturer drivers should put all their specific cleanup code in their + * ->cleanup() hook. + */ +static void nand_manufacturer_cleanup(struct nand_chip *chip) +{ + /* Release manufacturer private data */ + if (chip->manufacturer.desc && chip->manufacturer.desc->ops && + chip->manufacturer.desc->ops->cleanup) + chip->manufacturer.desc->ops->cleanup(chip); +} + /* * Get the flash and manufacturer id and lookup if the type is supported. */ @@ -4055,7 +4103,7 @@ static int nand_detect(struct nand_chip *chip, struct nand_flash_dev *type) const struct nand_manufacturer *manufacturer; struct mtd_info *mtd = nand_to_mtd(chip); int busw; - int i; + int i, ret; u8 *id_data = chip->id.data; u8 maf_id, dev_id; @@ -4096,6 +4144,10 @@ static int nand_detect(struct nand_chip *chip, struct nand_flash_dev *type) chip->id.len = nand_id_len(id_data, 8); + /* Try to identify manufacturer */ + manufacturer = nand_get_manufacturer(maf_id); + chip->manufacturer.desc = manufacturer; + if (!type) type = nand_flash_ids; @@ -4142,12 +4194,11 @@ static int nand_detect(struct nand_chip *chip, struct nand_flash_dev *type) chip->chipsize = (uint64_t)type->chipsize << 20; - if (!type->pagesize) { - /* Decode parameters from extended ID */ - nand_decode_ext_id(chip); - } else { + if (!type->pagesize) + nand_manufacturer_detect(chip); + else nand_decode_id(chip, type); - } + /* Get chip options */ chip->options |= type->options; @@ -4159,9 +4210,6 @@ static int nand_detect(struct nand_chip *chip, struct nand_flash_dev *type) chip->options &= ~NAND_SAMSUNG_LP_OPTIONS; ident_done: - /* Try to identify manufacturer */ - manufacturer = nand_get_manufacturer(maf_id); - if (chip->options & NAND_BUSWIDTH_AUTO) { WARN_ON(busw & NAND_BUSWIDTH_16); nand_set_defaults(chip); @@ -4202,6 +4250,10 @@ ident_done: if (mtd->writesize > 512 && chip->cmdfunc == nand_command) chip->cmdfunc = nand_command_lp; + ret = nand_manufacturer_init(chip); + if (ret) + return ret; + pr_info("device found, Manufacturer ID: 0x%02x, Chip ID: 0x%02x\n", maf_id, dev_id); @@ -4947,6 +4999,9 @@ void nand_cleanup(struct nand_chip *chip) if (chip->badblock_pattern && chip->badblock_pattern->options & NAND_BBT_DYNAMICSTRUCT) kfree(chip->badblock_pattern); + + /* Free manufacturer priv data. */ + nand_manufacturer_cleanup(chip); } EXPORT_SYMBOL_GPL(nand_cleanup); diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 6415aa16043c..ee9a19f42293 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -731,6 +731,20 @@ nand_get_sdr_timings(const struct nand_data_interface *conf) return &conf->timings.sdr; } +/** + * struct nand_manufacturer_ops - NAND Manufacturer operations + * @detect: detect the NAND memory organization and capabilities + * @init: initialize all vendor specific fields (like the ->read_retry() + * implementation) if any. + * @cleanup: the ->init() function may have allocated resources, ->cleanup() + * is here to let vendor specific code release those resources. 
+ */ +struct nand_manufacturer_ops { + void (*detect)(struct nand_chip *chip); + int (*init)(struct nand_chip *chip); + void (*cleanup)(struct nand_chip *chip); +}; + /** * struct nand_chip - NAND Private Flash Chip Data * @mtd: MTD device registered to the MTD framework @@ -835,6 +849,7 @@ nand_get_sdr_timings(const struct nand_data_interface *conf) * additional error status checks (determine if errors are * correctable). * @write_page: [REPLACEABLE] High-level page write function + * @manufacturer: [INTERN] Contains manufacturer information */ struct nand_chip { @@ -923,6 +938,11 @@ struct nand_chip { struct nand_bbt_descr *badblock_pattern; void *priv; + + struct { + const struct nand_manufacturer *desc; + void *priv; + } manufacturer; }; extern const struct mtd_ooblayout_ops nand_ooblayout_sp_ops; @@ -959,6 +979,17 @@ static inline void nand_set_controller_data(struct nand_chip *chip, void *priv) chip->priv = priv; } +static inline void nand_set_manufacturer_data(struct nand_chip *chip, + void *priv) +{ + chip->manufacturer.priv = priv; +} + +static inline void *nand_get_manufacturer_data(struct nand_chip *chip) +{ + return chip->manufacturer.priv; +} + /* * NAND Flash Manufacturer ID Codes */ @@ -1065,10 +1096,12 @@ struct nand_flash_dev { * struct nand_manufacturer - NAND Flash Manufacturer structure * @name: Manufacturer name * @id: manufacturer ID code of device. + * @ops: manufacturer operations */ struct nand_manufacturer { int id; char *name; + const struct nand_manufacturer_ops *ops; }; const struct nand_manufacturer *nand_get_manufacturer(u8 id); @@ -1246,4 +1279,6 @@ int nand_reset(struct nand_chip *chip, int chipnr); /* Free resources held by the NAND device */ void nand_cleanup(struct nand_chip *chip); +/* Default extended ID decoding function */ +void nand_decode_ext_id(struct nand_chip *chip); #endif /* __LINUX_MTD_NAND_H */ -- cgit v1.2.3 From c51d0ac59f24200dfdccc897ff7c3c9446c7599a Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 8 Jun 2016 10:22:19 +0200 Subject: mtd: nand: Move Samsung specific init/detection logic in nand_samsung.c Move Samsung specific initialization and detection logic into nand_samsung.c. This is part of the "separate vendor specific code from core" cleanup process. 
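The vendor files introduced in this and the following commits all share the same shape; here is a hypothetical skeleton for an imaginary vendor, a sketch using only the hooks and helpers added in the previous commit (all "foo" names are made up for illustration):

	#include <linux/mtd/nand.h>
	#include <linux/slab.h>

	/* hypothetical vendor-private state attached to the chip */
	struct foo_nand_priv {
		int read_retry_mode;
	};

	static void foo_nand_decode_id(struct nand_chip *chip)
	{
		/* no vendor-specific ID scheme: fall back to the generic decoder */
		nand_decode_ext_id(chip);
	}

	static int foo_nand_init(struct nand_chip *chip)
	{
		struct foo_nand_priv *priv;

		priv = kzalloc(sizeof(*priv), GFP_KERNEL);
		if (!priv)
			return -ENOMEM;

		nand_set_manufacturer_data(chip, priv);
		return 0;
	}

	static void foo_nand_cleanup(struct nand_chip *chip)
	{
		/* release what ->init() allocated */
		kfree(nand_get_manufacturer_data(chip));
	}

	const struct nand_manufacturer_ops foo_nand_manuf_ops = {
		.detect = foo_nand_decode_id,
		.init = foo_nand_init,
		.cleanup = foo_nand_cleanup,
	};

The Samsung move below is the first real instance of this pattern.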
Signed-off-by: Boris Brezillon Acked-by: Richard Weinberger --- drivers/mtd/nand/Makefile | 1 + drivers/mtd/nand/nand_base.c | 52 ++--------------------- drivers/mtd/nand/nand_ids.c | 4 +- drivers/mtd/nand/nand_samsung.c | 92 +++++++++++++++++++++++++++++++++++++++++ include/linux/mtd/nand.h | 2 + 5 files changed, 101 insertions(+), 50 deletions(-) create mode 100644 drivers/mtd/nand/nand_samsung.c (limited to 'include/linux') diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile index bfd5d12b9ade..d4b90b0f879e 100644 --- a/drivers/mtd/nand/Makefile +++ b/drivers/mtd/nand/Makefile @@ -61,3 +61,4 @@ obj-$(CONFIG_MTD_NAND_QCOM) += qcom_nandc.o obj-$(CONFIG_MTD_NAND_MTK) += mtk_nand.o mtk_ecc.o nand-objs := nand_base.o nand_bbt.o nand_timings.o nand_ids.o +nand-objs += nand_samsung.o diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 92ff6adbd7c9..fd38d59d33a6 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -3832,48 +3832,13 @@ void nand_decode_ext_id(struct nand_chip *chip) /* * Field definitions are in the following datasheets: * Old style (4,5 byte ID): Samsung K9GAG08U0M (p.32) - * New Samsung (6 byte ID): Samsung K9GAG08U0F (p.44) * Hynix MLC (6 byte ID): Hynix H27UBG8T2B (p.22) * * Check for ID length, non-zero 6th byte, cell type, and Hynix/Samsung * ID to decide what to do. */ - if (id_len == 6 && id_data[0] == NAND_MFR_SAMSUNG && - !nand_is_slc(chip) && id_data[5] != 0x00) { - /* Calc pagesize */ - mtd->writesize = 2048 << (extid & 0x03); - extid >>= 2; - /* Calc oobsize */ - switch (((extid >> 2) & 0x04) | (extid & 0x03)) { - case 1: - mtd->oobsize = 128; - break; - case 2: - mtd->oobsize = 218; - break; - case 3: - mtd->oobsize = 400; - break; - case 4: - mtd->oobsize = 436; - break; - case 5: - mtd->oobsize = 512; - break; - case 6: - mtd->oobsize = 640; - break; - case 7: - default: /* Other cases are "reserved" (unknown) */ - mtd->oobsize = 1024; - break; - } - extid >>= 2; - /* Calc blocksize */ - mtd->erasesize = (128 * 1024) << - (((extid >> 1) & 0x04) | (extid & 0x03)); - } else if (id_len == 6 && id_data[0] == NAND_MFR_HYNIX && - !nand_is_slc(chip)) { + if (id_len == 6 && id_data[0] == NAND_MFR_HYNIX && + !nand_is_slc(chip)) { unsigned int tmp; /* Calc pagesize */ @@ -4001,13 +3966,10 @@ static void nand_decode_bbm_options(struct nand_chip *chip) * Micron devices with 2KiB pages and on SLC Samsung, Hynix, Toshiba, * AMD/Spansion, and Macronix. All others scan only the first page. */ - if (!nand_is_slc(chip) && - (maf_id == NAND_MFR_SAMSUNG || - maf_id == NAND_MFR_HYNIX)) + if (!nand_is_slc(chip) && maf_id == NAND_MFR_HYNIX) chip->bbt_options |= NAND_BBT_SCANLASTPAGE; else if ((nand_is_slc(chip) && - (maf_id == NAND_MFR_SAMSUNG || - maf_id == NAND_MFR_HYNIX || + (maf_id == NAND_MFR_HYNIX || maf_id == NAND_MFR_TOSHIBA || maf_id == NAND_MFR_AMD || maf_id == NAND_MFR_MACRONIX)) || @@ -4202,12 +4164,6 @@ static int nand_detect(struct nand_chip *chip, struct nand_flash_dev *type) /* Get chip options */ chip->options |= type->options; - /* - * Check if chip is not a Samsung device. Do not clear the - * options for chips which do not have an extended id. 
- */ - if (maf_id != NAND_MFR_SAMSUNG && !type->pagesize) - chip->options &= ~NAND_SAMSUNG_LP_OPTIONS; ident_done: if (chip->options & NAND_BUSWIDTH_AUTO) { diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c index 06f59a6b74ff..8eba7dfe7c1f 100644 --- a/drivers/mtd/nand/nand_ids.c +++ b/drivers/mtd/nand/nand_ids.c @@ -10,7 +10,7 @@ #include #include -#define LP_OPTIONS NAND_SAMSUNG_LP_OPTIONS +#define LP_OPTIONS 0 #define LP_OPTIONS16 (LP_OPTIONS | NAND_BUSWIDTH_16) #define SP_OPTIONS NAND_NEED_READRDY @@ -172,7 +172,7 @@ struct nand_flash_dev nand_flash_ids[] = { static const struct nand_manufacturer nand_manufacturers[] = { {NAND_MFR_TOSHIBA, "Toshiba"}, {NAND_MFR_ESMT, "ESMT"}, - {NAND_MFR_SAMSUNG, "Samsung"}, + {NAND_MFR_SAMSUNG, "Samsung", &samsung_nand_manuf_ops}, {NAND_MFR_FUJITSU, "Fujitsu"}, {NAND_MFR_NATIONAL, "National"}, {NAND_MFR_RENESAS, "Renesas"}, diff --git a/drivers/mtd/nand/nand_samsung.c b/drivers/mtd/nand/nand_samsung.c new file mode 100644 index 000000000000..5c259dd5b652 --- /dev/null +++ b/drivers/mtd/nand/nand_samsung.c @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2017 Free Electrons + * Copyright (C) 2017 NextThing Co + * + * Author: Boris Brezillon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include + +static void samsung_nand_decode_id(struct nand_chip *chip) +{ + struct mtd_info *mtd = nand_to_mtd(chip); + + /* New Samsung (6 byte ID): Samsung K9GAG08U0F (p.44) */ + if (chip->id.len == 6 && !nand_is_slc(chip) && + chip->id.data[5] != 0x00) { + u8 extid = chip->id.data[3]; + + /* Get pagesize */ + mtd->writesize = 2048 << (extid & 0x03); + + extid >>= 2; + + /* Get oobsize */ + switch (((extid >> 2) & 0x4) | (extid & 0x3)) { + case 1: + mtd->oobsize = 128; + break; + case 2: + mtd->oobsize = 218; + break; + case 3: + mtd->oobsize = 400; + break; + case 4: + mtd->oobsize = 436; + break; + case 5: + mtd->oobsize = 512; + break; + case 6: + mtd->oobsize = 640; + break; + default: + /* + * We should never reach this case, but if that + * happens, this probably means Samsung decided to use + * a different extended ID format, and we should find + * a way to support it. 
+ */ + WARN(1, "Invalid OOB size value"); + break; + } + + /* Get blocksize */ + extid >>= 2; + mtd->erasesize = (128 * 1024) << + (((extid >> 1) & 0x04) | (extid & 0x03)); + } else { + nand_decode_ext_id(chip); + } +} + +static int samsung_nand_init(struct nand_chip *chip) +{ + struct mtd_info *mtd = nand_to_mtd(chip); + + if (mtd->writesize > 512) + chip->options |= NAND_SAMSUNG_LP_OPTIONS; + + if (!nand_is_slc(chip)) + chip->bbt_options |= NAND_BBT_SCANLASTPAGE; + else + chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; + + return 0; +} + +const struct nand_manufacturer_ops samsung_nand_manuf_ops = { + .detect = samsung_nand_decode_id, + .init = samsung_nand_init, +}; diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index ee9a19f42293..2f83cb55392f 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1114,6 +1114,8 @@ nand_manufacturer_name(const struct nand_manufacturer *manufacturer) extern struct nand_flash_dev nand_flash_ids[]; +extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; + int nand_default_bbt(struct mtd_info *mtd); int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); int nand_isreserved_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3 From 01389b6bd2f4f7649cdbb4a99a15d9e0c05d6f8c Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 8 Jun 2016 10:30:18 +0200 Subject: mtd: nand: Move Hynix specific init/detection logic in nand_hynix.c Move Hynix specific initialization and detection logic into nand_hynix.c. This is part of the "separate vendor specific code from core" cleanup process. Signed-off-by: Boris Brezillon Acked-by: Richard Weinberger --- drivers/mtd/nand/Makefile | 1 + drivers/mtd/nand/nand_base.c | 114 +++++++++++------------------------------- drivers/mtd/nand/nand_hynix.c | 84 +++++++++++++++++++++++++++++++ drivers/mtd/nand/nand_ids.c | 2 +- include/linux/mtd/nand.h | 1 + 5 files changed, 115 insertions(+), 87 deletions(-) create mode 100644 drivers/mtd/nand/nand_hynix.c (limited to 'include/linux') diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile index d4b90b0f879e..ddb2670d9767 100644 --- a/drivers/mtd/nand/Makefile +++ b/drivers/mtd/nand/Makefile @@ -61,4 +61,5 @@ obj-$(CONFIG_MTD_NAND_QCOM) += qcom_nandc.o obj-$(CONFIG_MTD_NAND_MTK) += mtk_nand.o mtk_ecc.o nand-objs := nand_base.o nand_bbt.o nand_timings.o nand_ids.o +nand-objs += nand_hynix.o nand-objs += nand_samsung.o diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index fd38d59d33a6..d65db5921a0f 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -3829,85 +3829,32 @@ void nand_decode_ext_id(struct nand_chip *chip) /* The 4th id byte is the important one */ extid = id_data[3]; + /* Calc pagesize */ + mtd->writesize = 1024 << (extid & 0x03); + extid >>= 2; + /* Calc oobsize */ + mtd->oobsize = (8 << (extid & 0x01)) * (mtd->writesize >> 9); + extid >>= 2; + /* Calc blocksize. Blocksize is multiples of 64KiB */ + mtd->erasesize = (64 * 1024) << (extid & 0x03); + extid >>= 2; + /* Get buswidth information */ + if (extid & 0x1) + chip->options |= NAND_BUSWIDTH_16; + /* - * Field definitions are in the following datasheets: - * Old style (4,5 byte ID): Samsung K9GAG08U0M (p.32) - * Hynix MLC (6 byte ID): Hynix H27UBG8T2B (p.22) - * - * Check for ID length, non-zero 6th byte, cell type, and Hynix/Samsung - * ID to decide what to do. + * Toshiba 24nm raw SLC (i.e., not BENAND) have 32B OOB per + * 512B page. 
For Toshiba SLC, we decode the 5th/6th byte as + * follows: + * - ID byte 6, bits[2:0]: 100b -> 43nm, 101b -> 32nm, + * 110b -> 24nm + * - ID byte 5, bit[7]: 1 -> BENAND, 0 -> raw SLC */ - if (id_len == 6 && id_data[0] == NAND_MFR_HYNIX && - !nand_is_slc(chip)) { - unsigned int tmp; - - /* Calc pagesize */ - mtd->writesize = 2048 << (extid & 0x03); - extid >>= 2; - /* Calc oobsize */ - switch (((extid >> 2) & 0x04) | (extid & 0x03)) { - case 0: - mtd->oobsize = 128; - break; - case 1: - mtd->oobsize = 224; - break; - case 2: - mtd->oobsize = 448; - break; - case 3: - mtd->oobsize = 64; - break; - case 4: - mtd->oobsize = 32; - break; - case 5: - mtd->oobsize = 16; - break; - default: - mtd->oobsize = 640; - break; - } - extid >>= 2; - /* Calc blocksize */ - tmp = ((extid >> 1) & 0x04) | (extid & 0x03); - if (tmp < 0x03) - mtd->erasesize = (128 * 1024) << tmp; - else if (tmp == 0x03) - mtd->erasesize = 768 * 1024; - else - mtd->erasesize = (64 * 1024) << tmp; - } else { - /* Calc pagesize */ - mtd->writesize = 1024 << (extid & 0x03); - extid >>= 2; - /* Calc oobsize */ - mtd->oobsize = (8 << (extid & 0x01)) * - (mtd->writesize >> 9); - extid >>= 2; - /* Calc blocksize. Blocksize is multiples of 64KiB */ - mtd->erasesize = (64 * 1024) << (extid & 0x03); - extid >>= 2; - /* Get buswidth information */ - if (extid & 0x1) - chip->options |= NAND_BUSWIDTH_16; - - /* - * Toshiba 24nm raw SLC (i.e., not BENAND) have 32B OOB per - * 512B page. For Toshiba SLC, we decode the 5th/6th byte as - * follows: - * - ID byte 6, bits[2:0]: 100b -> 43nm, 101b -> 32nm, - * 110b -> 24nm - * - ID byte 5, bit[7]: 1 -> BENAND, 0 -> raw SLC - */ - if (id_len >= 6 && id_data[0] == NAND_MFR_TOSHIBA && - nand_is_slc(chip) && - (id_data[5] & 0x7) == 0x6 /* 24nm */ && - !(id_data[4] & 0x80) /* !BENAND */) { - mtd->oobsize = 32 * mtd->writesize >> 9; - } - - } + if (id_len >= 6 && id_data[0] == NAND_MFR_TOSHIBA && + nand_is_slc(chip) && + (id_data[5] & 0x7) == 0x6 /* 24nm */ && + !(id_data[4] & 0x80) /* !BENAND */) + mtd->oobsize = 32 * mtd->writesize >> 9; } EXPORT_SYMBOL_GPL(nand_decode_ext_id); @@ -3966,15 +3913,10 @@ static void nand_decode_bbm_options(struct nand_chip *chip) * Micron devices with 2KiB pages and on SLC Samsung, Hynix, Toshiba, * AMD/Spansion, and Macronix. All others scan only the first page. */ - if (!nand_is_slc(chip) && maf_id == NAND_MFR_HYNIX) - chip->bbt_options |= NAND_BBT_SCANLASTPAGE; - else if ((nand_is_slc(chip) && - (maf_id == NAND_MFR_HYNIX || - maf_id == NAND_MFR_TOSHIBA || - maf_id == NAND_MFR_AMD || - maf_id == NAND_MFR_MACRONIX)) || - (mtd->writesize == 2048 && - maf_id == NAND_MFR_MICRON)) + if ((nand_is_slc(chip) && + (maf_id == NAND_MFR_TOSHIBA || maf_id == NAND_MFR_AMD || + maf_id == NAND_MFR_MACRONIX)) || + (mtd->writesize == 2048 && maf_id == NAND_MFR_MICRON)) chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; } diff --git a/drivers/mtd/nand/nand_hynix.c b/drivers/mtd/nand/nand_hynix.c new file mode 100644 index 000000000000..fbd661688158 --- /dev/null +++ b/drivers/mtd/nand/nand_hynix.c @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2017 Free Electrons + * Copyright (C) 2017 NextThing Co + * + * Author: Boris Brezillon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include + +static void hynix_nand_decode_id(struct nand_chip *chip) +{ + struct mtd_info *mtd = nand_to_mtd(chip); + + /* Hynix MLC (6 byte ID): Hynix H27UBG8T2B (p.22) */ + if (chip->id.len == 6 && !nand_is_slc(chip)) { + u8 tmp, extid = chip->id.data[3]; + + /* Extract pagesize */ + mtd->writesize = 2048 << (extid & 0x03); + extid >>= 2; + + /* Extract oobsize */ + switch (((extid >> 2) & 0x4) | (extid & 0x3)) { + case 0: + mtd->oobsize = 128; + break; + case 1: + mtd->oobsize = 224; + break; + case 2: + mtd->oobsize = 448; + break; + case 3: + mtd->oobsize = 64; + break; + case 4: + mtd->oobsize = 32; + break; + case 5: + mtd->oobsize = 16; + break; + default: + mtd->oobsize = 640; + break; + } + + /* Extract blocksize */ + extid >>= 2; + tmp = ((extid >> 1) & 0x04) | (extid & 0x03); + if (tmp < 0x03) + mtd->erasesize = (128 * 1024) << tmp; + else if (tmp == 0x03) + mtd->erasesize = 768 * 1024; + else + mtd->erasesize = (64 * 1024) << tmp; + } else { + nand_decode_ext_id(chip); + } +} + +static int hynix_nand_init(struct nand_chip *chip) +{ + if (!nand_is_slc(chip)) + chip->bbt_options |= NAND_BBT_SCANLASTPAGE; + else + chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; + + return 0; +} + +const struct nand_manufacturer_ops hynix_nand_manuf_ops = { + .detect = hynix_nand_decode_id, + .init = hynix_nand_init, +}; diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c index 8eba7dfe7c1f..7dc06a807d00 100644 --- a/drivers/mtd/nand/nand_ids.c +++ b/drivers/mtd/nand/nand_ids.c @@ -177,7 +177,7 @@ static const struct nand_manufacturer nand_manufacturers[] = { {NAND_MFR_NATIONAL, "National"}, {NAND_MFR_RENESAS, "Renesas"}, {NAND_MFR_STMICRO, "ST Micro"}, - {NAND_MFR_HYNIX, "Hynix"}, + {NAND_MFR_HYNIX, "Hynix", &hynix_nand_manuf_ops}, {NAND_MFR_MICRON, "Micron"}, {NAND_MFR_AMD, "AMD/Spansion"}, {NAND_MFR_MACRONIX, "Macronix"}, diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 2f83cb55392f..74e3a231cb56 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1115,6 +1115,7 @@ nand_manufacturer_name(const struct nand_manufacturer *manufacturer) extern struct nand_flash_dev nand_flash_ids[]; extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; +extern const struct nand_manufacturer_ops hynix_nand_manuf_ops; int nand_default_bbt(struct mtd_info *mtd); int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3 From 9b2d61f80b060ce3ea5af2a99e148b0b214932b2 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 8 Jun 2016 10:34:57 +0200 Subject: mtd: nand: Move Toshiba specific init/detection logic in nand_toshiba.c Move Toshiba specific initialization and detection logic into nand_toshiba.c. This is part of the "separate vendor specific code from core" cleanup process. 
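As a worked example of the decode override being moved here (part parameters assumed for illustration): on a hypothetical Toshiba 24nm raw SLC chip with 4 KiB pages, mtd->oobsize = 32 * mtd->writesize >> 9 evaluates to (32 * 4096) / 512 = 256 bytes, i.e. 32 bytes of OOB per 512-byte sector, where the generic extended-ID decode would only yield 8 or 16 bytes per 512-byte sector.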
Signed-off-by: Boris Brezillon Acked-by: Richard Weinberger --- drivers/mtd/nand/Makefile | 1 + drivers/mtd/nand/nand_base.c | 19 ++------------- drivers/mtd/nand/nand_ids.c | 2 +- drivers/mtd/nand/nand_toshiba.c | 51 +++++++++++++++++++++++++++++++++++++++++ include/linux/mtd/nand.h | 1 + 5 files changed, 56 insertions(+), 18 deletions(-) create mode 100644 drivers/mtd/nand/nand_toshiba.c (limited to 'include/linux') diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile index ddb2670d9767..7c059822f479 100644 --- a/drivers/mtd/nand/Makefile +++ b/drivers/mtd/nand/Makefile @@ -63,3 +63,4 @@ obj-$(CONFIG_MTD_NAND_MTK) += mtk_nand.o mtk_ecc.o nand-objs := nand_base.o nand_bbt.o nand_timings.o nand_ids.o nand-objs += nand_hynix.o nand-objs += nand_samsung.o +nand-objs += nand_toshiba.o diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index d65db5921a0f..36bca97900af 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -3822,7 +3822,7 @@ static int nand_get_bits_per_cell(u8 cellinfo) void nand_decode_ext_id(struct nand_chip *chip) { struct mtd_info *mtd = nand_to_mtd(chip); - int extid, id_len = chip->id.len; + int extid; u8 *id_data = chip->id.data; /* The 3rd id byte holds MLC / multichip data */ chip->bits_per_cell = nand_get_bits_per_cell(id_data[2]); @@ -3841,20 +3841,6 @@ void nand_decode_ext_id(struct nand_chip *chip) /* Get buswidth information */ if (extid & 0x1) chip->options |= NAND_BUSWIDTH_16; - - /* - * Toshiba 24nm raw SLC (i.e., not BENAND) have 32B OOB per - * 512B page. For Toshiba SLC, we decode the 5th/6th byte as - * follows: - * - ID byte 6, bits[2:0]: 100b -> 43nm, 101b -> 32nm, - * 110b -> 24nm - * - ID byte 5, bit[7]: 1 -> BENAND, 0 -> raw SLC - */ - if (id_len >= 6 && id_data[0] == NAND_MFR_TOSHIBA && - nand_is_slc(chip) && - (id_data[5] & 0x7) == 0x6 /* 24nm */ && - !(id_data[4] & 0x80) /* !BENAND */) - mtd->oobsize = 32 * mtd->writesize >> 9; } EXPORT_SYMBOL_GPL(nand_decode_ext_id); @@ -3914,8 +3900,7 @@ static void nand_decode_bbm_options(struct nand_chip *chip) * AMD/Spansion, and Macronix. All others scan only the first page. */ if ((nand_is_slc(chip) && - (maf_id == NAND_MFR_TOSHIBA || maf_id == NAND_MFR_AMD || - maf_id == NAND_MFR_MACRONIX)) || + (maf_id == NAND_MFR_AMD || maf_id == NAND_MFR_MACRONIX)) || (mtd->writesize == 2048 && maf_id == NAND_MFR_MICRON)) chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; } diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c index 7dc06a807d00..22ff89e9c478 100644 --- a/drivers/mtd/nand/nand_ids.c +++ b/drivers/mtd/nand/nand_ids.c @@ -170,7 +170,7 @@ struct nand_flash_dev nand_flash_ids[] = { /* Manufacturer IDs */ static const struct nand_manufacturer nand_manufacturers[] = { - {NAND_MFR_TOSHIBA, "Toshiba"}, + {NAND_MFR_TOSHIBA, "Toshiba", &toshiba_nand_manuf_ops}, {NAND_MFR_ESMT, "ESMT"}, {NAND_MFR_SAMSUNG, "Samsung", &samsung_nand_manuf_ops}, {NAND_MFR_FUJITSU, "Fujitsu"}, diff --git a/drivers/mtd/nand/nand_toshiba.c b/drivers/mtd/nand/nand_toshiba.c new file mode 100644 index 000000000000..fa787ba38dcd --- /dev/null +++ b/drivers/mtd/nand/nand_toshiba.c @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2017 Free Electrons + * Copyright (C) 2017 NextThing Co + * + * Author: Boris Brezillon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include + +static void toshiba_nand_decode_id(struct nand_chip *chip) +{ + struct mtd_info *mtd = nand_to_mtd(chip); + + nand_decode_ext_id(chip); + + /* + * Toshiba 24nm raw SLC (i.e., not BENAND) have 32B OOB per + * 512B page. For Toshiba SLC, we decode the 5th/6th byte as + * follows: + * - ID byte 6, bits[2:0]: 100b -> 43nm, 101b -> 32nm, + * 110b -> 24nm + * - ID byte 5, bit[7]: 1 -> BENAND, 0 -> raw SLC + */ + if (chip->id.len >= 6 && nand_is_slc(chip) && + (chip->id.data[5] & 0x7) == 0x6 /* 24nm */ && + !(chip->id.data[4] & 0x80) /* !BENAND */) + mtd->oobsize = 32 * mtd->writesize >> 9; +} + +static int toshiba_nand_init(struct nand_chip *chip) +{ + if (nand_is_slc(chip)) + chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; + + return 0; +} + +const struct nand_manufacturer_ops toshiba_nand_manuf_ops = { + .detect = toshiba_nand_decode_id, + .init = toshiba_nand_init, +}; diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 74e3a231cb56..dd9e3b5ddd4f 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1114,6 +1114,7 @@ nand_manufacturer_name(const struct nand_manufacturer *manufacturer) extern struct nand_flash_dev nand_flash_ids[]; +extern const struct nand_manufacturer_ops toshiba_nand_manuf_ops; extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; extern const struct nand_manufacturer_ops hynix_nand_manuf_ops; -- cgit v1.2.3 From 10d4e75c36f6c16311dde1461f318210da357219 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 8 Jun 2016 10:38:57 +0200 Subject: mtd: nand: Move Micron specific init logic in nand_micron.c Move Micron specific initialization logic into nand_micron.c. This is part of the "separate vendor specific code from core" cleanup process. 
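For context, the two fields assigned by this code are consumed by the core's read path when a page read fails ECC; roughly (a simplified sketch of the retry loop in nand_base.c, not the verbatim code):

	int retry_mode, ret;

	/* on an uncorrectable ECC error, walk the vendor-specific retry modes */
	for (retry_mode = 1; retry_mode < chip->read_retries; retry_mode++) {
		ret = chip->setup_read_retry(mtd, retry_mode);
		if (ret < 0)
			break;
		/* ... re-read the page and stop as soon as ECC passes ... */
	}

	/* always restore the default mode (0) before continuing */
	chip->setup_read_retry(mtd, 0);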
Signed-off-by: Boris Brezillon Acked-by: Richard Weinberger --- drivers/mtd/nand/Makefile | 1 + drivers/mtd/nand/nand_base.c | 32 +--------------- drivers/mtd/nand/nand_ids.c | 2 +- drivers/mtd/nand/nand_micron.c | 86 ++++++++++++++++++++++++++++++++++++++++++ include/linux/mtd/nand.h | 21 +---------- 5 files changed, 91 insertions(+), 51 deletions(-) create mode 100644 drivers/mtd/nand/nand_micron.c (limited to 'include/linux') diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile index 7c059822f479..11d743155810 100644 --- a/drivers/mtd/nand/Makefile +++ b/drivers/mtd/nand/Makefile @@ -62,5 +62,6 @@ obj-$(CONFIG_MTD_NAND_MTK) += mtk_nand.o mtk_ecc.o nand-objs := nand_base.o nand_bbt.o nand_timings.o nand_ids.o nand-objs += nand_hynix.o +nand-objs += nand_micron.o nand-objs += nand_samsung.o nand-objs += nand_toshiba.o diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 36bca97900af..f45e3bbe5f85 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -3537,30 +3537,6 @@ ext_out: return ret; } -static int nand_setup_read_retry_micron(struct mtd_info *mtd, int retry_mode) -{ - struct nand_chip *chip = mtd_to_nand(mtd); - uint8_t feature[ONFI_SUBFEATURE_PARAM_LEN] = {retry_mode}; - - return chip->onfi_set_features(mtd, chip, ONFI_FEATURE_ADDR_READ_RETRY, - feature); -} - -/* - * Configure chip properties from Micron vendor-specific ONFI table - */ -static void nand_onfi_detect_micron(struct nand_chip *chip, - struct nand_onfi_params *p) -{ - struct nand_onfi_vendor_micron *micron = (void *)p->vendor; - - if (le16_to_cpu(p->vendor_revision) < 1) - return; - - chip->read_retries = micron->read_retry_options; - chip->setup_read_retry = nand_setup_read_retry_micron; -} - /* * Check if the NAND chip is ONFI compliant, returns 1 if it is, 0 otherwise. */ @@ -3660,9 +3636,6 @@ static int nand_flash_detect_onfi(struct nand_chip *chip) pr_warn("Could not retrieve ONFI ECC requirements\n"); } - if (p->jedec_id == NAND_MFR_MICRON) - nand_onfi_detect_micron(chip, p); - return 1; } @@ -3899,9 +3872,8 @@ static void nand_decode_bbm_options(struct nand_chip *chip) * Micron devices with 2KiB pages and on SLC Samsung, Hynix, Toshiba, * AMD/Spansion, and Macronix. All others scan only the first page. 
*/
- if ((nand_is_slc(chip) &&
- (maf_id == NAND_MFR_AMD || maf_id == NAND_MFR_MACRONIX)) ||
- (mtd->writesize == 2048 && maf_id == NAND_MFR_MICRON))
+ if (nand_is_slc(chip) &&
+ (maf_id == NAND_MFR_AMD || maf_id == NAND_MFR_MACRONIX))
 chip->bbt_options |= NAND_BBT_SCAN2NDPAGE;
}
diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c
index 22ff89e9c478..42c6743401c5 100644
--- a/drivers/mtd/nand/nand_ids.c
+++ b/drivers/mtd/nand/nand_ids.c
@@ -178,7 +178,7 @@ static const struct nand_manufacturer nand_manufacturers[] = {
 {NAND_MFR_RENESAS, "Renesas"},
 {NAND_MFR_STMICRO, "ST Micro"},
 {NAND_MFR_HYNIX, "Hynix", &hynix_nand_manuf_ops},
- {NAND_MFR_MICRON, "Micron"},
+ {NAND_MFR_MICRON, "Micron", &micron_nand_manuf_ops},
 {NAND_MFR_AMD, "AMD/Spansion"},
 {NAND_MFR_MACRONIX, "Macronix"},
 {NAND_MFR_EON, "Eon"},
diff --git a/drivers/mtd/nand/nand_micron.c b/drivers/mtd/nand/nand_micron.c
new file mode 100644
index 000000000000..877011069251
--- /dev/null
+++ b/drivers/mtd/nand/nand_micron.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2017 Free Electrons
+ * Copyright (C) 2017 NextThing Co
+ *
+ * Author: Boris Brezillon
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include
+
+struct nand_onfi_vendor_micron {
+ u8 two_plane_read;
+ u8 read_cache;
+ u8 read_unique_id;
+ u8 dq_imped;
+ u8 dq_imped_num_settings;
+ u8 dq_imped_feat_addr;
+ u8 rb_pulldown_strength;
+ u8 rb_pulldown_strength_feat_addr;
+ u8 rb_pulldown_strength_num_settings;
+ u8 otp_mode;
+ u8 otp_page_start;
+ u8 otp_data_prot_addr;
+ u8 otp_num_pages;
+ u8 otp_feat_addr;
+ u8 read_retry_options;
+ u8 reserved[72];
+ u8 param_revision;
+} __packed;
+
+static int micron_nand_setup_read_retry(struct mtd_info *mtd, int retry_mode)
+{
+ struct nand_chip *chip = mtd_to_nand(mtd);
+ u8 feature[ONFI_SUBFEATURE_PARAM_LEN] = {retry_mode};
+
+ return chip->onfi_set_features(mtd, chip, ONFI_FEATURE_ADDR_READ_RETRY,
+ feature);
+}
+
+/*
+ * Configure chip properties from Micron vendor-specific ONFI table
+ */
+static int micron_nand_onfi_init(struct nand_chip *chip)
+{
+ struct nand_onfi_params *p = &chip->onfi_params;
+ struct nand_onfi_vendor_micron *micron = (void *)p->vendor;
+
+ if (!chip->onfi_version)
+ return 0;
+
+ if (le16_to_cpu(p->vendor_revision) < 1)
+ return 0;
+
+ chip->read_retries = micron->read_retry_options;
+ chip->setup_read_retry = micron_nand_setup_read_retry;
+
+ return 0;
+}
+
+static int micron_nand_init(struct nand_chip *chip)
+{
+ struct mtd_info *mtd = nand_to_mtd(chip);
+ int ret;
+
+ ret = micron_nand_onfi_init(chip);
+ if (ret)
+ return ret;
+
+ if (mtd->writesize == 2048)
+ chip->bbt_options |= NAND_BBT_SCAN2NDPAGE;
+
+ return 0;
+}
+
+const struct nand_manufacturer_ops micron_nand_manuf_ops = {
+ .init = micron_nand_init,
+};
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index dd9e3b5ddd4f..7d0f18ecbf57 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -366,26 +366,6 @@ struct onfi_ext_param_page {
 */
} __packed;
-struct nand_onfi_vendor_micron {
- u8 two_plane_read;
- u8 read_cache;
- u8
read_unique_id; - u8 dq_imped; - u8 dq_imped_num_settings; - u8 dq_imped_feat_addr; - u8 rb_pulldown_strength; - u8 rb_pulldown_strength_feat_addr; - u8 rb_pulldown_strength_num_settings; - u8 otp_mode; - u8 otp_page_start; - u8 otp_data_prot_addr; - u8 otp_num_pages; - u8 otp_feat_addr; - u8 read_retry_options; - u8 reserved[72]; - u8 param_revision; -} __packed; - struct jedec_ecc_info { u8 ecc_bits; u8 codeword_size; @@ -1117,6 +1097,7 @@ extern struct nand_flash_dev nand_flash_ids[]; extern const struct nand_manufacturer_ops toshiba_nand_manuf_ops; extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; extern const struct nand_manufacturer_ops hynix_nand_manuf_ops; +extern const struct nand_manufacturer_ops micron_nand_manuf_ops; int nand_default_bbt(struct mtd_info *mtd); int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3 From 229204da53b31d576fcc1c93a33626943ea8202c Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 8 Jun 2016 10:42:23 +0200 Subject: mtd: nand: Move AMD/Spansion specific init/detection logic in nand_amd.c Move AMD/Spansion specific initialization/detection logic into nand_amd.c. This is part of the "separate vendor specific code from core" cleanup process. Signed-off-by: Boris Brezillon Acked-by: Richard Weinberger --- drivers/mtd/nand/Makefile | 1 + drivers/mtd/nand/nand_amd.c | 51 ++++++++++++++++++++++++++++++++++++++++++++ drivers/mtd/nand/nand_base.c | 18 +--------------- drivers/mtd/nand/nand_ids.c | 2 +- include/linux/mtd/nand.h | 1 + 5 files changed, 55 insertions(+), 18 deletions(-) create mode 100644 drivers/mtd/nand/nand_amd.c (limited to 'include/linux') diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile index 11d743155810..f48ddcc132bc 100644 --- a/drivers/mtd/nand/Makefile +++ b/drivers/mtd/nand/Makefile @@ -61,6 +61,7 @@ obj-$(CONFIG_MTD_NAND_QCOM) += qcom_nandc.o obj-$(CONFIG_MTD_NAND_MTK) += mtk_nand.o mtk_ecc.o nand-objs := nand_base.o nand_bbt.o nand_timings.o nand_ids.o +nand-objs += nand_amd.o nand-objs += nand_hynix.o nand-objs += nand_micron.o nand-objs += nand_samsung.o diff --git a/drivers/mtd/nand/nand_amd.c b/drivers/mtd/nand/nand_amd.c new file mode 100644 index 000000000000..170403a3bfa8 --- /dev/null +++ b/drivers/mtd/nand/nand_amd.c @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2017 Free Electrons + * Copyright (C) 2017 NextThing Co + * + * Author: Boris Brezillon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include + +static void amd_nand_decode_id(struct nand_chip *chip) +{ + struct mtd_info *mtd = nand_to_mtd(chip); + + nand_decode_ext_id(chip); + + /* + * Check for Spansion/AMD ID + repeating 5th, 6th byte since + * some Spansion chips have erasesize that conflicts with size + * listed in nand_ids table. 
+ * Data sheet (5 byte ID): Spansion S30ML-P ORNAND (p.39) + */ + if (chip->id.data[4] != 0x00 && chip->id.data[5] == 0x00 && + chip->id.data[6] == 0x00 && chip->id.data[7] == 0x00 && + mtd->writesize == 512) { + mtd->erasesize = 128 * 1024; + mtd->erasesize <<= ((chip->id.data[3] & 0x03) << 1); + } +} + +static int amd_nand_init(struct nand_chip *chip) +{ + if (nand_is_slc(chip)) + chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; + + return 0; +} + +const struct nand_manufacturer_ops amd_nand_manuf_ops = { + .detect = amd_nand_decode_id, + .init = amd_nand_init, +}; diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index f45e3bbe5f85..a9060bcb9eb5 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -3825,8 +3825,6 @@ EXPORT_SYMBOL_GPL(nand_decode_ext_id); static void nand_decode_id(struct nand_chip *chip, struct nand_flash_dev *type) { struct mtd_info *mtd = nand_to_mtd(chip); - u8 *id_data = chip->id.data; - int maf_id = id_data[0]; mtd->erasesize = type->erasesize; mtd->writesize = type->pagesize; @@ -3834,19 +3832,6 @@ static void nand_decode_id(struct nand_chip *chip, struct nand_flash_dev *type) /* All legacy ID NAND are small-page, SLC */ chip->bits_per_cell = 1; - - /* - * Check for Spansion/AMD ID + repeating 5th, 6th byte since - * some Spansion chips have erasesize that conflicts with size - * listed in nand_ids table. - * Data sheet (5 byte ID): Spansion S30ML-P ORNAND (p.39) - */ - if (maf_id == NAND_MFR_AMD && id_data[4] != 0x00 && id_data[5] == 0x00 - && id_data[6] == 0x00 && id_data[7] == 0x00 - && mtd->writesize == 512) { - mtd->erasesize = 128 * 1024; - mtd->erasesize <<= ((id_data[3] & 0x03) << 1); - } } /* @@ -3872,8 +3857,7 @@ static void nand_decode_bbm_options(struct nand_chip *chip) * Micron devices with 2KiB pages and on SLC Samsung, Hynix, Toshiba, * AMD/Spansion, and Macronix. All others scan only the first page. 
*/ - if (nand_is_slc(chip) && - (maf_id == NAND_MFR_AMD || maf_id == NAND_MFR_MACRONIX)) + if (nand_is_slc(chip) && maf_id == NAND_MFR_MACRONIX) chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; } diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c index 42c6743401c5..af7b92e08fbd 100644 --- a/drivers/mtd/nand/nand_ids.c +++ b/drivers/mtd/nand/nand_ids.c @@ -179,7 +179,7 @@ static const struct nand_manufacturer nand_manufacturers[] = { {NAND_MFR_STMICRO, "ST Micro"}, {NAND_MFR_HYNIX, "Hynix", &hynix_nand_manuf_ops}, {NAND_MFR_MICRON, "Micron", µn_nand_manuf_ops}, - {NAND_MFR_AMD, "AMD/Spansion"}, + {NAND_MFR_AMD, "AMD/Spansion", &amd_nand_manuf_ops}, {NAND_MFR_MACRONIX, "Macronix"}, {NAND_MFR_EON, "Eon"}, {NAND_MFR_SANDISK, "SanDisk"}, diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 7d0f18ecbf57..97dce42778e9 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1098,6 +1098,7 @@ extern const struct nand_manufacturer_ops toshiba_nand_manuf_ops; extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; extern const struct nand_manufacturer_ops hynix_nand_manuf_ops; extern const struct nand_manufacturer_ops micron_nand_manuf_ops; +extern const struct nand_manufacturer_ops amd_nand_manuf_ops; int nand_default_bbt(struct mtd_info *mtd); int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3 From 3b5206f4be9b65d2f0f85b3239cf117a1d0de7ce Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Wed, 8 Jun 2016 10:43:26 +0200 Subject: mtd: nand: Move Macronix specific initialization in nand_macronix.c Move Macronix specific initialization logic into nand_macronix.c. This is part of the "separate vendor specific code from core" cleanup process. Signed-off-by: Boris Brezillon --- drivers/mtd/nand/Makefile | 1 + drivers/mtd/nand/nand_base.c | 11 ----------- drivers/mtd/nand/nand_ids.c | 2 +- drivers/mtd/nand/nand_macronix.c | 30 ++++++++++++++++++++++++++++++ include/linux/mtd/nand.h | 1 + 5 files changed, 33 insertions(+), 12 deletions(-) create mode 100644 drivers/mtd/nand/nand_macronix.c (limited to 'include/linux') diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile index f48ddcc132bc..098b8791f10a 100644 --- a/drivers/mtd/nand/Makefile +++ b/drivers/mtd/nand/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_MTD_NAND_MTK) += mtk_nand.o mtk_ecc.o nand-objs := nand_base.o nand_bbt.o nand_timings.o nand_ids.o nand-objs += nand_amd.o nand-objs += nand_hynix.o +nand-objs += nand_macronix.o nand-objs += nand_micron.o nand-objs += nand_samsung.o nand-objs += nand_toshiba.o diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index a9060bcb9eb5..685376d8ceab 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -3842,23 +3842,12 @@ static void nand_decode_id(struct nand_chip *chip, struct nand_flash_dev *type) static void nand_decode_bbm_options(struct nand_chip *chip) { struct mtd_info *mtd = nand_to_mtd(chip); - u8 *id_data = chip->id.data; - int maf_id = id_data[0]; /* Set the bad block position */ if (mtd->writesize > 512 || (chip->options & NAND_BUSWIDTH_16)) chip->badblockpos = NAND_LARGE_BADBLOCK_POS; else chip->badblockpos = NAND_SMALL_BADBLOCK_POS; - - /* - * Bad block marker is stored in the last page of each block on Samsung - * and Hynix MLC devices; stored in first two pages of each block on - * Micron devices with 2KiB pages and on SLC Samsung, Hynix, Toshiba, - * AMD/Spansion, and Macronix. All others scan only the first page. 
- */ - if (nand_is_slc(chip) && maf_id == NAND_MFR_MACRONIX) - chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; } static inline bool is_full_id_nand(struct nand_flash_dev *type) diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c index af7b92e08fbd..9d5ca0e540b5 100644 --- a/drivers/mtd/nand/nand_ids.c +++ b/drivers/mtd/nand/nand_ids.c @@ -180,7 +180,7 @@ static const struct nand_manufacturer nand_manufacturers[] = { {NAND_MFR_HYNIX, "Hynix", &hynix_nand_manuf_ops}, {NAND_MFR_MICRON, "Micron", &micron_nand_manuf_ops}, {NAND_MFR_AMD, "AMD/Spansion", &amd_nand_manuf_ops}, - {NAND_MFR_MACRONIX, "Macronix"}, + {NAND_MFR_MACRONIX, "Macronix", &macronix_nand_manuf_ops}, {NAND_MFR_EON, "Eon"}, {NAND_MFR_SANDISK, "SanDisk"}, {NAND_MFR_INTEL, "Intel"}, diff --git a/drivers/mtd/nand/nand_macronix.c b/drivers/mtd/nand/nand_macronix.c new file mode 100644 index 000000000000..84855c3e1a02 --- /dev/null +++ b/drivers/mtd/nand/nand_macronix.c @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2017 Free Electrons + * Copyright (C) 2017 NextThing Co + * + * Author: Boris Brezillon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include + +static int macronix_nand_init(struct nand_chip *chip) +{ + if (nand_is_slc(chip)) + chip->bbt_options |= NAND_BBT_SCAN2NDPAGE; + + return 0; +} + +const struct nand_manufacturer_ops macronix_nand_manuf_ops = { + .init = macronix_nand_init, +}; diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 97dce42778e9..c7de017c7f4c 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1099,6 +1099,7 @@ extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; extern const struct nand_manufacturer_ops hynix_nand_manuf_ops; extern const struct nand_manufacturer_ops micron_nand_manuf_ops; extern const struct nand_manufacturer_ops amd_nand_manuf_ops; +extern const struct nand_manufacturer_ops macronix_nand_manuf_ops; int nand_default_bbt(struct mtd_info *mtd); int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); -- cgit v1.2.3 From 4b9502e63b5e2b1b5ef491919d3219b9440fe0b3 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Wed, 8 Mar 2017 10:00:40 +0200 Subject: kernel: convert css_set.refcount from atomic_t to refcount_t The refcount_t type and the corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This helps avoid accidental refcount overflows that can lead to use-after-free situations.
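As an aside (not part of this patch), the conversion follows a fixed substitution pattern; below is a minimal sketch for a hypothetical refcounted object, using only refcount_t calls that actually appear in the diff that follows (refcount_set(), refcount_inc(), refcount_dec_and_test()). struct foo and its helpers are illustrative, not kernel code:

	#include <linux/refcount.h>
	#include <linux/slab.h>

	struct foo {
		refcount_t refcount;	/* was: atomic_t refcount */
	};

	static struct foo *foo_alloc(void)
	{
		struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

		if (f)
			refcount_set(&f->refcount, 1);	/* was: atomic_set() */
		return f;
	}

	static void foo_get(struct foo *f)
	{
		/* was: atomic_inc(); refcount_inc() saturates instead of wrapping */
		refcount_inc(&f->refcount);
	}

	static void foo_put(struct foo *f)
	{
		/* was: atomic_dec_and_test() */
		if (refcount_dec_and_test(&f->refcount))
			kfree(f);
	}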
Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 3 ++- kernel/cgroup/cgroup-internal.h | 5 +++-- kernel/cgroup/cgroup-v1.c | 4 ++-- kernel/cgroup/cgroup.c | 6 +++--- 4 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 6a3f850cabab..c74b78ecd583 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -156,7 +157,7 @@ struct css_set { struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; /* reference count */ - atomic_t refcount; + refcount_t refcount; /* the default cgroup associated with this css_set */ struct cgroup *dfl_cgrp; diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 9203bfb05603..4567f12b02e9 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -5,6 +5,7 @@ #include #include #include +#include /* * A cgroup can be associated with multiple css_sets as different tasks may @@ -134,7 +135,7 @@ static inline void put_css_set(struct css_set *cset) * can see it. Similar to atomic_dec_and_lock(), but for an * rwlock */ - if (atomic_add_unless(&cset->refcount, -1, 1)) + if (refcount_dec_not_one(&cset->refcount)) return; spin_lock_irqsave(&css_set_lock, flags); @@ -147,7 +148,7 @@ static inline void put_css_set(struct css_set *cset) */ static inline void get_css_set(struct css_set *cset) { - atomic_inc(&cset->refcount); + refcount_inc(&cset->refcount); } bool cgroup_ssid_enabled(int ssid); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 56eba9caa632..c4a68c438fde 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -346,7 +346,7 @@ static int cgroup_task_count(const struct cgroup *cgrp) spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += atomic_read(&link->cset->refcount); + count += refcount_read(&link->cset->refcount); spin_unlock_irq(&css_set_lock); return count; } @@ -1286,7 +1286,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, u64 count; rcu_read_lock(); - count = atomic_read(&task_css_set(current)->refcount); + count = refcount_read(&task_css_set(current)->refcount); rcu_read_unlock(); return count; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8ee78688e36d..b1cc1c306668 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -554,7 +554,7 @@ EXPORT_SYMBOL_GPL(of_css); * haven't been created. */ struct css_set init_css_set = { - .refcount = ATOMIC_INIT(1), + .refcount = REFCOUNT_INIT(1), .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), @@ -724,7 +724,7 @@ void put_css_set_locked(struct css_set *cset) lockdep_assert_held(&css_set_lock); - if (!atomic_dec_and_test(&cset->refcount)) + if (!refcount_dec_and_test(&cset->refcount)) return; /* This css_set is dead. 
unlink it and release cgroup and css refs */ @@ -977,7 +977,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, return NULL; } - atomic_set(&cset->refcount, 1); + refcount_set(&cset->refcount, 1); INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); INIT_LIST_HEAD(&cset->task_iters); -- cgit v1.2.3 From 915e70f9263d56fbf103742265025f7a492aa625 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Feb 2017 13:01:21 +0100 Subject: staging, android: remove lowmemory killer from the tree The lowmemory killer has been sitting in the staging tree since 2008 without any serious interest in fixing the issues brought up by the MM folks. The main objection is that the implementation is basically broken by design: - it hooks into the slab shrinker API, which is not suitable for this purpose. The lowmem_count implementation shows this nicely. There is no scaling based on the memory pressure and no feedback to the generic shrinker infrastructure. Moreover, lowmem_scan is called way too often for the heavy work it performs. - it is not reclaim context aware - no NUMA and/or memcg awareness. As the code stands right now, it just adds maintenance overhead, since core MM changes have to update lowmemorykiller.c as well. It also seems that the alternative LMK implementation will live solely in userspace, so this code has no future. The staging tree is supposed to be for code that needs to be put into shape before it can be merged, which is obviously not the case here. Signed-off-by: Michal Hocko Signed-off-by: Greg Kroah-Hartman --- drivers/staging/android/Kconfig | 10 -- drivers/staging/android/Makefile | 1 - drivers/staging/android/lowmemorykiller.c | 212 ------------------------------ include/linux/sched.h | 4 - 4 files changed, 227 deletions(-) delete mode 100644 drivers/staging/android/lowmemorykiller.c (limited to 'include/linux') diff --git a/drivers/staging/android/Kconfig b/drivers/staging/android/Kconfig index 6c00d6f765c6..71a50b99caff 100644 --- a/drivers/staging/android/Kconfig +++ b/drivers/staging/android/Kconfig @@ -14,16 +14,6 @@ config ASHMEM It is, in theory, a good memory allocator for low-memory devices, because it can discard shared memory units when under memory pressure. -config ANDROID_LOW_MEMORY_KILLER - bool "Android Low Memory Killer" - ---help--- - Registers processes to be killed when low memory conditions, this is useful - as there is no particular swap space on android. - - The registered process will kill according to the priorities in android init - scripts (/init.rc), and it defines priority values with minimum free memory size - for each priority. - source "drivers/staging/android/ion/Kconfig" endif # if ANDROID diff --git a/drivers/staging/android/Makefile b/drivers/staging/android/Makefile index 7ed1be798909..7cf1564a49a5 100644 --- a/drivers/staging/android/Makefile +++ b/drivers/staging/android/Makefile @@ -3,4 +3,3 @@ ccflags-y += -I$(src) # needed for trace events obj-y += ion/ obj-$(CONFIG_ASHMEM) += ashmem.o -obj-$(CONFIG_ANDROID_LOW_MEMORY_KILLER) += lowmemorykiller.o diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c deleted file mode 100644 index 054660049395..000000000000 --- a/drivers/staging/android/lowmemorykiller.c +++ /dev/null @@ -1,212 +0,0 @@ -/* drivers/misc/lowmemorykiller.c - * - * The lowmemorykiller driver lets user-space specify a set of memory thresholds - * where processes with a range of oom_score_adj values will get killed.
Specify - * the minimum oom_score_adj values in - * /sys/module/lowmemorykiller/parameters/adj and the number of free pages in - * /sys/module/lowmemorykiller/parameters/minfree. Both files take a comma - * separated list of numbers in ascending order. - * - * For example, write "0,8" to /sys/module/lowmemorykiller/parameters/adj and - * "1024,4096" to /sys/module/lowmemorykiller/parameters/minfree to kill - * processes with a oom_score_adj value of 8 or higher when the free memory - * drops below 4096 pages and kill processes with a oom_score_adj value of 0 or - * higher when the free memory drops below 1024 pages. - * - * The driver considers memory used for caches to be free, but if a large - * percentage of the cached memory is locked this can be very inaccurate - * and processes may not get killed until the normal oom killer is triggered. - * - * Copyright (C) 2007-2008 Google, Inc. - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static u32 lowmem_debug_level = 1; -static short lowmem_adj[6] = { - 0, - 1, - 6, - 12, -}; - -static int lowmem_adj_size = 4; -static int lowmem_minfree[6] = { - 3 * 512, /* 6MB */ - 2 * 1024, /* 8MB */ - 4 * 1024, /* 16MB */ - 16 * 1024, /* 64MB */ -}; - -static int lowmem_minfree_size = 4; - -static unsigned long lowmem_deathpending_timeout; - -#define lowmem_print(level, x...) 
\ - do { \ - if (lowmem_debug_level >= (level)) \ - pr_info(x); \ - } while (0) - -static unsigned long lowmem_count(struct shrinker *s, - struct shrink_control *sc) -{ - return global_node_page_state(NR_ACTIVE_ANON) + - global_node_page_state(NR_ACTIVE_FILE) + - global_node_page_state(NR_INACTIVE_ANON) + - global_node_page_state(NR_INACTIVE_FILE); -} - -static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc) -{ - struct task_struct *tsk; - struct task_struct *selected = NULL; - unsigned long rem = 0; - int tasksize; - int i; - short min_score_adj = OOM_SCORE_ADJ_MAX + 1; - int minfree = 0; - int selected_tasksize = 0; - short selected_oom_score_adj; - int array_size = ARRAY_SIZE(lowmem_adj); - int other_free = global_page_state(NR_FREE_PAGES) - totalreserve_pages; - int other_file = global_node_page_state(NR_FILE_PAGES) - - global_node_page_state(NR_SHMEM) - - total_swapcache_pages(); - - if (lowmem_adj_size < array_size) - array_size = lowmem_adj_size; - if (lowmem_minfree_size < array_size) - array_size = lowmem_minfree_size; - for (i = 0; i < array_size; i++) { - minfree = lowmem_minfree[i]; - if (other_free < minfree && other_file < minfree) { - min_score_adj = lowmem_adj[i]; - break; - } - } - - lowmem_print(3, "lowmem_scan %lu, %x, ofree %d %d, ma %hd\n", - sc->nr_to_scan, sc->gfp_mask, other_free, - other_file, min_score_adj); - - if (min_score_adj == OOM_SCORE_ADJ_MAX + 1) { - lowmem_print(5, "lowmem_scan %lu, %x, return 0\n", - sc->nr_to_scan, sc->gfp_mask); - return 0; - } - - selected_oom_score_adj = min_score_adj; - - rcu_read_lock(); - for_each_process(tsk) { - struct task_struct *p; - short oom_score_adj; - - if (tsk->flags & PF_KTHREAD) - continue; - - p = find_lock_task_mm(tsk); - if (!p) - continue; - - if (task_lmk_waiting(p) && - time_before_eq(jiffies, lowmem_deathpending_timeout)) { - task_unlock(p); - rcu_read_unlock(); - return 0; - } - oom_score_adj = p->signal->oom_score_adj; - if (oom_score_adj < min_score_adj) { - task_unlock(p); - continue; - } - tasksize = get_mm_rss(p->mm); - task_unlock(p); - if (tasksize <= 0) - continue; - if (selected) { - if (oom_score_adj < selected_oom_score_adj) - continue; - if (oom_score_adj == selected_oom_score_adj && - tasksize <= selected_tasksize) - continue; - } - selected = p; - selected_tasksize = tasksize; - selected_oom_score_adj = oom_score_adj; - lowmem_print(2, "select '%s' (%d), adj %hd, size %d, to kill\n", - p->comm, p->pid, oom_score_adj, tasksize); - } - if (selected) { - task_lock(selected); - send_sig(SIGKILL, selected, 0); - if (selected->mm) - task_set_lmk_waiting(selected); - task_unlock(selected); - lowmem_print(1, "Killing '%s' (%d), adj %hd,\n" - " to free %ldkB on behalf of '%s' (%d) because\n" - " cache %ldkB is below limit %ldkB for oom_score_adj %hd\n" - " Free memory is %ldkB above reserved\n", - selected->comm, selected->pid, - selected_oom_score_adj, - selected_tasksize * (long)(PAGE_SIZE / 1024), - current->comm, current->pid, - other_file * (long)(PAGE_SIZE / 1024), - minfree * (long)(PAGE_SIZE / 1024), - min_score_adj, - other_free * (long)(PAGE_SIZE / 1024)); - lowmem_deathpending_timeout = jiffies + HZ; - rem += selected_tasksize; - } - - lowmem_print(4, "lowmem_scan %lu, %x, return %lu\n", - sc->nr_to_scan, sc->gfp_mask, rem); - rcu_read_unlock(); - return rem; -} - -static struct shrinker lowmem_shrinker = { - .scan_objects = lowmem_scan, - .count_objects = lowmem_count, - .seeks = DEFAULT_SEEKS * 16 -}; - -static int __init lowmem_init(void) -{ - 
register_shrinker(&lowmem_shrinker); - return 0; -} -device_initcall(lowmem_init); - -/* - * not really modular, but the easiest way to keep compat with existing - * bootargs behaviour is to continue using module_param here. - */ -module_param_named(cost, lowmem_shrinker.seeks, int, 0644); -module_param_array_named(adj, lowmem_adj, short, &lowmem_adj_size, 0644); -module_param_array_named(minfree, lowmem_minfree, uint, &lowmem_minfree_size, - 0644); -module_param_named(debug_level, lowmem_debug_level, uint, 0644); - diff --git a/include/linux/sched.h b/include/linux/sched.h index d67eee84fd43..942c2250301b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1256,7 +1256,6 @@ extern struct pid *cad_pid; #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ #define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ -#define PFA_LMK_WAITING 3 /* Lowmemorykiller is waiting */ #define TASK_PFA_TEST(name, func) \ @@ -1282,9 +1281,6 @@ TASK_PFA_TEST(SPREAD_SLAB, spread_slab) TASK_PFA_SET(SPREAD_SLAB, spread_slab) TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) -TASK_PFA_TEST(LMK_WAITING, lmk_waiting) -TASK_PFA_SET(LMK_WAITING, lmk_waiting) - static inline void tsk_restore_flags(struct task_struct *task, unsigned long orig_flags, unsigned long flags) { -- cgit v1.2.3 From 967c9cca2cc50569efc65945325c173cecba83bd Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Wed, 11 Mar 2015 14:39:39 +0100 Subject: tee: generic TEE subsystem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Initial patch for generic TEE subsystem. This subsystem provides: * Registration/un-registration of TEE drivers. * Shared memory between normal world and secure world. * Ioctl interface for interaction with user space. * Sysfs implementation_id of TEE driver A TEE (Trusted Execution Environment) driver is a driver that interfaces with a trusted OS running in some secure environment, for example, TrustZone on ARM CPUs, or a separate secure co-processor, etc. The TEE subsystem can serve a TEE driver for a Global Platform compliant TEE, but it's not limited to only Global Platform TEEs.
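To make the driver-facing side of this concrete, here is a minimal sketch (not part of this patch) of how a specific TEE driver could plug into the subsystem. tee_device_alloc(), tee_device_register(), struct tee_desc and struct tee_driver_ops are all introduced by the patch below; the my_tee_* names, the trivial ops bodies and the probe signature are hypothetical, and a real driver would first create a pool, e.g. with tee_shm_pool_alloc_res_mem():

	#include <linux/err.h>
	#include <linux/module.h>
	#include <linux/tee_drv.h>

	static void my_tee_get_version(struct tee_device *teedev,
				       struct tee_ioctl_version_data *vers)
	{
		vers->impl_id = 42;	/* hypothetical implementation id */
	}

	static int my_tee_open(struct tee_context *ctx)
	{
		return 0;
	}

	static void my_tee_release(struct tee_context *ctx)
	{
	}

	/* get_version, open and release are the ops tee_device_alloc() requires */
	static const struct tee_driver_ops my_tee_ops = {
		.get_version = my_tee_get_version,
		.open = my_tee_open,
		.release = my_tee_release,
	};

	static const struct tee_desc my_tee_desc = {
		.name = "my-tee",
		.ops = &my_tee_ops,
		.owner = THIS_MODULE,
	};

	static int my_tee_probe(struct device *dev, struct tee_shm_pool *pool)
	{
		struct tee_device *teedev;

		teedev = tee_device_alloc(&my_tee_desc, dev, pool, NULL);
		if (IS_ERR(teedev))
			return PTR_ERR(teedev);

		return tee_device_register(teedev);
	}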
This patch builds on other similar implementations trying to solve the same problem: * "optee_linuxdriver" by among others Jean-michel DELORME and Emmanuel MICHEL * "Generic TrustZone Driver" by Javier González Acked-by: Andreas Dannenberg Tested-by: Jerome Forissier (HiKey) Tested-by: Volodymyr Babchuk (RCAR H3) Tested-by: Scott Branden Reviewed-by: Javier González Signed-off-by: Jens Wiklander --- Documentation/ioctl/ioctl-number.txt | 1 + MAINTAINERS | 7 + drivers/Kconfig | 2 + drivers/Makefile | 1 + drivers/tee/Kconfig | 8 + drivers/tee/Makefile | 4 + drivers/tee/tee_core.c | 893 +++++++++++++++++++++++++++++++++++ drivers/tee/tee_private.h | 129 +++++ drivers/tee/tee_shm.c | 358 ++++++++++++++ drivers/tee/tee_shm_pool.c | 156 ++++++ include/linux/tee_drv.h | 277 +++++++++++ include/uapi/linux/tee.h | 346 ++++++++++++++ 12 files changed, 2182 insertions(+) create mode 100644 drivers/tee/Kconfig create mode 100644 drivers/tee/Makefile create mode 100644 drivers/tee/tee_core.c create mode 100644 drivers/tee/tee_private.h create mode 100644 drivers/tee/tee_shm.c create mode 100644 drivers/tee/tee_shm_pool.c create mode 100644 include/linux/tee_drv.h create mode 100644 include/uapi/linux/tee.h (limited to 'include/linux') diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 08244bea5048..002331b0b48a 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -308,6 +308,7 @@ Code Seq#(hex) Include File Comments 0xA3 80-8F Port ACL in development: 0xA3 90-9F linux/dtlk.h +0xA4 00-1F uapi/linux/tee.h Generic TEE subsystem 0xAA 00-3F linux/uapi/linux/userfaultfd.h 0xAB 00-1F linux/nbd.h 0xAC 00-1F linux/raw.h diff --git a/MAINTAINERS b/MAINTAINERS index c265a5fe4848..017521958c86 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11086,6 +11086,13 @@ F: drivers/hwtracing/stm/ F: include/linux/stm.h F: include/uapi/linux/stm.h +TEE SUBSYSTEM +M: Jens Wiklander +S: Maintained +F: include/linux/tee_drv.h +F: include/uapi/linux/tee.h +F: drivers/tee/ + THUNDERBOLT DRIVER M: Andreas Noever S: Maintained diff --git a/drivers/Kconfig b/drivers/Kconfig index 117ca14ccf85..ba2901e76769 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -204,4 +204,6 @@ source "drivers/fpga/Kconfig" source "drivers/fsi/Kconfig" +source "drivers/tee/Kconfig" + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 2eced9afba53..5db9aa6beeaf 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -177,3 +177,4 @@ obj-$(CONFIG_ANDROID) += android/ obj-$(CONFIG_NVMEM) += nvmem/ obj-$(CONFIG_FPGA) += fpga/ obj-$(CONFIG_FSI) += fsi/ +obj-$(CONFIG_TEE) += tee/ diff --git a/drivers/tee/Kconfig b/drivers/tee/Kconfig new file mode 100644 index 000000000000..50c244ead46d --- /dev/null +++ b/drivers/tee/Kconfig @@ -0,0 +1,8 @@ +# Generic Trusted Execution Environment Configuration +config TEE + tristate "Trusted Execution Environment support" + select DMA_SHARED_BUFFER + select GENERIC_ALLOCATOR + help + This implements a generic interface towards a Trusted Execution + Environment (TEE). 
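For the user-space side of the new ioctl interface, the following sketch (not part of this patch) allocates shared memory through TEE_IOC_SHM_ALLOC and maps the dma-buf file descriptor the ioctl returns. TEE_IOC_SHM_ALLOC and the size/flags/id fields of struct tee_ioctl_shm_alloc_data appear in tee_core.c below; the /dev/tee0 node name follows the "tee%d" naming used by tee_device_alloc(), and the error handling is deliberately minimal:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <unistd.h>
	#include <linux/tee.h>

	int main(void)
	{
		/* flags must be zero: no input flags are currently supported */
		struct tee_ioctl_shm_alloc_data data = { .size = 4096 };
		int fd, shm_fd;
		void *p;

		fd = open("/dev/tee0", O_RDWR);
		if (fd < 0)
			return 1;

		/* On success the ioctl returns a dma-buf file descriptor. */
		shm_fd = ioctl(fd, TEE_IOC_SHM_ALLOC, &data);
		if (shm_fd < 0)
			return 1;

		p = mmap(NULL, data.size, PROT_READ | PROT_WRITE, MAP_SHARED,
			 shm_fd, 0);
		if (p != MAP_FAILED)
			printf("shm id %d mapped at %p\n", data.id, p);

		close(shm_fd);
		close(fd);
		return 0;
	}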
diff --git a/drivers/tee/Makefile b/drivers/tee/Makefile new file mode 100644 index 000000000000..ec64047a86e2 --- /dev/null +++ b/drivers/tee/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_TEE) += tee.o +tee-objs += tee_core.o +tee-objs += tee_shm.o +tee-objs += tee_shm_pool.o diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c new file mode 100644 index 000000000000..5c60bf4423e6 --- /dev/null +++ b/drivers/tee/tee_core.c @@ -0,0 +1,893 @@ +/* + * Copyright (c) 2015-2016, Linaro Limited + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#define pr_fmt(fmt) "%s: " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "tee_private.h" + +#define TEE_NUM_DEVICES 32 + +#define TEE_IOCTL_PARAM_SIZE(x) (sizeof(struct tee_param) * (x)) + +/* + * Unprivileged devices in the lower half range and privileged devices in + * the upper half range. + */ +static DECLARE_BITMAP(dev_mask, TEE_NUM_DEVICES); +static DEFINE_SPINLOCK(driver_lock); + +static struct class *tee_class; +static dev_t tee_devt; + +static int tee_open(struct inode *inode, struct file *filp) +{ + int rc; + struct tee_device *teedev; + struct tee_context *ctx; + + teedev = container_of(inode->i_cdev, struct tee_device, cdev); + if (!tee_device_get(teedev)) + return -EINVAL; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + rc = -ENOMEM; + goto err; + } + + ctx->teedev = teedev; + INIT_LIST_HEAD(&ctx->list_shm); + filp->private_data = ctx; + rc = teedev->desc->ops->open(ctx); + if (rc) + goto err; + + return 0; +err: + kfree(ctx); + tee_device_put(teedev); + return rc; +} + +static int tee_release(struct inode *inode, struct file *filp) +{ + struct tee_context *ctx = filp->private_data; + struct tee_device *teedev = ctx->teedev; + struct tee_shm *shm; + + ctx->teedev->desc->ops->release(ctx); + mutex_lock(&ctx->teedev->mutex); + list_for_each_entry(shm, &ctx->list_shm, link) + shm->ctx = NULL; + mutex_unlock(&ctx->teedev->mutex); + kfree(ctx); + tee_device_put(teedev); + return 0; +} + +static int tee_ioctl_version(struct tee_context *ctx, + struct tee_ioctl_version_data __user *uvers) +{ + struct tee_ioctl_version_data vers; + + ctx->teedev->desc->ops->get_version(ctx->teedev, &vers); + if (copy_to_user(uvers, &vers, sizeof(vers))) + return -EFAULT; + return 0; +} + +static int tee_ioctl_shm_alloc(struct tee_context *ctx, + struct tee_ioctl_shm_alloc_data __user *udata) +{ + long ret; + struct tee_ioctl_shm_alloc_data data; + struct tee_shm *shm; + + if (copy_from_user(&data, udata, sizeof(data))) + return -EFAULT; + + /* Currently no input flags are supported */ + if (data.flags) + return -EINVAL; + + data.id = -1; + + shm = tee_shm_alloc(ctx, data.size, TEE_SHM_MAPPED | TEE_SHM_DMA_BUF); + if (IS_ERR(shm)) + return PTR_ERR(shm); + + data.id = shm->id; + data.flags = shm->flags; + data.size = shm->size; + + if (copy_to_user(udata, &data, sizeof(data))) + ret = -EFAULT; + else + ret = tee_shm_get_fd(shm); + + /* + * When user space closes the file descriptor the shared memory + * should be freed or if tee_shm_get_fd() failed then it will + * be freed 
immediately. + */ + tee_shm_put(shm); + return ret; +} + +static int params_from_user(struct tee_context *ctx, struct tee_param *params, + size_t num_params, + struct tee_ioctl_param __user *uparams) +{ + size_t n; + + for (n = 0; n < num_params; n++) { + struct tee_shm *shm; + struct tee_ioctl_param ip; + + if (copy_from_user(&ip, uparams + n, sizeof(ip))) + return -EFAULT; + + /* All unused attribute bits have to be zero */ + if (ip.attr & ~TEE_IOCTL_PARAM_ATTR_TYPE_MASK) + return -EINVAL; + + params[n].attr = ip.attr; + switch (ip.attr) { + case TEE_IOCTL_PARAM_ATTR_TYPE_NONE: + case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT: + break; + case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT: + params[n].u.value.a = ip.a; + params[n].u.value.b = ip.b; + params[n].u.value.c = ip.c; + break; + case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: + /* + * If we fail to get a pointer to a shared memory + * object (and increase the ref count) from an + * identifier we return an error. All pointers that + * have been added to params have an increased ref + * count. It's the caller's responsibility to do + * tee_shm_put() on all resolved pointers. + */ + shm = tee_shm_get_from_id(ctx, ip.c); + if (IS_ERR(shm)) + return PTR_ERR(shm); + + params[n].u.memref.shm_offs = ip.a; + params[n].u.memref.size = ip.b; + params[n].u.memref.shm = shm; + break; + default: + /* Unknown attribute */ + return -EINVAL; + } + } + return 0; +} + +static int params_to_user(struct tee_ioctl_param __user *uparams, + size_t num_params, struct tee_param *params) +{ + size_t n; + + for (n = 0; n < num_params; n++) { + struct tee_ioctl_param __user *up = uparams + n; + struct tee_param *p = params + n; + + switch (p->attr) { + case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT: + if (put_user(p->u.value.a, &up->a) || + put_user(p->u.value.b, &up->b) || + put_user(p->u.value.c, &up->c)) + return -EFAULT; + break; + case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: + if (put_user((u64)p->u.memref.size, &up->b)) + return -EFAULT; + default: + break; + } + } + return 0; +} + +static bool param_is_memref(struct tee_param *param) +{ + switch (param->attr & TEE_IOCTL_PARAM_ATTR_TYPE_MASK) { + case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: + return true; + default: + return false; + } +} + +static int tee_ioctl_open_session(struct tee_context *ctx, + struct tee_ioctl_buf_data __user *ubuf) +{ + int rc; + size_t n; + struct tee_ioctl_buf_data buf; + struct tee_ioctl_open_session_arg __user *uarg; + struct tee_ioctl_open_session_arg arg; + struct tee_ioctl_param __user *uparams = NULL; + struct tee_param *params = NULL; + bool have_session = false; + + if (!ctx->teedev->desc->ops->open_session) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, sizeof(buf))) + return -EFAULT; + + if (buf.buf_len > TEE_MAX_ARG_SIZE || + buf.buf_len < sizeof(struct tee_ioctl_open_session_arg)) + return -EINVAL; + + uarg = u64_to_user_ptr(buf.buf_ptr); + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (sizeof(arg) + TEE_IOCTL_PARAM_SIZE(arg.num_params) != buf.buf_len) + return -EINVAL; + + if (arg.num_params) { + params = kcalloc(arg.num_params, sizeof(struct tee_param), + GFP_KERNEL); + if (!params) + return -ENOMEM; + uparams = uarg->params; +
rc = params_from_user(ctx, params, arg.num_params, uparams); + if (rc) + goto out; + } + + rc = ctx->teedev->desc->ops->open_session(ctx, &arg, params); + if (rc) + goto out; + have_session = true; + + if (put_user(arg.session, &uarg->session) || + put_user(arg.ret, &uarg->ret) || + put_user(arg.ret_origin, &uarg->ret_origin)) { + rc = -EFAULT; + goto out; + } + rc = params_to_user(uparams, arg.num_params, params); +out: + /* + * If we've succeeded in opening the session but failed to communicate + * it back to user space, close the session again to avoid leakage. + */ + if (rc && have_session && ctx->teedev->desc->ops->close_session) + ctx->teedev->desc->ops->close_session(ctx, arg.session); + + if (params) { + /* Decrease ref count for all valid shared memory pointers */ + for (n = 0; n < arg.num_params; n++) + if (param_is_memref(params + n) && + params[n].u.memref.shm) + tee_shm_put(params[n].u.memref.shm); + kfree(params); + } + + return rc; +} + +static int tee_ioctl_invoke(struct tee_context *ctx, + struct tee_ioctl_buf_data __user *ubuf) +{ + int rc; + size_t n; + struct tee_ioctl_buf_data buf; + struct tee_ioctl_invoke_arg __user *uarg; + struct tee_ioctl_invoke_arg arg; + struct tee_ioctl_param __user *uparams = NULL; + struct tee_param *params = NULL; + + if (!ctx->teedev->desc->ops->invoke_func) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, sizeof(buf))) + return -EFAULT; + + if (buf.buf_len > TEE_MAX_ARG_SIZE || + buf.buf_len < sizeof(struct tee_ioctl_invoke_arg)) + return -EINVAL; + + uarg = u64_to_user_ptr(buf.buf_ptr); + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (sizeof(arg) + TEE_IOCTL_PARAM_SIZE(arg.num_params) != buf.buf_len) + return -EINVAL; + + if (arg.num_params) { + params = kcalloc(arg.num_params, sizeof(struct tee_param), + GFP_KERNEL); + if (!params) + return -ENOMEM; + uparams = uarg->params; + rc = params_from_user(ctx, params, arg.num_params, uparams); + if (rc) + goto out; + } + + rc = ctx->teedev->desc->ops->invoke_func(ctx, &arg, params); + if (rc) + goto out; + + if (put_user(arg.ret, &uarg->ret) || + put_user(arg.ret_origin, &uarg->ret_origin)) { + rc = -EFAULT; + goto out; + } + rc = params_to_user(uparams, arg.num_params, params); +out: + if (params) { + /* Decrease ref count for all valid shared memory pointers */ + for (n = 0; n < arg.num_params; n++) + if (param_is_memref(params + n) && + params[n].u.memref.shm) + tee_shm_put(params[n].u.memref.shm); + kfree(params); + } + return rc; +} + +static int tee_ioctl_cancel(struct tee_context *ctx, + struct tee_ioctl_cancel_arg __user *uarg) +{ + struct tee_ioctl_cancel_arg arg; + + if (!ctx->teedev->desc->ops->cancel_req) + return -EINVAL; + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + return ctx->teedev->desc->ops->cancel_req(ctx, arg.cancel_id, + arg.session); +} + +static int +tee_ioctl_close_session(struct tee_context *ctx, + struct tee_ioctl_close_session_arg __user *uarg) +{ + struct tee_ioctl_close_session_arg arg; + + if (!ctx->teedev->desc->ops->close_session) + return -EINVAL; + + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + return ctx->teedev->desc->ops->close_session(ctx, arg.session); +} + +static int params_to_supp(struct tee_context *ctx, + struct tee_ioctl_param __user *uparams, + size_t num_params, struct tee_param *params) +{ + size_t n; + + for (n = 0; n < num_params; n++) { + struct tee_ioctl_param ip; + struct tee_param *p = params + n; + + ip.attr = p->attr & TEE_IOCTL_PARAM_ATTR_TYPE_MASK; + switch
(p->attr) { + case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT: + ip.a = p->u.value.a; + ip.b = p->u.value.b; + ip.c = p->u.value.c; + break; + case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: + ip.b = p->u.memref.size; + if (!p->u.memref.shm) { + ip.a = 0; + ip.c = (u64)-1; /* invalid shm id */ + break; + } + ip.a = p->u.memref.shm_offs; + ip.c = p->u.memref.shm->id; + break; + default: + ip.a = 0; + ip.b = 0; + ip.c = 0; + break; + } + + if (copy_to_user(uparams + n, &ip, sizeof(ip))) + return -EFAULT; + } + + return 0; +} + +static int tee_ioctl_supp_recv(struct tee_context *ctx, + struct tee_ioctl_buf_data __user *ubuf) +{ + int rc; + struct tee_ioctl_buf_data buf; + struct tee_iocl_supp_recv_arg __user *uarg; + struct tee_param *params; + u32 num_params; + u32 func; + + if (!ctx->teedev->desc->ops->supp_recv) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, sizeof(buf))) + return -EFAULT; + + if (buf.buf_len > TEE_MAX_ARG_SIZE || + buf.buf_len < sizeof(struct tee_iocl_supp_recv_arg)) + return -EINVAL; + + uarg = u64_to_user_ptr(buf.buf_ptr); + if (get_user(num_params, &uarg->num_params)) + return -EFAULT; + + if (sizeof(*uarg) + TEE_IOCTL_PARAM_SIZE(num_params) != buf.buf_len) + return -EINVAL; + + params = kcalloc(num_params, sizeof(struct tee_param), GFP_KERNEL); + if (!params) + return -ENOMEM; + + rc = ctx->teedev->desc->ops->supp_recv(ctx, &func, &num_params, params); + if (rc) + goto out; + + if (put_user(func, &uarg->func) || + put_user(num_params, &uarg->num_params)) { + rc = -EFAULT; + goto out; + } + + rc = params_to_supp(ctx, uarg->params, num_params, params); +out: + kfree(params); + return rc; +} + +static int params_from_supp(struct tee_param *params, size_t num_params, + struct tee_ioctl_param __user *uparams) +{ + size_t n; + + for (n = 0; n < num_params; n++) { + struct tee_param *p = params + n; + struct tee_ioctl_param ip; + + if (copy_from_user(&ip, uparams + n, sizeof(ip))) + return -EFAULT; + + /* All unused attribute bits have to be zero */ + if (ip.attr & ~TEE_IOCTL_PARAM_ATTR_TYPE_MASK) + return -EINVAL; + + p->attr = ip.attr; + switch (ip.attr) { + case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT: + /* Only out and in/out values can be updated */ + p->u.value.a = ip.a; + p->u.value.b = ip.b; + p->u.value.c = ip.c; + break; + case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: + /* + * Only the size of the memref can be updated. + * Since we don't have access to the original + * parameters here, only store the supplied size. + * The driver will copy the updated size into the + * original parameters.
+ */ + p->u.memref.shm = NULL; + p->u.memref.shm_offs = 0; + p->u.memref.size = ip.b; + break; + default: + memset(&p->u, 0, sizeof(p->u)); + break; + } + } + return 0; +} + +static int tee_ioctl_supp_send(struct tee_context *ctx, + struct tee_ioctl_buf_data __user *ubuf) +{ + long rc; + struct tee_ioctl_buf_data buf; + struct tee_iocl_supp_send_arg __user *uarg; + struct tee_param *params; + u32 num_params; + u32 ret; + + /* Not valid for this driver */ + if (!ctx->teedev->desc->ops->supp_send) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, sizeof(buf))) + return -EFAULT; + + if (buf.buf_len > TEE_MAX_ARG_SIZE || + buf.buf_len < sizeof(struct tee_iocl_supp_send_arg)) + return -EINVAL; + + uarg = u64_to_user_ptr(buf.buf_ptr); + if (get_user(ret, &uarg->ret) || + get_user(num_params, &uarg->num_params)) + return -EFAULT; + + if (sizeof(*uarg) + TEE_IOCTL_PARAM_SIZE(num_params) > buf.buf_len) + return -EINVAL; + + params = kcalloc(num_params, sizeof(struct tee_param), GFP_KERNEL); + if (!params) + return -ENOMEM; + + rc = params_from_supp(params, num_params, uarg->params); + if (rc) + goto out; + + rc = ctx->teedev->desc->ops->supp_send(ctx, ret, num_params, params); +out: + kfree(params); + return rc; +} + +static long tee_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct tee_context *ctx = filp->private_data; + void __user *uarg = (void __user *)arg; + + switch (cmd) { + case TEE_IOC_VERSION: + return tee_ioctl_version(ctx, uarg); + case TEE_IOC_SHM_ALLOC: + return tee_ioctl_shm_alloc(ctx, uarg); + case TEE_IOC_OPEN_SESSION: + return tee_ioctl_open_session(ctx, uarg); + case TEE_IOC_INVOKE: + return tee_ioctl_invoke(ctx, uarg); + case TEE_IOC_CANCEL: + return tee_ioctl_cancel(ctx, uarg); + case TEE_IOC_CLOSE_SESSION: + return tee_ioctl_close_session(ctx, uarg); + case TEE_IOC_SUPPL_RECV: + return tee_ioctl_supp_recv(ctx, uarg); + case TEE_IOC_SUPPL_SEND: + return tee_ioctl_supp_send(ctx, uarg); + default: + return -EINVAL; + } +} + +static const struct file_operations tee_fops = { + .owner = THIS_MODULE, + .open = tee_open, + .release = tee_release, + .unlocked_ioctl = tee_ioctl, + .compat_ioctl = tee_ioctl, +}; + +static void tee_release_device(struct device *dev) +{ + struct tee_device *teedev = container_of(dev, struct tee_device, dev); + + spin_lock(&driver_lock); + clear_bit(teedev->id, dev_mask); + spin_unlock(&driver_lock); + mutex_destroy(&teedev->mutex); + idr_destroy(&teedev->idr); + kfree(teedev); +} + +/** + * tee_device_alloc() - Allocate a new struct tee_device instance + * @teedesc: Descriptor for this driver + * @dev: Parent device for this device + * @pool: Shared memory pool, NULL if not used + * @driver_data: Private driver data for this device + * + * Allocates a new struct tee_device instance. The device is + * removed by tee_device_unregister(). 
+ * + * @returns a pointer to a 'struct tee_device' or an ERR_PTR on failure + */ +struct tee_device *tee_device_alloc(const struct tee_desc *teedesc, + struct device *dev, + struct tee_shm_pool *pool, + void *driver_data) +{ + struct tee_device *teedev; + void *ret; + int rc; + int offs = 0; + + if (!teedesc || !teedesc->name || !teedesc->ops || + !teedesc->ops->get_version || !teedesc->ops->open || + !teedesc->ops->release || !pool) + return ERR_PTR(-EINVAL); + + teedev = kzalloc(sizeof(*teedev), GFP_KERNEL); + if (!teedev) { + ret = ERR_PTR(-ENOMEM); + goto err; + } + + if (teedesc->flags & TEE_DESC_PRIVILEGED) + offs = TEE_NUM_DEVICES / 2; + + spin_lock(&driver_lock); + teedev->id = find_next_zero_bit(dev_mask, TEE_NUM_DEVICES, offs); + if (teedev->id < TEE_NUM_DEVICES) + set_bit(teedev->id, dev_mask); + spin_unlock(&driver_lock); + + if (teedev->id >= TEE_NUM_DEVICES) { + ret = ERR_PTR(-ENOMEM); + goto err; + } + + snprintf(teedev->name, sizeof(teedev->name), "tee%s%d", + teedesc->flags & TEE_DESC_PRIVILEGED ? "priv" : "", + teedev->id - offs); + + teedev->dev.class = tee_class; + teedev->dev.release = tee_release_device; + teedev->dev.parent = dev; + + teedev->dev.devt = MKDEV(MAJOR(tee_devt), teedev->id); + + rc = dev_set_name(&teedev->dev, "%s", teedev->name); + if (rc) { + ret = ERR_PTR(rc); + goto err_devt; + } + + cdev_init(&teedev->cdev, &tee_fops); + teedev->cdev.owner = teedesc->owner; + teedev->cdev.kobj.parent = &teedev->dev.kobj; + + dev_set_drvdata(&teedev->dev, driver_data); + device_initialize(&teedev->dev); + + /* 1 as tee_device_unregister() does one final tee_device_put() */ + teedev->num_users = 1; + init_completion(&teedev->c_no_users); + mutex_init(&teedev->mutex); + idr_init(&teedev->idr); + + teedev->desc = teedesc; + teedev->pool = pool; + + return teedev; +err_devt: + unregister_chrdev_region(teedev->dev.devt, 1); +err: + pr_err("could not register %s driver\n", + teedesc->flags & TEE_DESC_PRIVILEGED ? "privileged" : "client"); + if (teedev && teedev->id < TEE_NUM_DEVICES) { + spin_lock(&driver_lock); + clear_bit(teedev->id, dev_mask); + spin_unlock(&driver_lock); + } + kfree(teedev); + return ret; +} +EXPORT_SYMBOL_GPL(tee_device_alloc); + +static ssize_t implementation_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct tee_device *teedev = container_of(dev, struct tee_device, dev); + struct tee_ioctl_version_data vers; + + teedev->desc->ops->get_version(teedev, &vers); + return scnprintf(buf, PAGE_SIZE, "%d\n", vers.impl_id); +} +static DEVICE_ATTR_RO(implementation_id); + +static struct attribute *tee_dev_attrs[] = { + &dev_attr_implementation_id.attr, + NULL +}; + +static const struct attribute_group tee_dev_group = { + .attrs = tee_dev_attrs, +}; + +/** + * tee_device_register() - Registers a TEE device + * @teedev: Device to register + * + * tee_device_unregister() needs to be called to remove the @teedev if + * this function fails.
+ * + * @returns < 0 on failure + */ +int tee_device_register(struct tee_device *teedev) +{ + int rc; + + if (teedev->flags & TEE_DEVICE_FLAG_REGISTERED) { + dev_err(&teedev->dev, "attempt to register twice\n"); + return -EINVAL; + } + + rc = cdev_add(&teedev->cdev, teedev->dev.devt, 1); + if (rc) { + dev_err(&teedev->dev, + "unable to cdev_add() %s, major %d, minor %d, err=%d\n", + teedev->name, MAJOR(teedev->dev.devt), + MINOR(teedev->dev.devt), rc); + return rc; + } + + rc = device_add(&teedev->dev); + if (rc) { + dev_err(&teedev->dev, + "unable to device_add() %s, major %d, minor %d, err=%d\n", + teedev->name, MAJOR(teedev->dev.devt), + MINOR(teedev->dev.devt), rc); + goto err_device_add; + } + + rc = sysfs_create_group(&teedev->dev.kobj, &tee_dev_group); + if (rc) { + dev_err(&teedev->dev, + "failed to create sysfs attributes, err=%d\n", rc); + goto err_sysfs_create_group; + } + + teedev->flags |= TEE_DEVICE_FLAG_REGISTERED; + return 0; + +err_sysfs_create_group: + device_del(&teedev->dev); +err_device_add: + cdev_del(&teedev->cdev); + return rc; +} +EXPORT_SYMBOL_GPL(tee_device_register); + +void tee_device_put(struct tee_device *teedev) +{ + mutex_lock(&teedev->mutex); + /* Shouldn't put in this state */ + if (!WARN_ON(!teedev->desc)) { + teedev->num_users--; + if (!teedev->num_users) { + teedev->desc = NULL; + complete(&teedev->c_no_users); + } + } + mutex_unlock(&teedev->mutex); +} + +bool tee_device_get(struct tee_device *teedev) +{ + mutex_lock(&teedev->mutex); + if (!teedev->desc) { + mutex_unlock(&teedev->mutex); + return false; + } + teedev->num_users++; + mutex_unlock(&teedev->mutex); + return true; +} + +/** + * tee_device_unregister() - Removes a TEE device + * @teedev: Device to unregister + * + * This function should be called to remove the @teedev even if + * tee_device_register() hasn't been called yet. Does nothing if + * @teedev is NULL. + */ +void tee_device_unregister(struct tee_device *teedev) +{ + if (!teedev) + return; + + if (teedev->flags & TEE_DEVICE_FLAG_REGISTERED) { + sysfs_remove_group(&teedev->dev.kobj, &tee_dev_group); + cdev_del(&teedev->cdev); + device_del(&teedev->dev); + } + + tee_device_put(teedev); + wait_for_completion(&teedev->c_no_users); + + /* + * No need to take a mutex any longer now since teedev->desc was + * set to NULL before teedev->c_no_users was completed. + */ + + teedev->pool = NULL; + + put_device(&teedev->dev); +} +EXPORT_SYMBOL_GPL(tee_device_unregister); + +/** + * tee_get_drvdata() - Return driver_data pointer + * @teedev: Device containing the driver_data pointer + * @returns the driver_data pointer supplied to tee_device_alloc().
+ */ +void *tee_get_drvdata(struct tee_device *teedev) +{ + return dev_get_drvdata(&teedev->dev); +} +EXPORT_SYMBOL_GPL(tee_get_drvdata); + +static int __init tee_init(void) +{ + int rc; + + tee_class = class_create(THIS_MODULE, "tee"); + if (IS_ERR(tee_class)) { + pr_err("couldn't create class\n"); + return PTR_ERR(tee_class); + } + + rc = alloc_chrdev_region(&tee_devt, 0, TEE_NUM_DEVICES, "tee"); + if (rc) { + pr_err("failed to allocate char dev region\n"); + class_destroy(tee_class); + tee_class = NULL; + } + + return rc; +} + +static void __exit tee_exit(void) +{ + class_destroy(tee_class); + tee_class = NULL; + unregister_chrdev_region(tee_devt, TEE_NUM_DEVICES); +} + +subsys_initcall(tee_init); +module_exit(tee_exit); + +MODULE_AUTHOR("Linaro"); +MODULE_DESCRIPTION("TEE Driver"); +MODULE_VERSION("1.0"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/tee/tee_private.h b/drivers/tee/tee_private.h new file mode 100644 index 000000000000..21cb6be8bce9 --- /dev/null +++ b/drivers/tee/tee_private.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2015-2016, Linaro Limited + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#ifndef TEE_PRIVATE_H +#define TEE_PRIVATE_H + +#include +#include +#include +#include +#include +#include + +struct tee_device; + +/** + * struct tee_shm - shared memory object + * @teedev: device used to allocate the object + * @ctx: context using the object, if NULL the context is gone + * @link: link element + * @paddr: physical address of the shared memory + * @kaddr: virtual address of the shared memory + * @size: size of shared memory + * @dmabuf: dmabuf used for exporting to user space + * @flags: defined by TEE_SHM_* in tee_drv.h + * @id: unique id of a shared memory object on this device + */ +struct tee_shm { + struct tee_device *teedev; + struct tee_context *ctx; + struct list_head link; + phys_addr_t paddr; + void *kaddr; + size_t size; + struct dma_buf *dmabuf; + u32 flags; + int id; +}; + +struct tee_shm_pool_mgr; + +/** + * struct tee_shm_pool_mgr_ops - shared memory pool manager operations + * @alloc: called when allocating shared memory + * @free: called when freeing shared memory + */ +struct tee_shm_pool_mgr_ops { + int (*alloc)(struct tee_shm_pool_mgr *poolmgr, struct tee_shm *shm, + size_t size); + void (*free)(struct tee_shm_pool_mgr *poolmgr, struct tee_shm *shm); +}; + +/** + * struct tee_shm_pool_mgr - shared memory manager + * @ops: operations + * @private_data: private data for the shared memory manager + */ +struct tee_shm_pool_mgr { + const struct tee_shm_pool_mgr_ops *ops; + void *private_data; +}; + +/** + * struct tee_shm_pool - shared memory pool + * @private_mgr: pool manager for shared memory only between kernel + * and secure world + * @dma_buf_mgr: pool manager for shared memory exported to user space + * @destroy: called when destroying the pool + * @private_data: private data for the pool + */ +struct tee_shm_pool { + struct tee_shm_pool_mgr private_mgr; + struct tee_shm_pool_mgr dma_buf_mgr; + void (*destroy)(struct tee_shm_pool *pool); + void *private_data; +}; + +#define TEE_DEVICE_FLAG_REGISTERED 0x1 +#define
TEE_MAX_DEV_NAME_LEN 32 + +/** + * struct tee_device - TEE Device representation + * @name: name of device + * @desc: description of device + * @id: unique id of device + * @flags: represented by TEE_DEVICE_FLAG_REGISTERED above + * @dev: embedded basic device structure + * @cdev: embedded cdev + * @num_users: number of active users of this device + * @c_no_users: completion used when unregistering the device + * @mutex: mutex protecting @num_users and @idr + * @idr: register of shared memory objects allocated on this device + * @pool: shared memory pool + */ +struct tee_device { + char name[TEE_MAX_DEV_NAME_LEN]; + const struct tee_desc *desc; + int id; + unsigned int flags; + + struct device dev; + struct cdev cdev; + + size_t num_users; + struct completion c_no_users; + struct mutex mutex; /* protects num_users and idr */ + + struct idr idr; + struct tee_shm_pool *pool; +}; + +int tee_shm_init(void); + +int tee_shm_get_fd(struct tee_shm *shm); + +bool tee_device_get(struct tee_device *teedev); +void tee_device_put(struct tee_device *teedev); + +#endif /*TEE_PRIVATE_H*/ diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c new file mode 100644 index 000000000000..0be1e3e93bee --- /dev/null +++ b/drivers/tee/tee_shm.c @@ -0,0 +1,358 @@ +/* + * Copyright (c) 2015-2016, Linaro Limited + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include "tee_private.h" + +static void tee_shm_release(struct tee_shm *shm) +{ + struct tee_device *teedev = shm->teedev; + struct tee_shm_pool_mgr *poolm; + + mutex_lock(&teedev->mutex); + idr_remove(&teedev->idr, shm->id); + if (shm->ctx) + list_del(&shm->link); + mutex_unlock(&teedev->mutex); + + if (shm->flags & TEE_SHM_DMA_BUF) + poolm = &teedev->pool->dma_buf_mgr; + else + poolm = &teedev->pool->private_mgr; + + poolm->ops->free(poolm, shm); + kfree(shm); + + tee_device_put(teedev); +} + +static struct sg_table *tee_shm_op_map_dma_buf(struct dma_buf_attachment + *attach, enum dma_data_direction dir) +{ + return NULL; +} + +static void tee_shm_op_unmap_dma_buf(struct dma_buf_attachment *attach, + struct sg_table *table, + enum dma_data_direction dir) +{ +} + +static void tee_shm_op_release(struct dma_buf *dmabuf) +{ + struct tee_shm *shm = dmabuf->priv; + + tee_shm_release(shm); +} + +static void *tee_shm_op_kmap_atomic(struct dma_buf *dmabuf, unsigned long pgnum) +{ + return NULL; +} + +static void *tee_shm_op_kmap(struct dma_buf *dmabuf, unsigned long pgnum) +{ + return NULL; +} + +static int tee_shm_op_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma) +{ + struct tee_shm *shm = dmabuf->priv; + size_t size = vma->vm_end - vma->vm_start; + + return remap_pfn_range(vma, vma->vm_start, shm->paddr >> PAGE_SHIFT, + size, vma->vm_page_prot); +} + +static struct dma_buf_ops tee_shm_dma_buf_ops = { + .map_dma_buf = tee_shm_op_map_dma_buf, + .unmap_dma_buf = tee_shm_op_unmap_dma_buf, + .release = tee_shm_op_release, + .kmap_atomic = tee_shm_op_kmap_atomic, + .kmap = tee_shm_op_kmap, + .mmap = tee_shm_op_mmap, +}; + +/** + * tee_shm_alloc() - Allocate shared memory
* @ctx: Context that allocates the shared memory + * @size: Requested size of shared memory + * @flags: Flags setting properties for the requested shared memory. + * + * Memory allocated as global shared memory is automatically freed when the + * TEE file pointer is closed. The @flags field uses the bits defined by + * TEE_SHM_* in . TEE_SHM_MAPPED must currently always be + * set. If TEE_SHM_DMA_BUF is set, global shared memory will be allocated and + * associated with a dma-buf handle; otherwise driver private memory is used. + */ +struct tee_shm *tee_shm_alloc(struct tee_context *ctx, size_t size, u32 flags) +{ + struct tee_device *teedev = ctx->teedev; + struct tee_shm_pool_mgr *poolm = NULL; + struct tee_shm *shm; + void *ret; + int rc; + + if (!(flags & TEE_SHM_MAPPED)) { + dev_err(teedev->dev.parent, + "only mapped allocations supported\n"); + return ERR_PTR(-EINVAL); + } + + if ((flags & ~(TEE_SHM_MAPPED | TEE_SHM_DMA_BUF))) { + dev_err(teedev->dev.parent, "invalid shm flags 0x%x", flags); + return ERR_PTR(-EINVAL); + } + + if (!tee_device_get(teedev)) + return ERR_PTR(-EINVAL); + + if (!teedev->pool) { + /* teedev has been detached from driver */ + ret = ERR_PTR(-EINVAL); + goto err_dev_put; + } + + shm = kzalloc(sizeof(*shm), GFP_KERNEL); + if (!shm) { + ret = ERR_PTR(-ENOMEM); + goto err_dev_put; + } + + shm->flags = flags; + shm->teedev = teedev; + shm->ctx = ctx; + if (flags & TEE_SHM_DMA_BUF) + poolm = &teedev->pool->dma_buf_mgr; + else + poolm = &teedev->pool->private_mgr; + + rc = poolm->ops->alloc(poolm, shm, size); + if (rc) { + ret = ERR_PTR(rc); + goto err_kfree; + } + + mutex_lock(&teedev->mutex); + shm->id = idr_alloc(&teedev->idr, shm, 1, 0, GFP_KERNEL); + mutex_unlock(&teedev->mutex); + if (shm->id < 0) { + ret = ERR_PTR(shm->id); + goto err_pool_free; + } + + if (flags & TEE_SHM_DMA_BUF) { + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + + exp_info.ops = &tee_shm_dma_buf_ops; + exp_info.size = shm->size; + exp_info.flags = O_RDWR; + exp_info.priv = shm; + + shm->dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(shm->dmabuf)) { + ret = ERR_CAST(shm->dmabuf); + goto err_rem; + } + } + mutex_lock(&teedev->mutex); + list_add_tail(&shm->link, &ctx->list_shm); + mutex_unlock(&teedev->mutex); + + return shm; +err_rem: + mutex_lock(&teedev->mutex); + idr_remove(&teedev->idr, shm->id); + mutex_unlock(&teedev->mutex); +err_pool_free: + poolm->ops->free(poolm, shm); +err_kfree: + kfree(shm); +err_dev_put: + tee_device_put(teedev); + return ret; +} +EXPORT_SYMBOL_GPL(tee_shm_alloc); + +/** + * tee_shm_get_fd() - Increase reference count and return file descriptor + * @shm: Shared memory handle + * @returns user space file descriptor to shared memory + */ +int tee_shm_get_fd(struct tee_shm *shm) +{ + u32 req_flags = TEE_SHM_MAPPED | TEE_SHM_DMA_BUF; + int fd; + + if ((shm->flags & req_flags) != req_flags) + return -EINVAL; + + fd = dma_buf_fd(shm->dmabuf, O_CLOEXEC); + if (fd >= 0) + get_dma_buf(shm->dmabuf); + return fd; +} + +/** + * tee_shm_free() - Free shared memory + * @shm: Handle to shared memory to free + */ +void tee_shm_free(struct tee_shm *shm) +{ + /* + * dma_buf_put() decreases the dmabuf reference counter and will + * call tee_shm_release() when the last reference is gone. + * + * In the case of driver private memory we call tee_shm_release + * directly instead as it doesn't have a reference counter.
+ */ + if (shm->flags & TEE_SHM_DMA_BUF) + dma_buf_put(shm->dmabuf); + else + tee_shm_release(shm); +} +EXPORT_SYMBOL_GPL(tee_shm_free); + +/** + * tee_shm_va2pa() - Get physical address of a virtual address + * @shm: Shared memory handle + * @va: Virtual address to translate + * @pa: Returned physical address + * @returns 0 on success and < 0 on failure + */ +int tee_shm_va2pa(struct tee_shm *shm, void *va, phys_addr_t *pa) +{ + /* Check that we're in the range of the shm */ + if ((char *)va < (char *)shm->kaddr) + return -EINVAL; + if ((char *)va >= ((char *)shm->kaddr + shm->size)) + return -EINVAL; + + return tee_shm_get_pa( + shm, (unsigned long)va - (unsigned long)shm->kaddr, pa); +} +EXPORT_SYMBOL_GPL(tee_shm_va2pa); + +/** + * tee_shm_pa2va() - Get virtual address of a physical address + * @shm: Shared memory handle + * @pa: Physical address to translate + * @va: Returned virtual address + * @returns 0 on success and < 0 on failure + */ +int tee_shm_pa2va(struct tee_shm *shm, phys_addr_t pa, void **va) +{ + /* Check that we're in the range of the shm */ + if (pa < shm->paddr) + return -EINVAL; + if (pa >= (shm->paddr + shm->size)) + return -EINVAL; + + if (va) { + void *v = tee_shm_get_va(shm, pa - shm->paddr); + + if (IS_ERR(v)) + return PTR_ERR(v); + *va = v; + } + return 0; +} +EXPORT_SYMBOL_GPL(tee_shm_pa2va); + +/** + * tee_shm_get_va() - Get virtual address of a shared memory plus an offset + * @shm: Shared memory handle + * @offs: Offset from start of this shared memory + * @returns virtual address of the shared memory + offs if offs is within + * the bounds of this shared memory, else an ERR_PTR + */ +void *tee_shm_get_va(struct tee_shm *shm, size_t offs) +{ + if (offs >= shm->size) + return ERR_PTR(-EINVAL); + return (char *)shm->kaddr + offs; +} +EXPORT_SYMBOL_GPL(tee_shm_get_va); + +/** + * tee_shm_get_pa() - Get physical address of a shared memory plus an offset + * @shm: Shared memory handle + * @offs: Offset from start of this shared memory + * @pa: Physical address to return + * @returns 0 if offs is within the bounds of this shared memory, else an + * error code.
+ */ +int tee_shm_get_pa(struct tee_shm *shm, size_t offs, phys_addr_t *pa) +{ + if (offs >= shm->size) + return -EINVAL; + if (pa) + *pa = shm->paddr + offs; + return 0; +} +EXPORT_SYMBOL_GPL(tee_shm_get_pa); + +/** + * tee_shm_get_from_id() - Find shared memory object and increase reference + * count + * @ctx: Context owning the shared memory + * @id: Id of shared memory object + * @returns a pointer to 'struct tee_shm' on success or an ERR_PTR on failure + */ +struct tee_shm *tee_shm_get_from_id(struct tee_context *ctx, int id) +{ + struct tee_device *teedev; + struct tee_shm *shm; + + if (!ctx) + return ERR_PTR(-EINVAL); + + teedev = ctx->teedev; + mutex_lock(&teedev->mutex); + shm = idr_find(&teedev->idr, id); + if (!shm || shm->ctx != ctx) + shm = ERR_PTR(-EINVAL); + else if (shm->flags & TEE_SHM_DMA_BUF) + get_dma_buf(shm->dmabuf); + mutex_unlock(&teedev->mutex); + return shm; +} +EXPORT_SYMBOL_GPL(tee_shm_get_from_id); + +/** + * tee_shm_get_id() - Get id of a shared memory object + * @shm: Shared memory handle + * @returns id + */ +int tee_shm_get_id(struct tee_shm *shm) +{ + return shm->id; +} +EXPORT_SYMBOL_GPL(tee_shm_get_id); + +/** + * tee_shm_put() - Decrease reference count on a shared memory handle + * @shm: Shared memory handle + */ +void tee_shm_put(struct tee_shm *shm) +{ + if (shm->flags & TEE_SHM_DMA_BUF) + dma_buf_put(shm->dmabuf); +} +EXPORT_SYMBOL_GPL(tee_shm_put); diff --git a/drivers/tee/tee_shm_pool.c b/drivers/tee/tee_shm_pool.c new file mode 100644 index 000000000000..fb4f8522a526 --- /dev/null +++ b/drivers/tee/tee_shm_pool.c @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2015, Linaro Limited + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ */
+#include
+#include
+#include
+#include
+#include
+#include "tee_private.h"
+
+static int pool_op_gen_alloc(struct tee_shm_pool_mgr *poolm,
+			     struct tee_shm *shm, size_t size)
+{
+	unsigned long va;
+	struct gen_pool *genpool = poolm->private_data;
+	size_t s = roundup(size, 1 << genpool->min_alloc_order);
+
+	va = gen_pool_alloc(genpool, s);
+	if (!va)
+		return -ENOMEM;
+
+	memset((void *)va, 0, s);
+	shm->kaddr = (void *)va;
+	shm->paddr = gen_pool_virt_to_phys(genpool, va);
+	shm->size = s;
+	return 0;
+}
+
+static void pool_op_gen_free(struct tee_shm_pool_mgr *poolm,
+			     struct tee_shm *shm)
+{
+	gen_pool_free(poolm->private_data, (unsigned long)shm->kaddr,
+		      shm->size);
+	shm->kaddr = NULL;
+}
+
+static const struct tee_shm_pool_mgr_ops pool_ops_generic = {
+	.alloc = pool_op_gen_alloc,
+	.free = pool_op_gen_free,
+};
+
+static void pool_res_mem_destroy(struct tee_shm_pool *pool)
+{
+	gen_pool_destroy(pool->private_mgr.private_data);
+	gen_pool_destroy(pool->dma_buf_mgr.private_data);
+}
+
+static int pool_res_mem_mgr_init(struct tee_shm_pool_mgr *mgr,
+				 struct tee_shm_pool_mem_info *info,
+				 int min_alloc_order)
+{
+	size_t page_mask = PAGE_SIZE - 1;
+	struct gen_pool *genpool = NULL;
+	int rc;
+
+	/*
+	 * Start and end must be page aligned
+	 */
+	if ((info->vaddr & page_mask) || (info->paddr & page_mask) ||
+	    (info->size & page_mask))
+		return -EINVAL;
+
+	genpool = gen_pool_create(min_alloc_order, -1);
+	if (!genpool)
+		return -ENOMEM;
+
+	gen_pool_set_algo(genpool, gen_pool_best_fit, NULL);
+	rc = gen_pool_add_virt(genpool, info->vaddr, info->paddr, info->size,
+			       -1);
+	if (rc) {
+		gen_pool_destroy(genpool);
+		return rc;
+	}
+
+	mgr->private_data = genpool;
+	mgr->ops = &pool_ops_generic;
+	return 0;
+}
+
+/**
+ * tee_shm_pool_alloc_res_mem() - Create a shared memory pool from reserved
+ * memory range
+ * @priv_info:	Information for driver private shared memory pool
+ * @dmabuf_info: Information for dma-buf shared memory pool
+ *
+ * Start and end of pools must be page aligned.
+ *
+ * Allocation with the flag TEE_SHM_DMA_BUF set will use the range supplied
+ * in @dmabuf_info, others will use the range provided by @priv_info.
+ *
+ * @returns pointer to a 'struct tee_shm_pool' or an ERR_PTR on failure.
+ */
+struct tee_shm_pool *
+tee_shm_pool_alloc_res_mem(struct tee_shm_pool_mem_info *priv_info,
+			   struct tee_shm_pool_mem_info *dmabuf_info)
+{
+	struct tee_shm_pool *pool = NULL;
+	int ret;
+
+	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+	if (!pool) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	/*
+	 * Create the pool for driver private shared memory
+	 */
+	ret = pool_res_mem_mgr_init(&pool->private_mgr, priv_info,
+				    3 /* 8 byte aligned */);
+	if (ret)
+		goto err;
+
+	/*
+	 * Create the pool for dma_buf shared memory
+	 */
+	ret = pool_res_mem_mgr_init(&pool->dma_buf_mgr, dmabuf_info,
+				    PAGE_SHIFT);
+	if (ret)
+		goto err;
+
+	pool->destroy = pool_res_mem_destroy;
+	return pool;
+err:
+	if (ret == -ENOMEM)
+		pr_err("%s: can't allocate memory for res_mem shared memory pool\n", __func__);
+	if (pool && pool->private_mgr.private_data)
+		gen_pool_destroy(pool->private_mgr.private_data);
+	kfree(pool);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(tee_shm_pool_alloc_res_mem);
+
+/**
+ * tee_shm_pool_free() - Free a shared memory pool
+ * @pool:	The shared memory pool to free
+ *
+ * There must be no remaining shared memory allocated from this pool when
+ * this function is called.
+ */
+void tee_shm_pool_free(struct tee_shm_pool *pool)
+{
+	pool->destroy(pool);
+	kfree(pool);
+}
+EXPORT_SYMBOL_GPL(tee_shm_pool_free);
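For illustration, a hedged sketch of a caller of the exported helper above (not from the original patch; splitting the reserved region in half and all names here are assumptions):

static struct tee_shm_pool *example_pool_setup(void *vaddr, phys_addr_t paddr,
					       size_t size)
{
	struct tee_shm_pool_mem_info priv_info;
	struct tee_shm_pool_mem_info dmabuf_info;

	/* First half for driver private allocations, second for dma-buf */
	priv_info.vaddr = (unsigned long)vaddr;
	priv_info.paddr = paddr;
	priv_info.size = size / 2;

	dmabuf_info.vaddr = (unsigned long)vaddr + size / 2;
	dmabuf_info.paddr = paddr + size / 2;
	dmabuf_info.size = size / 2;

	/* Both ranges must be page aligned, as noted above */
	return tee_shm_pool_alloc_res_mem(&priv_info, &dmabuf_info);
}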
diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h new file mode 100644 index 000000000000..0f175b8f6456 --- /dev/null +++ b/include/linux/tee_drv.h @@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2015-2016, Linaro Limited
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __TEE_DRV_H
+#define __TEE_DRV_H
+
+#include
+#include
+#include
+#include
+
+/*
+ * The file describes the API provided by the generic TEE driver to the
+ * specific TEE driver.
+ */
+
+#define TEE_SHM_MAPPED	0x1	/* Memory mapped by the kernel */
+#define TEE_SHM_DMA_BUF	0x2	/* Memory with dma-buf handle */
+
+struct tee_device;
+struct tee_shm;
+struct tee_shm_pool;
+
+/**
+ * struct tee_context - driver specific context on file pointer data
+ * @teedev:	pointer to this drivers struct tee_device
+ * @list_shm:	List of shared memory object owned by this context
+ * @data:	driver specific context data, managed by the driver
+ */
+struct tee_context {
+	struct tee_device *teedev;
+	struct list_head list_shm;
+	void *data;
+};
+
+struct tee_param_memref {
+	size_t shm_offs;
+	size_t size;
+	struct tee_shm *shm;
+};
+
+struct tee_param_value {
+	u64 a;
+	u64 b;
+	u64 c;
+};
+
+struct tee_param {
+	u64 attr;
+	union {
+		struct tee_param_memref memref;
+		struct tee_param_value value;
+	} u;
+};
+
+/**
+ * struct tee_driver_ops - driver operations vtable
+ * @get_version:	returns version of driver
+ * @open:		called when the device file is opened
+ * @release:		release this open file
+ * @open_session:	open a new session
+ * @close_session:	close a session
+ * @invoke_func:	invoke a trusted function
+ * @cancel_req:		request cancel of an ongoing invoke or open
+ * @supp_recv:		called for supplicant to get a command
+ * @supp_send:		called for supplicant to send a response
+ */
+struct tee_driver_ops {
+	void (*get_version)(struct tee_device *teedev,
+			    struct tee_ioctl_version_data *vers);
+	int (*open)(struct tee_context *ctx);
+	void (*release)(struct tee_context *ctx);
+	int (*open_session)(struct tee_context *ctx,
+			    struct tee_ioctl_open_session_arg *arg,
+			    struct tee_param *param);
+	int (*close_session)(struct tee_context *ctx, u32 session);
+	int (*invoke_func)(struct tee_context *ctx,
+			   struct tee_ioctl_invoke_arg *arg,
+			   struct tee_param *param);
+	int (*cancel_req)(struct tee_context *ctx, u32 cancel_id, u32 session);
+	int (*supp_recv)(struct tee_context *ctx, u32 *func, u32 *num_params,
+			 struct tee_param *param);
+	int (*supp_send)(struct tee_context *ctx, u32 ret, u32 num_params,
+			 struct tee_param *param);
+};
+
+/**
+ * struct tee_desc - Describes the TEE driver to the subsystem
+ * @name:	name of driver
+ * @ops:	driver operations vtable
+ * @owner:	module providing the driver
+ * @flags:	Extra properties of driver, defined by TEE_DESC_* below
+ */
+#define TEE_DESC_PRIVILEGED	0x1
+struct tee_desc {
+	const char *name;
+	const struct tee_driver_ops *ops;
+	struct module *owner;
+	u32 flags;
+};
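To show how the vtable and descriptor above are meant to be used together, a hedged registration sketch follows (all example_* callbacks are placeholders and error unwinding is omitted; see the declarations just below):

static const struct tee_driver_ops example_ops = {
	.get_version	= example_get_version,
	.open		= example_open,
	.release	= example_release,
	.open_session	= example_open_session,
	.close_session	= example_close_session,
	.invoke_func	= example_invoke_func,
	.cancel_req	= example_cancel_req,
};

static const struct tee_desc example_desc = {
	.name	= "example-clnt",
	.ops	= &example_ops,
	.owner	= THIS_MODULE,
};

/* In probe(): allocate, then register; tee_device_alloc() returns ERR_PTR */
teedev = tee_device_alloc(&example_desc, dev, pool, drv_data);
if (!IS_ERR(teedev))
	rc = tee_device_register(teedev);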
+
+/**
+ * tee_device_alloc() - Allocate a new struct tee_device instance
+ * @teedesc:	Descriptor for this driver
+ * @dev:	Parent device for this device
+ * @pool:	Shared memory pool, NULL if not used
+ * @driver_data: Private driver data for this device
+ *
+ * Allocates a new struct tee_device instance. The device is
+ * removed by tee_device_unregister().
+ *
+ * @returns a pointer to a 'struct tee_device' or an ERR_PTR on failure
+ */
+struct tee_device *tee_device_alloc(const struct tee_desc *teedesc,
+				    struct device *dev,
+				    struct tee_shm_pool *pool,
+				    void *driver_data);
+
+/**
+ * tee_device_register() - Registers a TEE device
+ * @teedev:	Device to register
+ *
+ * tee_device_unregister() needs to be called to remove the @teedev if
+ * this function fails.
+ *
+ * @returns < 0 on failure
+ */
+int tee_device_register(struct tee_device *teedev);
+
+/**
+ * tee_device_unregister() - Removes a TEE device
+ * @teedev:	Device to unregister
+ *
+ * This function should be called to remove the @teedev even if
+ * tee_device_register() hasn't been called yet. Does nothing if
+ * @teedev is NULL.
+ */
+void tee_device_unregister(struct tee_device *teedev);
+
+/**
+ * struct tee_shm_pool_mem_info - holds information needed to create a shared
+ * memory pool
+ * @vaddr:	Virtual address of start of pool
+ * @paddr:	Physical address of start of pool
+ * @size:	Size in bytes of the pool
+ */
+struct tee_shm_pool_mem_info {
+	unsigned long vaddr;
+	phys_addr_t paddr;
+	size_t size;
+};
+
+/**
+ * tee_shm_pool_alloc_res_mem() - Create a shared memory pool from reserved
+ * memory range
+ * @priv_info:	 Information for driver private shared memory pool
+ * @dmabuf_info: Information for dma-buf shared memory pool
+ *
+ * Start and end of pools must be page aligned.
+ *
+ * Allocation with the flag TEE_SHM_DMA_BUF set will use the range supplied
+ * in @dmabuf_info, others will use the range provided by @priv_info.
+ *
+ * @returns pointer to a 'struct tee_shm_pool' or an ERR_PTR on failure.
+ */
+struct tee_shm_pool *
+tee_shm_pool_alloc_res_mem(struct tee_shm_pool_mem_info *priv_info,
+			   struct tee_shm_pool_mem_info *dmabuf_info);
+
+/**
+ * tee_shm_pool_free() - Free a shared memory pool
+ * @pool:	The shared memory pool to free
+ *
+ * There must be no remaining shared memory allocated from this pool when
+ * this function is called.
+ */
+void tee_shm_pool_free(struct tee_shm_pool *pool);
+
+/**
+ * tee_get_drvdata() - Return driver_data pointer
+ * @teedev:	Device from which to get the driver_data pointer
+ * @returns the driver_data pointer supplied to tee_device_alloc().
+ */
+void *tee_get_drvdata(struct tee_device *teedev);
+
+/**
+ * tee_shm_alloc() - Allocate shared memory
+ * @ctx:	Context that allocates the shared memory
+ * @size:	Requested size of shared memory
+ * @flags:	Flags setting properties for the requested shared memory.
+ *
+ * Memory allocated as global shared memory is automatically freed when the
+ * TEE file pointer is closed. The @flags field uses the bits defined by
+ * TEE_SHM_* above. TEE_SHM_MAPPED must currently always be set. If
+ * TEE_SHM_DMA_BUF is set, global shared memory will be allocated and
+ * associated with a dma-buf handle, else driver private memory is used.
+ *
+ * @returns a pointer to 'struct tee_shm'
+ */
+struct tee_shm *tee_shm_alloc(struct tee_context *ctx, size_t size, u32 flags);
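A short, hedged usage sketch of the declarations above and below (illustrative only; a real driver would get @ctx from its open() hook and the 4096-byte size is an arbitrary assumption):

static int example_shm_roundtrip(struct tee_context *ctx)
{
	struct tee_shm *shm;
	phys_addr_t pa;
	void *va;

	shm = tee_shm_alloc(ctx, 4096, TEE_SHM_MAPPED | TEE_SHM_DMA_BUF);
	if (IS_ERR(shm))
		return PTR_ERR(shm);

	va = tee_shm_get_va(shm, 0);
	if (!IS_ERR(va) && !tee_shm_get_pa(shm, 0, &pa)) {
		/* ... hand va (kernel) and pa (secure world) to the TEE ... */
	}

	tee_shm_free(shm);
	return 0;
}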
+
+/**
+ * tee_shm_free() - Free shared memory
+ * @shm:	Handle to shared memory to free
+ */
+void tee_shm_free(struct tee_shm *shm);
+
+/**
+ * tee_shm_put() - Decrease reference count on a shared memory handle
+ * @shm:	Shared memory handle
+ */
+void tee_shm_put(struct tee_shm *shm);
+
+/**
+ * tee_shm_va2pa() - Get physical address of a virtual address
+ * @shm:	Shared memory handle
+ * @va:		Virtual address to translate
+ * @pa:		Returned physical address
+ * @returns 0 on success and < 0 on failure
+ */
+int tee_shm_va2pa(struct tee_shm *shm, void *va, phys_addr_t *pa);
+
+/**
+ * tee_shm_pa2va() - Get virtual address of a physical address
+ * @shm:	Shared memory handle
+ * @pa:		Physical address to translate
+ * @va:		Returned virtual address
+ * @returns 0 on success and < 0 on failure
+ */
+int tee_shm_pa2va(struct tee_shm *shm, phys_addr_t pa, void **va);
+
+/**
+ * tee_shm_get_va() - Get virtual address of a shared memory plus an offset
+ * @shm:	Shared memory handle
+ * @offs:	Offset from start of this shared memory
+ * @returns virtual address of the shared memory + offs if offs is within
+ *	the bounds of this shared memory, else an ERR_PTR
+ */
+void *tee_shm_get_va(struct tee_shm *shm, size_t offs);
+
+/**
+ * tee_shm_get_pa() - Get physical address of a shared memory plus an offset
+ * @shm:	Shared memory handle
+ * @offs:	Offset from start of this shared memory
+ * @pa:		Physical address to return
+ * @returns 0 if offs is within the bounds of this shared memory, else an
+ *	error code.
+ */
+int tee_shm_get_pa(struct tee_shm *shm, size_t offs, phys_addr_t *pa);
+
+/**
+ * tee_shm_get_id() - Get id of a shared memory object
+ * @shm:	Shared memory handle
+ * @returns id
+ */
+int tee_shm_get_id(struct tee_shm *shm);
+
+/**
+ * tee_shm_get_from_id() - Find shared memory object and increase reference
+ * count
+ * @ctx:	Context owning the shared memory
+ * @id:		Id of shared memory object
+ * @returns a pointer to 'struct tee_shm' on success or an ERR_PTR on failure
+ */
+struct tee_shm *tee_shm_get_from_id(struct tee_context *ctx, int id);
+
+#endif /*__TEE_DRV_H*/ diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h new file mode 100644 index 000000000000..370d8845ab21 --- /dev/null +++ b/include/uapi/linux/tee.h @@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2015-2016, Linaro Limited
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __TEE_H +#define __TEE_H + +#include +#include + +/* + * This file describes the API provided by a TEE driver to user space. + * + * Each TEE driver defines a TEE specific protocol which is used for the + * data passed back and forth using TEE_IOC_CMD. + */ + +/* Helpers to make the ioctl defines */ +#define TEE_IOC_MAGIC 0xa4 +#define TEE_IOC_BASE 0 + +/* Flags relating to shared memory */ +#define TEE_IOCTL_SHM_MAPPED 0x1 /* memory mapped in normal world */ +#define TEE_IOCTL_SHM_DMA_BUF 0x2 /* dma-buf handle on shared memory */ + +#define TEE_MAX_ARG_SIZE 1024 + +#define TEE_GEN_CAP_GP (1 << 0)/* GlobalPlatform compliant TEE */ + +/* + * TEE Implementation ID + */ +#define TEE_IMPL_ID_OPTEE 1 + +/* + * OP-TEE specific capabilities + */ +#define TEE_OPTEE_CAP_TZ (1 << 0) + +/** + * struct tee_ioctl_version_data - TEE version + * @impl_id: [out] TEE implementation id + * @impl_caps: [out] Implementation specific capabilities + * @gen_caps: [out] Generic capabilities, defined by TEE_GEN_CAPS_* above + * + * Identifies the TEE implementation, @impl_id is one of TEE_IMPL_ID_* above. + * @impl_caps is implementation specific, for example TEE_OPTEE_CAP_* + * is valid when @impl_id == TEE_IMPL_ID_OPTEE. + */ +struct tee_ioctl_version_data { + __u32 impl_id; + __u32 impl_caps; + __u32 gen_caps; +}; + +/** + * TEE_IOC_VERSION - query version of TEE + * + * Takes a tee_ioctl_version_data struct and returns with the TEE version + * data filled in. + */ +#define TEE_IOC_VERSION _IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 0, \ + struct tee_ioctl_version_data) + +/** + * struct tee_ioctl_shm_alloc_data - Shared memory allocate argument + * @size: [in/out] Size of shared memory to allocate + * @flags: [in/out] Flags to/from allocation. + * @id: [out] Identifier of the shared memory + * + * The flags field should currently be zero as input. Updated by the call + * with actual flags as defined by TEE_IOCTL_SHM_* above. + * This structure is used as argument for TEE_IOC_SHM_ALLOC below. + */ +struct tee_ioctl_shm_alloc_data { + __u64 size; + __u32 flags; + __s32 id; +}; + +/** + * TEE_IOC_SHM_ALLOC - allocate shared memory + * + * Allocates shared memory between the user space process and secure OS. + * + * Returns a file descriptor on success or < 0 on failure + * + * The returned file descriptor is used to map the shared memory into user + * space. The shared memory is freed when the descriptor is closed and the + * memory is unmapped. + */ +#define TEE_IOC_SHM_ALLOC _IOWR(TEE_IOC_MAGIC, TEE_IOC_BASE + 1, \ + struct tee_ioctl_shm_alloc_data) + +/** + * struct tee_ioctl_buf_data - Variable sized buffer + * @buf_ptr: [in] A __user pointer to a buffer + * @buf_len: [in] Length of the buffer above + * + * Used as argument for TEE_IOC_OPEN_SESSION, TEE_IOC_INVOKE, + * TEE_IOC_SUPPL_RECV, and TEE_IOC_SUPPL_SEND below. 
+ */
+struct tee_ioctl_buf_data {
+	__u64 buf_ptr;
+	__u64 buf_len;
+};
+
+/*
+ * Attributes for struct tee_ioctl_param, selects field in the union
+ */
+#define TEE_IOCTL_PARAM_ATTR_TYPE_NONE		0	/* parameter not used */
+
+/*
+ * These define value parameters (struct tee_ioctl_param_value)
+ */
+#define TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INPUT	1
+#define TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT	2
+#define TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT	3	/* input and output */
+
+/*
+ * These define shared memory reference parameters (struct
+ * tee_ioctl_param_memref)
+ */
+#define TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT	5
+#define TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT	6
+#define TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT	7	/* input and output */
+
+/*
+ * Mask for the type part of the attribute, leaves room for more types
+ */
+#define TEE_IOCTL_PARAM_ATTR_TYPE_MASK		0xff
+
+/*
+ * Matches TEEC_LOGIN_* in GP TEE Client API
+ * Are only defined for GP compliant TEEs
+ */
+#define TEE_IOCTL_LOGIN_PUBLIC			0
+#define TEE_IOCTL_LOGIN_USER			1
+#define TEE_IOCTL_LOGIN_GROUP			2
+#define TEE_IOCTL_LOGIN_APPLICATION		4
+#define TEE_IOCTL_LOGIN_USER_APPLICATION	5
+#define TEE_IOCTL_LOGIN_GROUP_APPLICATION	6
+
+/**
+ * struct tee_ioctl_param - parameter
+ * @attr: attributes
+ * @a: if a memref, offset into the shared memory object, else a value parameter
+ * @b: if a memref, size of the buffer, else a value parameter
+ * @c: if a memref, shared memory identifier, else a value parameter
+ *
+ * @attr & TEE_IOCTL_PARAM_ATTR_TYPE_MASK indicates if memref or value is
+ * used in the union. TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_* indicates value and
+ * TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_* indicates memref.
+ * TEE_IOCTL_PARAM_ATTR_TYPE_NONE indicates that none of the members are used.
+ *
+ * Shared memory is allocated with TEE_IOC_SHM_ALLOC which returns an
+ * identifier representing the shared memory object. A memref can reference
+ * a part of a shared memory by specifying an offset (@a) and size (@b) of
+ * the object. To supply the entire shared memory object, set the offset
+ * (@a) to 0 and size (@b) to the previously returned size of the object.
+ */
+struct tee_ioctl_param {
+	__u64 attr;
+	__u64 a;
+	__u64 b;
+	__u64 c;
+};
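As a userspace illustration of the offset/size/id convention just described (hedged; shm_id and shm_size are assumed to come from an earlier TEE_IOC_SHM_ALLOC call):

struct tee_ioctl_param p;

memset(&p, 0, sizeof(p));
p.attr = TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT;
p.a = 0;        /* offset into the shared memory object */
p.b = shm_size; /* size of the referenced range: the whole object here */
p.c = shm_id;   /* identifier returned by TEE_IOC_SHM_ALLOC */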
+
+#define TEE_IOCTL_UUID_LEN	16
+
+/**
+ * struct tee_ioctl_open_session_arg - Open session argument
+ * @uuid:	[in] UUID of the Trusted Application
+ * @clnt_uuid:	[in] UUID of client
+ * @clnt_login:	[in] Login class of client, TEE_IOCTL_LOGIN_* above
+ * @cancel_id:	[in] Cancellation id, a unique value to identify this request
+ * @session:	[out] Session id
+ * @ret:	[out] return value
+ * @ret_origin:	[out] origin of the return value
+ * @num_params:	[in] number of parameters following this struct
+ */
+struct tee_ioctl_open_session_arg {
+	__u8 uuid[TEE_IOCTL_UUID_LEN];
+	__u8 clnt_uuid[TEE_IOCTL_UUID_LEN];
+	__u32 clnt_login;
+	__u32 cancel_id;
+	__u32 session;
+	__u32 ret;
+	__u32 ret_origin;
+	__u32 num_params;
+	/* num_params tells the actual number of element in params */
+	struct tee_ioctl_param params[];
+};
+
+/**
+ * TEE_IOC_OPEN_SESSION - opens a session to a Trusted Application
+ *
+ * Takes a struct tee_ioctl_buf_data which contains a struct
+ * tee_ioctl_open_session_arg followed by an array of struct
+ * tee_ioctl_param
+ */
+#define TEE_IOC_OPEN_SESSION	_IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 2, \
+				     struct tee_ioctl_buf_data)
+
+/**
+ * struct tee_ioctl_invoke_arg - Invokes a function in a Trusted
+ * Application
+ * @func:	[in] Trusted Application function, specific to the TA
+ * @session:	[in] Session id
+ * @cancel_id:	[in] Cancellation id, a unique value to identify this request
+ * @ret:	[out] return value
+ * @ret_origin:	[out] origin of the return value
+ * @num_params:	[in] number of parameters following this struct
+ */
+struct tee_ioctl_invoke_arg {
+	__u32 func;
+	__u32 session;
+	__u32 cancel_id;
+	__u32 ret;
+	__u32 ret_origin;
+	__u32 num_params;
+	/* num_params tells the actual number of element in params */
+	struct tee_ioctl_param params[];
+};
+
+/**
+ * TEE_IOC_INVOKE - Invokes a function in a Trusted Application
+ *
+ * Takes a struct tee_ioctl_buf_data which contains a struct
+ * tee_ioctl_invoke_arg followed by an array of struct tee_ioctl_param
+ */
+#define TEE_IOC_INVOKE		_IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 3, \
+				     struct tee_ioctl_buf_data)
+
+/**
+ * struct tee_ioctl_cancel_arg - Cancels an open session or invoke ioctl
+ * @cancel_id:	[in] Cancellation id, a unique value to identify this request
+ * @session:	[in] Session id, if the session is opened, else set to 0
+ */
+struct tee_ioctl_cancel_arg {
+	__u32 cancel_id;
+	__u32 session;
+};
+
+/**
+ * TEE_IOC_CANCEL - Cancels an open session or invoke
+ */
+#define TEE_IOC_CANCEL		_IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 4, \
+				     struct tee_ioctl_cancel_arg)
+
+/**
+ * struct tee_ioctl_close_session_arg - Closes an open session
+ * @session:	[in] Session id
+ */
+struct tee_ioctl_close_session_arg {
+	__u32 session;
+};
+
+/**
+ * TEE_IOC_CLOSE_SESSION - Closes a session
+ */
+#define TEE_IOC_CLOSE_SESSION	_IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 5, \
+				     struct tee_ioctl_close_session_arg)
+
+/**
+ * struct tee_iocl_supp_recv_arg - Receive a request for a supplicant function
+ * @func:	[in] supplicant function
+ * @num_params:	[in/out] number of parameters following this struct
+ *
+ * On input, @num_params is the number of params tee-supplicant has room to
+ * receive; on output, it is the number of params actually received.
+ */
+struct tee_iocl_supp_recv_arg {
+	__u32 func;
+	__u32 num_params;
+	/* num_params tells the actual number of element in params */
+	struct tee_ioctl_param params[];
+};
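Taken together with the TEE_IOC_SUPPL_RECV and TEE_IOC_SUPPL_SEND ioctls defined next, the supplicant side reduces to a receive/serve/send loop. A hedged userspace sketch (buffer sizing, request handling and error reporting are simplified assumptions):

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/tee.h>

static void supplicant_loop(int fd)
{
	union {
		struct tee_iocl_supp_recv_arg recv;
		__u8 raw[TEE_MAX_ARG_SIZE];
	} u;
	struct tee_ioctl_buf_data data = {
		.buf_ptr = (uintptr_t)&u,
		.buf_len = sizeof(u),
	};

	for (;;) {
		memset(&u, 0, sizeof(u));
		u.recv.num_params = 4;	/* room for up to four params */
		if (ioctl(fd, TEE_IOC_SUPPL_RECV, &data) < 0)
			break;
		/* ... serve u.recv.func using the received params ... */
		/* the same buffer is then laid out as a supp_send_arg */
		if (ioctl(fd, TEE_IOC_SUPPL_SEND, &data) < 0)
			break;
	}
}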
+
+/**
+ * TEE_IOC_SUPPL_RECV - Receive a request for a supplicant function
+ *
+ * Takes a struct tee_ioctl_buf_data which contains a struct
+ * tee_iocl_supp_recv_arg followed by an array of struct tee_ioctl_param
+ */
+#define TEE_IOC_SUPPL_RECV	_IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 6, \
+				     struct tee_ioctl_buf_data)
+
+/**
+ * struct tee_iocl_supp_send_arg - Send a response to a received request
+ * @ret:	[out] return value
+ * @num_params:	[in] number of parameters following this struct
+ */
+struct tee_iocl_supp_send_arg {
+	__u32 ret;
+	__u32 num_params;
+	/* num_params tells the actual number of element in params */
+	struct tee_ioctl_param params[];
+};
+
+/**
+ * TEE_IOC_SUPPL_SEND - Send a response to a received request
+ *
+ * Takes a struct tee_ioctl_buf_data which contains a struct
+ * tee_iocl_supp_send_arg followed by an array of struct tee_ioctl_param
+ */
+#define TEE_IOC_SUPPL_SEND	_IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 7, \
+				     struct tee_ioctl_buf_data)
+
+/*
+ * Five syscalls are used when communicating with the TEE driver.
+ * open(): opens the device associated with the driver
+ * ioctl(): as described above operating on the file descriptor from open()
+ * close(): two cases
+ *   - closes the device file descriptor
+ *   - closes a file descriptor connected to allocated shared memory
+ * mmap(): maps shared memory into user space using information from struct
+ *	   tee_ioctl_shm_alloc_data
+ * munmap(): unmaps previously mapped shared memory
+ */
+
+#endif /*__TEE_H*/ -- cgit v1.2.3 From 688769f643bfce894f14dc7141bfc6c010f52750 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 9 Mar 2017 15:45:14 -0600 Subject: PCI/MSI: Make pci_msi_shutdown() and pci_msix_shutdown() static pci_msi_shutdown() and pci_msix_shutdown() are used only in drivers/pci/msi.c, so make them static.
Signed-off-by: Bjorn Helgaas --- drivers/pci/msi.c | 4 ++-- include/linux/pci.h | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index d571bc330686..4d062c3bf5f0 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -882,7 +882,7 @@ int pci_msi_vec_count(struct pci_dev *dev) } EXPORT_SYMBOL(pci_msi_vec_count); -void pci_msi_shutdown(struct pci_dev *dev) +static void pci_msi_shutdown(struct pci_dev *dev) { struct msi_desc *desc; u32 mask; @@ -994,7 +994,7 @@ int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) } EXPORT_SYMBOL(pci_enable_msix); -void pci_msix_shutdown(struct pci_dev *dev) +static void pci_msix_shutdown(struct pci_dev *dev) { struct msi_desc *entry; diff --git a/include/linux/pci.h b/include/linux/pci.h index eb3da1a04e6c..10917c122974 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1297,11 +1297,9 @@ struct msix_entry { #ifdef CONFIG_PCI_MSI int pci_msi_vec_count(struct pci_dev *dev); -void pci_msi_shutdown(struct pci_dev *dev); void pci_disable_msi(struct pci_dev *dev); int pci_msix_vec_count(struct pci_dev *dev); int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec); -void pci_msix_shutdown(struct pci_dev *dev); void pci_disable_msix(struct pci_dev *dev); void pci_restore_msi_state(struct pci_dev *dev); int pci_msi_enabled(void); @@ -1327,13 +1325,11 @@ int pci_irq_get_node(struct pci_dev *pdev, int vec); #else static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; } -static inline void pci_msi_shutdown(struct pci_dev *dev) { } static inline void pci_disable_msi(struct pci_dev *dev) { } static inline int pci_msix_vec_count(struct pci_dev *dev) { return -ENOSYS; } static inline int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) { return -ENOSYS; } -static inline void pci_msix_shutdown(struct pci_dev *dev) { } static inline void pci_disable_msix(struct pci_dev *dev) { } static inline void pci_restore_msi_state(struct pci_dev *dev) { } static inline int pci_msi_enabled(void) { return 0; } -- cgit v1.2.3 From abb521e36b9286c262971974ebaeda2d67dadd86 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 8 Mar 2017 08:57:00 -0800 Subject: ethtool: add CRC32 as an RSS hash function CRC32 engines are usually easily available in hardware and generate OK spread for RSS hash. Add CRC32 RSS hash function to ethtool API. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/linux/ethtool.h | 2 ++ net/core/ethtool.c | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 9ded8c6d8176..83cc9863444b 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -60,6 +60,7 @@ enum ethtool_phys_id_state { enum {
	ETH_RSS_HASH_TOP_BIT, /* Configurable RSS hash function - Toeplitz */
	ETH_RSS_HASH_XOR_BIT, /* Configurable RSS hash function - Xor */
+	ETH_RSS_HASH_CRC32_BIT, /* Configurable RSS hash function - Crc32 */
 /* * Add your fresh new hash function bits above and remember to update @@ -73,6 +74,7 @@ enum {
 #define ETH_RSS_HASH_TOP	__ETH_RSS_HASH(TOP)
 #define ETH_RSS_HASH_XOR	__ETH_RSS_HASH(XOR)
+#define ETH_RSS_HASH_CRC32	__ETH_RSS_HASH(CRC32)
 #define ETH_RSS_HASH_UNKNOWN	0
 #define ETH_RSS_HASH_NO_CHANGE	0
diff --git a/net/core/ethtool.c b/net/core/ethtool.c index aecb2c7241b6..905a88ad28e0 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -109,6 +109,7 @@ static const char rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = {
	[ETH_RSS_HASH_TOP_BIT] = "toeplitz",
	[ETH_RSS_HASH_XOR_BIT] = "xor",
+	[ETH_RSS_HASH_CRC32_BIT] = "crc32",
 }; static const char -- cgit v1.2.3 From d976a525c371276cebd2517349d1d3568a0e48b5 Mon Sep 17 00:00:00 2001 From: Joao Pinto Date: Fri, 10 Mar 2017 18:24:51 +0000 Subject: net: stmmac: multiple queues dt configuration This patch adds the multiple queues configuration in the Device Tree. A set of structures was also created to keep the RX and TX queue configurations used by the driver. Signed-off-by: Joao Pinto Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/stmmac.txt | 40 ++++++++++ .../net/ethernet/stmicro/stmmac/stmmac_platform.c | 91 ++++++++++++++++++++++ include/linux/stmmac.h | 30 +++++++ 3 files changed, 161 insertions(+) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt index 11b27dfd1627..04a258e2d4e0 100644 --- a/Documentation/devicetree/bindings/net/stmmac.txt +++ b/Documentation/devicetree/bindings/net/stmmac.txt @@ -72,6 +72,27 @@ Optional properties: - snps,mb: mixed-burst - snps,rb: rebuild INCRx Burst - mdio: with compatible = "snps,dwmac-mdio", create and register mdio bus.
+- Multiple RX Queues parameters: below the list of all the parameters to + configure the multiple RX queues: + - snps,rx-queues-to-use: number of RX queues to be used in the driver + - Choose one of these RX scheduling algorithms: + - snps,rx-sched-sp: Strict priority + - snps,rx-sched-wsp: Weighted Strict priority + - For each RX queue + - Choose one of these modes: + - snps,dcb-algorithm: Queue to be enabled as DCB + - snps,avb-algorithm: Queue to be enabled as AVB + - snps,map-to-dma-channel: Channel to map +- Multiple TX Queues parameters: below the list of all the parameters to + configure the multiple TX queues: + - snps,tx-queues-to-use: number of TX queues to be used in the driver + - Choose one of these TX scheduling algorithms: + - snps,tx-sched-wrr: Weighted Round Robin + - snps,tx-sched-wfq: Weighted Fair Queuing + - snps,tx-sched-dwrr: Deficit Weighted Round Robin + - snps,tx-sched-sp: Strict priority + - For each TX queue + - snps,weight: TX queue weight (if using a weighted algorithm) Examples: @@ -81,6 +102,23 @@ Examples: snps,blen = <256 128 64 32 0 0 0>; }; + mtl_rx_setup: rx-queues-config { + snps,rx-queues-to-use = <1>; + snps,rx-sched-sp; + queue0 { + snps,dcb-algorithm; + snps,map-to-dma-channel = <0x0>; + }; + }; + + mtl_tx_setup: tx-queues-config { + snps,tx-queues-to-use = <1>; + snps,tx-sched-wrr; + queue0 { + snps,weight = <0x10>; + }; + }; + gmac0: ethernet@e0800000 { compatible = "st,spear600-gmac"; reg = <0xe0800000 0x8000>; @@ -104,4 +142,6 @@ Examples: phy1: ethernet-phy@0 { }; }; + snps,mtl-rx-config = <&mtl_rx_setup>; + snps,mtl-tx-config = <&mtl_tx_setup>; }; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index fe49c3105755..0b76e3de502d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -131,6 +131,95 @@ static struct stmmac_axi *stmmac_axi_setup(struct platform_device *pdev) return axi; } +/** + * stmmac_mtl_setup - parse DT parameters for multiple queues configuration + * @pdev: platform device + */ +static void stmmac_mtl_setup(struct platform_device *pdev, + struct plat_stmmacenet_data *plat) +{ + struct device_node *q_node; + struct device_node *rx_node; + struct device_node *tx_node; + u8 queue = 0; + + rx_node = of_parse_phandle(pdev->dev.of_node, "snps,mtl-rx-config", 0); + if (!rx_node) + return; + + tx_node = of_parse_phandle(pdev->dev.of_node, "snps,mtl-tx-config", 0); + if (!tx_node) { + of_node_put(rx_node); + return; + } + + /* Processing RX queues common config */ + if (of_property_read_u8(rx_node, "snps,rx-queues-to-use", + &plat->rx_queues_to_use)) + plat->rx_queues_to_use = 1; + + if (of_property_read_bool(rx_node, "snps,rx-sched-sp")) + plat->rx_sched_algorithm = MTL_RX_ALGORITHM_SP; + else if (of_property_read_bool(rx_node, "snps,rx-sched-wsp")) + plat->rx_sched_algorithm = MTL_RX_ALGORITHM_WSP; + else + plat->rx_sched_algorithm = MTL_RX_ALGORITHM_SP; + + /* Processing individual RX queue config */ + for_each_child_of_node(rx_node, q_node) { + if (queue >= plat->rx_queues_to_use) + break; + + if (of_property_read_bool(q_node, "snps,dcb-algorithm")) + plat->rx_queues_cfg[queue].mode_to_use = MTL_RX_DCB; + else if (of_property_read_bool(q_node, "snps,avb-algorithm")) + plat->rx_queues_cfg[queue].mode_to_use = MTL_RX_AVB; + else + plat->rx_queues_cfg[queue].mode_to_use = MTL_RX_DCB; + + if (of_property_read_u8(q_node, "snps,map-to-dma-channel", + &plat->rx_queues_cfg[queue].chan)) + 
plat->rx_queues_cfg[queue].chan = queue; + /* TODO: Dynamic mapping to be included in the future */ + + queue++; + } + + /* Processing TX queues common config */ + if (of_property_read_u8(tx_node, "snps,tx-queues-to-use", + &plat->tx_queues_to_use)) + plat->tx_queues_to_use = 1; + + if (of_property_read_bool(tx_node, "snps,tx-sched-wrr")) + plat->tx_sched_algorithm = MTL_TX_ALGORITHM_WRR; + else if (of_property_read_bool(tx_node, "snps,tx-sched-wfq")) + plat->tx_sched_algorithm = MTL_TX_ALGORITHM_WFQ; + else if (of_property_read_bool(tx_node, "snps,tx-sched-dwrr")) + plat->tx_sched_algorithm = MTL_TX_ALGORITHM_DWRR; + else if (of_property_read_bool(tx_node, "snps,tx-sched-sp")) + plat->tx_sched_algorithm = MTL_TX_ALGORITHM_SP; + else + plat->tx_sched_algorithm = MTL_TX_ALGORITHM_SP; + + queue = 0; + + /* Processing individual TX queue config */ + for_each_child_of_node(tx_node, q_node) { + if (queue >= plat->tx_queues_to_use) + break; + + if (of_property_read_u8(q_node, "snps,weight", + &plat->tx_queues_cfg[queue].weight)) + plat->tx_queues_cfg[queue].weight = 0x10 + queue; + + queue++; + } + + of_node_put(rx_node); + of_node_put(tx_node); + of_node_put(q_node); +} + /** * stmmac_dt_phy - parse device-tree driver parameters to allocate PHY resources * @plat: driver data platform structure @@ -340,6 +429,8 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac) plat->axi = stmmac_axi_setup(pdev); + stmmac_mtl_setup(pdev, plat); + /* clock setup */ plat->stmmac_clk = devm_clk_get(&pdev->dev, STMMAC_RESOURCE_NAME); diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index fc273e9d5f67..266ff2af91e5 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -28,6 +28,9 @@ #include +#define MTL_MAX_RX_QUEUES 8 +#define MTL_MAX_TX_QUEUES 8 + #define STMMAC_RX_COE_NONE 0 #define STMMAC_RX_COE_TYPE1 1 #define STMMAC_RX_COE_TYPE2 2 @@ -44,6 +47,18 @@ #define STMMAC_CSR_150_250M 0x4 /* MDC = clk_scr_i/102 */ #define STMMAC_CSR_250_300M 0x5 /* MDC = clk_scr_i/122 */ +/* MTL algorithms identifiers */ +#define MTL_TX_ALGORITHM_WRR 0x0 +#define MTL_TX_ALGORITHM_WFQ 0x1 +#define MTL_TX_ALGORITHM_DWRR 0x2 +#define MTL_TX_ALGORITHM_SP 0x3 +#define MTL_RX_ALGORITHM_SP 0x4 +#define MTL_RX_ALGORITHM_WSP 0x5 + +/* RX Queue Mode */ +#define MTL_RX_DCB 0x0 +#define MTL_RX_AVB 0x1 + /* The MDC clock could be set higher than the IEEE 802.3 * specified frequency limit 0f 2.5 MHz, by programming a clock divider * of value different than the above defined values. The resultant MDIO @@ -109,6 +124,15 @@ struct stmmac_axi { bool axi_rb; }; +struct stmmac_rxq_cfg { + u8 mode_to_use; + u8 chan; +}; + +struct stmmac_txq_cfg { + u8 weight; +}; + struct plat_stmmacenet_data { int bus_id; int phy_addr; @@ -133,6 +157,12 @@ struct plat_stmmacenet_data { int unicast_filter_entries; int tx_fifo_size; int rx_fifo_size; + u8 rx_queues_to_use; + u8 tx_queues_to_use; + u8 rx_sched_algorithm; + u8 tx_sched_algorithm; + struct stmmac_rxq_cfg rx_queues_cfg[MTL_MAX_RX_QUEUES]; + struct stmmac_txq_cfg tx_queues_cfg[MTL_MAX_TX_QUEUES]; void (*fix_mac_speed)(void *priv, unsigned int speed); int (*init)(struct platform_device *pdev, void *priv); void (*exit)(struct platform_device *pdev, void *priv); -- cgit v1.2.3 From 19d9187317979cf0c25f67017d2676149abc46b2 Mon Sep 17 00:00:00 2001 From: Joao Pinto Date: Fri, 10 Mar 2017 18:24:59 +0000 Subject: net: stmmac: configuration of CBS in case of a TX AVB queue This patch adds the configuration of the AVB Credit-Based Shaper. 
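As a rough illustration (not part of this patch), a non-DT platform could fill the new plat_stmmacenet_data fields directly; the credit values below are simply the ones reused from the DT example in the binding, not recommendations:

/* queue 1 as AVB with CBS, mirroring the DT example in the binding */
plat->tx_queues_to_use = 2;
plat->tx_queues_cfg[1].mode_to_use = MTL_QUEUE_AVB;
plat->tx_queues_cfg[1].send_slope = 0x1000;
plat->tx_queues_cfg[1].idle_slope = 0x1000;
plat->tx_queues_cfg[1].high_credit = 0x3E800;
plat->tx_queues_cfg[1].low_credit = 0xFFC18000;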
Signed-off-by: Joao Pinto Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/stmmac.txt | 22 +++++++++-- drivers/net/ethernet/stmicro/stmmac/common.h | 4 ++ drivers/net/ethernet/stmicro/stmmac/dwmac4.h | 33 ++++++++++++++++ drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 46 +++++++++++++++++++++- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 29 ++++++++++++++ .../net/ethernet/stmicro/stmmac/stmmac_platform.c | 29 ++++++++++++-- include/linux/stmmac.h | 12 ++++-- 7 files changed, 164 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt index 04a258e2d4e0..a7b0e41cb264 100644 --- a/Documentation/devicetree/bindings/net/stmmac.txt +++ b/Documentation/devicetree/bindings/net/stmmac.txt @@ -92,8 +92,15 @@ Optional properties:
   - snps,tx-sched-dwrr: Deficit Weighted Round Robin
   - snps,tx-sched-sp: Strict priority
  - For each TX queue
-  - snps,weight: TX queue weight (if using a weighted algorithm)
-
+  - snps,weight: TX queue weight (if using a DCB weight algorithm)
+  - Choose one of these modes:
+   - snps,dcb-algorithm: TX queue will be working in DCB
+   - snps,avb-algorithm: TX queue will be working in AVB
+  - Configure Credit-Based Shaper (if AVB mode selected):
+   - snps,send_slope: CBS send slope value
+   - snps,idle_slope: CBS idle slope value
+   - snps,high_credit: CBS high credit value
+   - snps,low_credit: CBS low credit value
 Examples: stmmac_axi_setup: stmmac-axi-config { snps,blen = <256 128 64 32 0 0 0>; }; @@ -112,10 +119,19 @@ Examples: };
 	mtl_tx_setup: tx-queues-config {
-		snps,tx-queues-to-use = <1>;
+		snps,tx-queues-to-use = <2>;
 		snps,tx-sched-wrr;
 		queue0 {
 			snps,weight = <0x10>;
+			snps,dcb-algorithm;
+		};
+
+		queue1 {
+			snps,avb-algorithm;
+			snps,send_slope = <0x1000>;
+			snps,idle_slope = <0x1000>;
+			snps,high_credit = <0x3E800>;
+			snps,low_credit = <0xFFC18000>;
 		};
 	};
diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index e7fb98569d2d..9f0d26da6813 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -467,6 +467,10 @@ struct stmmac_ops { u32 weight, u32 queue);
 	/* RX MTL queue to RX dma mapping */
 	void (*map_mtl_to_dma)(struct mac_device_info *hw, u32 queue, u32 chan);
+	/* Configure AV Algorithm */
+	void (*config_cbs)(struct mac_device_info *hw, u32 send_slope,
+			   u32 idle_slope, u32 high_credit, u32 low_credit,
+			   u32 queue);
 	/* Dump MAC registers */
 	void (*dump_regs)(struct mac_device_info *hw, u32 *reg_space);
 	/* Handle extra events on specific interrupts hw dependent */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h index 1ddf75c0eab5..54bcdb4d10db 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h @@ -233,6 +233,15 @@ enum power_event {
 #define MTL_OP_MODE_RTC_96	(2 << MTL_OP_MODE_RTC_SHIFT)
 #define MTL_OP_MODE_RTC_128	(3 << MTL_OP_MODE_RTC_SHIFT)
+
+/* MTL ETS Control register */
+#define MTL_ETS_CTRL_BASE_ADDR		0x00000d10
+#define MTL_ETS_CTRL_BASE_OFFSET	0x40
+#define MTL_ETSX_CTRL_BASE_ADDR(x)	(MTL_ETS_CTRL_BASE_ADDR + \
+					((x) * MTL_ETS_CTRL_BASE_OFFSET))
+
+#define MTL_ETS_CTRL_CC			BIT(3)
+#define MTL_ETS_CTRL_AVALG		BIT(2)
+
 /* MTL Queue Quantum Weight */ #define MTL_TXQ_WEIGHT_BASE_ADDR 0x00000d18 #define MTL_TXQ_WEIGHT_BASE_OFFSET 0x40 @@ -240,6 +249,30 @@ enum power_event { ((x) * MTL_TXQ_WEIGHT_BASE_OFFSET)) #define
MTL_TXQ_WEIGHT_ISCQW_MASK GENMASK(20, 0) +/* MTL sendSlopeCredit register */ +#define MTL_SEND_SLP_CRED_BASE_ADDR 0x00000d1c +#define MTL_SEND_SLP_CRED_OFFSET 0x40 +#define MTL_SEND_SLP_CREDX_BASE_ADDR(x) (MTL_SEND_SLP_CRED_BASE_ADDR + \ + ((x) * MTL_SEND_SLP_CRED_OFFSET)) + +#define MTL_SEND_SLP_CRED_SSC_MASK GENMASK(13, 0) + +/* MTL hiCredit register */ +#define MTL_HIGH_CRED_BASE_ADDR 0x00000d20 +#define MTL_HIGH_CRED_OFFSET 0x40 +#define MTL_HIGH_CREDX_BASE_ADDR(x) (MTL_HIGH_CRED_BASE_ADDR + \ + ((x) * MTL_HIGH_CRED_OFFSET)) + +#define MTL_HIGH_CRED_HC_MASK GENMASK(28, 0) + +/* MTL loCredit register */ +#define MTL_LOW_CRED_BASE_ADDR 0x00000d24 +#define MTL_LOW_CRED_OFFSET 0x40 +#define MTL_LOW_CREDX_BASE_ADDR(x) (MTL_LOW_CRED_BASE_ADDR + \ + ((x) * MTL_LOW_CRED_OFFSET)) + +#define MTL_HIGH_CRED_LC_MASK GENMASK(28, 0) + /* MTL debug */ #define MTL_DEBUG_TXSTSFSTS BIT(5) #define MTL_DEBUG_TXFSTS BIT(4) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index 670cfee415fd..10599dbc232f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -66,9 +66,9 @@ static void dwmac4_rx_queue_enable(struct mac_device_info *hw, u32 value = readl(ioaddr + GMAC_RXQ_CTRL0); value &= GMAC_RX_QUEUE_CLEAR(queue); - if (mode == MTL_RX_AVB) + if (mode == MTL_QUEUE_AVB) value |= GMAC_RX_AV_QUEUE_ENABLE(queue); - else if (mode == MTL_RX_DCB) + else if (mode == MTL_QUEUE_DCB) value |= GMAC_RX_DCB_QUEUE_ENABLE(queue); writel(value, ioaddr + GMAC_RXQ_CTRL0); @@ -155,6 +155,47 @@ static void dwmac4_map_mtl_dma(struct mac_device_info *hw, u32 queue, u32 chan) writel(value, ioaddr + MTL_RXQ_DMA_MAP1); } +static void dwmac4_config_cbs(struct mac_device_info *hw, + u32 send_slope, u32 idle_slope, + u32 high_credit, u32 low_credit, u32 queue) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + + pr_debug("Queue %d configured as AVB. 
Parameters:\n", queue);
+	pr_debug("\tsend_slope: 0x%08x\n", send_slope);
+	pr_debug("\tidle_slope: 0x%08x\n", idle_slope);
+	pr_debug("\thigh_credit: 0x%08x\n", high_credit);
+	pr_debug("\tlow_credit: 0x%08x\n", low_credit);
+
+	/* enable AV algorithm */
+	value = readl(ioaddr + MTL_ETSX_CTRL_BASE_ADDR(queue));
+	value |= MTL_ETS_CTRL_AVALG;
+	value |= MTL_ETS_CTRL_CC;
+	writel(value, ioaddr + MTL_ETSX_CTRL_BASE_ADDR(queue));
+
+	/* configure send slope */
+	value = readl(ioaddr + MTL_SEND_SLP_CREDX_BASE_ADDR(queue));
+	value &= ~MTL_SEND_SLP_CRED_SSC_MASK;
+	value |= send_slope & MTL_SEND_SLP_CRED_SSC_MASK;
+	writel(value, ioaddr + MTL_SEND_SLP_CREDX_BASE_ADDR(queue));
+
+	/* configure idle slope (same register as tx weight) */
+	dwmac4_set_mtl_tx_queue_weight(hw, idle_slope, queue);
+
+	/* configure high credit */
+	value = readl(ioaddr + MTL_HIGH_CREDX_BASE_ADDR(queue));
+	value &= ~MTL_HIGH_CRED_HC_MASK;
+	value |= high_credit & MTL_HIGH_CRED_HC_MASK;
+	writel(value, ioaddr + MTL_HIGH_CREDX_BASE_ADDR(queue));
+
+	/* configure low credit */
+	value = readl(ioaddr + MTL_LOW_CREDX_BASE_ADDR(queue));
+	value &= ~MTL_HIGH_CRED_LC_MASK;
+	value |= low_credit & MTL_HIGH_CRED_LC_MASK;
+	writel(value, ioaddr + MTL_LOW_CREDX_BASE_ADDR(queue));
+}
+
 static void dwmac4_dump_regs(struct mac_device_info *hw, u32 *reg_space) { void __iomem *ioaddr = hw->pcsr; @@ -566,6 +607,7 @@ static const struct stmmac_ops dwmac4_ops = {
 	.prog_mtl_tx_algorithms = dwmac4_prog_mtl_tx_algorithms,
 	.set_mtl_tx_queue_weight = dwmac4_set_mtl_tx_queue_weight,
 	.map_mtl_to_dma = dwmac4_map_mtl_dma,
+	.config_cbs = dwmac4_config_cbs,
 	.dump_regs = dwmac4_dump_regs,
 	.host_irq_status = dwmac4_irq_status,
 	.host_mtl_irq_status = dwmac4_irq_mtl_status,
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 2449487be534..915636ff2fc1 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1670,6 +1670,31 @@ static void stmmac_set_tx_queue_weight(struct stmmac_priv *priv) } }
+
+/**
+ * stmmac_configure_cbs - Configure CBS in TX queue
+ * @priv: driver private structure
+ * Description: It is used for configuring CBS in AVB TX queues
+ */
+static void stmmac_configure_cbs(struct stmmac_priv *priv)
+{
+	u32 tx_queues_count = priv->plat->tx_queues_to_use;
+	u32 mode_to_use;
+	u32 queue;
+
+	for (queue = 0; queue < tx_queues_count; queue++) {
+		mode_to_use = priv->plat->tx_queues_cfg[queue].mode_to_use;
+		if (mode_to_use == MTL_QUEUE_DCB)
+			continue;
+
+		priv->hw->mac->config_cbs(priv->hw,
+				priv->plat->tx_queues_cfg[queue].send_slope,
+				priv->plat->tx_queues_cfg[queue].idle_slope,
+				priv->plat->tx_queues_cfg[queue].high_credit,
+				priv->plat->tx_queues_cfg[queue].low_credit,
+				queue);
+	}
+}
+
 /** * stmmac_rx_queue_dma_chan_map - Map RX queue to RX dma channel * @priv: driver private structure @@ -1710,6 +1735,10 @@ static void stmmac_mtl_configuration(struct stmmac_priv *priv)
 		priv->hw->mac->prog_mtl_tx_algorithms(priv->hw,
 						priv->plat->tx_sched_algorithm);
+
+	/* Configure CBS in AVB TX queues */
+	if (tx_queues_count > 1 && priv->hw->mac->config_cbs)
+		stmmac_configure_cbs(priv);
+
 	/* Map RX MTL to DMA channels */
 	if (rx_queues_count > 1 && priv->hw->mac->map_mtl_to_dma)
 		stmmac_rx_queue_dma_chan_map(priv);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 0b76e3de502d..37f550ae76a5 100644 ---
a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -171,11 +171,11 @@ static void stmmac_mtl_setup(struct platform_device *pdev, break; if (of_property_read_bool(q_node, "snps,dcb-algorithm")) - plat->rx_queues_cfg[queue].mode_to_use = MTL_RX_DCB; + plat->rx_queues_cfg[queue].mode_to_use = MTL_QUEUE_DCB; else if (of_property_read_bool(q_node, "snps,avb-algorithm")) - plat->rx_queues_cfg[queue].mode_to_use = MTL_RX_AVB; + plat->rx_queues_cfg[queue].mode_to_use = MTL_QUEUE_AVB; else - plat->rx_queues_cfg[queue].mode_to_use = MTL_RX_DCB; + plat->rx_queues_cfg[queue].mode_to_use = MTL_QUEUE_DCB; if (of_property_read_u8(q_node, "snps,map-to-dma-channel", &plat->rx_queues_cfg[queue].chan)) @@ -212,6 +212,29 @@ static void stmmac_mtl_setup(struct platform_device *pdev, &plat->tx_queues_cfg[queue].weight)) plat->tx_queues_cfg[queue].weight = 0x10 + queue; + if (of_property_read_bool(q_node, "snps,dcb-algorithm")) { + plat->tx_queues_cfg[queue].mode_to_use = MTL_QUEUE_DCB; + } else if (of_property_read_bool(q_node, + "snps,avb-algorithm")) { + plat->tx_queues_cfg[queue].mode_to_use = MTL_QUEUE_AVB; + + /* Credit Base Shaper parameters used by AVB */ + if (of_property_read_u32(q_node, "snps,send_slope", + &plat->tx_queues_cfg[queue].send_slope)) + plat->tx_queues_cfg[queue].send_slope = 0x0; + if (of_property_read_u32(q_node, "snps,idle_slope", + &plat->tx_queues_cfg[queue].idle_slope)) + plat->tx_queues_cfg[queue].idle_slope = 0x0; + if (of_property_read_u32(q_node, "snps,high_credit", + &plat->tx_queues_cfg[queue].high_credit)) + plat->tx_queues_cfg[queue].high_credit = 0x0; + if (of_property_read_u32(q_node, "snps,low_credit", + &plat->tx_queues_cfg[queue].low_credit)) + plat->tx_queues_cfg[queue].low_credit = 0x0; + } else { + plat->tx_queues_cfg[queue].mode_to_use = MTL_QUEUE_DCB; + } + queue++; } diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 266ff2af91e5..be47b859e954 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -55,9 +55,9 @@ #define MTL_RX_ALGORITHM_SP 0x4 #define MTL_RX_ALGORITHM_WSP 0x5 -/* RX Queue Mode */ -#define MTL_RX_DCB 0x0 -#define MTL_RX_AVB 0x1 +/* RX/TX Queue Mode */ +#define MTL_QUEUE_DCB 0x0 +#define MTL_QUEUE_AVB 0x1 /* The MDC clock could be set higher than the IEEE 802.3 * specified frequency limit 0f 2.5 MHz, by programming a clock divider @@ -131,6 +131,12 @@ struct stmmac_rxq_cfg { struct stmmac_txq_cfg { u8 weight; + u8 mode_to_use; + /* Credit Base Shaper parameters */ + u32 send_slope; + u32 idle_slope; + u32 high_credit; + u32 low_credit; }; struct plat_stmmacenet_data { -- cgit v1.2.3 From bd174169c7a12a37b3b4aa2221f084ade010b182 Mon Sep 17 00:00:00 2001 From: David Windsor Date: Fri, 10 Mar 2017 10:34:12 -0500 Subject: locking/refcount: Add refcount_t API kernel-doc comments Signed-off-by: David Windsor Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Thomas Gleixner Cc: elena.reshetova@intel.com Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/1489160052-20293-1-git-send-email-dwindsor@gmail.com Signed-off-by: Ingo Molnar --- include/linux/refcount.h | 19 ++++++++ lib/refcount.c | 122 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 129 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 0023fee4bbbc..b34aa649d204 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -6,17 +6,36 @@ #include #include +/** + * refcount_t - variant of atomic_t specialized for reference counts + * @refs: atomic_t counter field + * + * The counter saturates at UINT_MAX and will not move once + * there. This avoids wrapping the counter and causing 'spurious' + * use-after-free bugs. + */ typedef struct refcount_struct { atomic_t refs; } refcount_t; #define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), } +/** + * refcount_set - set a refcount's value + * @r: the refcount + * @n: value to which the refcount will be set + */ static inline void refcount_set(refcount_t *r, unsigned int n) { atomic_set(&r->refs, n); } +/** + * refcount_read - get a refcount's value + * @r: the refcount + * + * Return: the refcount's value + */ static inline unsigned int refcount_read(const refcount_t *r) { return atomic_read(&r->refs); diff --git a/lib/refcount.c b/lib/refcount.c index aa09ad3c30b0..8e206ce5609b 100644 --- a/lib/refcount.c +++ b/lib/refcount.c @@ -37,6 +37,24 @@ #include #include +/** + * refcount_add_not_zero - add a value to a refcount unless it is 0 + * @i: the value to add to the refcount + * @r: the refcount + * + * Will saturate at UINT_MAX and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc(), or one of its variants, should instead be used to + * increment a reference count. + * + * Return: false if the passed refcount is 0, true otherwise + */ bool refcount_add_not_zero(unsigned int i, refcount_t *r) { unsigned int old, new, val = atomic_read(&r->refs); @@ -64,18 +82,39 @@ bool refcount_add_not_zero(unsigned int i, refcount_t *r) } EXPORT_SYMBOL_GPL(refcount_add_not_zero); +/** + * refcount_add - add a value to a refcount + * @i: the value to add to the refcount + * @r: the refcount + * + * Similar to atomic_add(), but will saturate at UINT_MAX and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc(), or one of its variants, should instead be used to + * increment a reference count. + */ void refcount_add(unsigned int i, refcount_t *r) { WARN_ONCE(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n"); } EXPORT_SYMBOL_GPL(refcount_add); -/* - * Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN. 
+/** + * refcount_inc_not_zero - increment a refcount unless it is 0 + * @r: the refcount to increment + * + * Similar to atomic_inc_not_zero(), but will saturate at UINT_MAX and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. + * + * Return: true if the increment was successful, false otherwise */ bool refcount_inc_not_zero(refcount_t *r) { @@ -103,11 +142,17 @@ bool refcount_inc_not_zero(refcount_t *r) } EXPORT_SYMBOL_GPL(refcount_inc_not_zero); -/* - * Similar to atomic_inc(), will saturate at UINT_MAX and WARN. +/** + * refcount_inc - increment a refcount + * @r: the refcount to increment + * + * Similar to atomic_inc(), but will saturate at UINT_MAX and WARN. * * Provides no memory ordering, it is assumed the caller already has a - * reference on the object, will WARN when this is not so. + * reference on the object. + * + * Will WARN if the refcount is 0, as this represents a possible use-after-free + * condition. */ void refcount_inc(refcount_t *r) { @@ -115,6 +160,26 @@ void refcount_inc(refcount_t *r) } EXPORT_SYMBOL_GPL(refcount_inc); +/** + * refcount_sub_and_test - subtract from a refcount and test if it is 0 + * @i: amount to subtract from the refcount + * @r: the refcount + * + * Similar to atomic_dec_and_test(), but it will WARN, return false and + * ultimately leak on underflow and will fail to decrement when saturated + * at UINT_MAX. + * + * Provides release memory ordering, such that prior loads and stores are done + * before, and provides a control dependency such that free() must come after. + * See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_dec(), or one of its variants, should instead be used to + * decrement a reference count. + * + * Return: true if the resulting refcount is 0, false otherwise + */ bool refcount_sub_and_test(unsigned int i, refcount_t *r) { unsigned int old, new, val = atomic_read(&r->refs); @@ -140,13 +205,18 @@ bool refcount_sub_and_test(unsigned int i, refcount_t *r) } EXPORT_SYMBOL_GPL(refcount_sub_and_test); -/* +/** + * refcount_dec_and_test - decrement a refcount and test if it is 0 + * @r: the refcount + * * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to * decrement when saturated at UINT_MAX. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. + * + * Return: true if the resulting refcount is 0, false otherwise */ bool refcount_dec_and_test(refcount_t *r) { @@ -154,21 +224,26 @@ bool refcount_dec_and_test(refcount_t *r) } EXPORT_SYMBOL_GPL(refcount_dec_and_test); -/* +/** + * refcount_dec - decrement a refcount + * @r: the refcount + * * Similar to atomic_dec(), it will WARN on underflow and fail to decrement * when saturated at UINT_MAX. * * Provides release memory ordering, such that prior loads and stores are done * before. 
*/ - void refcount_dec(refcount_t *r) { WARN_ONCE(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n"); } EXPORT_SYMBOL_GPL(refcount_dec); -/* +/** + * refcount_dec_if_one - decrement a refcount if it is 1 + * @r: the refcount + * * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the * success thereof. * @@ -178,6 +253,8 @@ EXPORT_SYMBOL_GPL(refcount_dec); * It can be used like a try-delete operator; this explicit case is provided * and not cmpxchg in generic, because that would allow implementing unsafe * operations. + * + * Return: true if the resulting refcount is 0, false otherwise */ bool refcount_dec_if_one(refcount_t *r) { @@ -185,11 +262,16 @@ bool refcount_dec_if_one(refcount_t *r) } EXPORT_SYMBOL_GPL(refcount_dec_if_one); -/* +/** + * refcount_dec_not_one - decrement a refcount if it is not 1 + * @r: the refcount + * * No atomic_t counterpart, it decrements unless the value is 1, in which case * it will return false. * * Was often done like: atomic_add_unless(&var, -1, 1) + * + * Return: true if the decrement operation was successful, false otherwise */ bool refcount_dec_not_one(refcount_t *r) { @@ -219,13 +301,21 @@ bool refcount_dec_not_one(refcount_t *r) } EXPORT_SYMBOL_GPL(refcount_dec_not_one); -/* +/** + * refcount_dec_and_mutex_lock - return holding mutex if able to decrement + * refcount to 0 + * @r: the refcount + * @lock: the mutex to be locked + * * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail * to decrement when saturated at UINT_MAX. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. + * + * Return: true and hold mutex if able to decrement refcount to 0, false + * otherwise */ bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) { @@ -242,13 +332,21 @@ bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) } EXPORT_SYMBOL_GPL(refcount_dec_and_mutex_lock); -/* +/** + * refcount_dec_and_lock - return holding spinlock if able to decrement + * refcount to 0 + * @r: the refcount + * @lock: the spinlock to be locked + * * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to * decrement when saturated at UINT_MAX. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. + * + * Return: true and hold spinlock if able to decrement refcount to 0, false + * otherwise */ bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) { -- cgit v1.2.3 From 0c68f666d4cc8835ed888ffdd58f76d4d8e2da51 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Sat, 11 Mar 2017 16:13:03 -0500 Subject: etherdevice: remove unused eth_addr_greater eth_addr_greater() was introduced for the mv88e6xxx driver, but is not used anymore. There is no other user, thus remove this function. Signed-off-by: Vivien Didelot Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/linux/etherdevice.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index c62b709b1ce0..2d9f80848d4b 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -446,21 +446,6 @@ static inline void eth_addr_dec(u8 *addr) u64_to_ether_addr(u, addr); } -/** - * ether_addr_greater - Compare two Ethernet addresses - * @addr1: Pointer to a six-byte array containing the Ethernet address - * @addr2: Pointer other six-byte array containing the Ethernet address - * - * Compare two Ethernet addresses, returns true addr1 is greater than addr2 - */ -static inline bool ether_addr_greater(const u8 *addr1, const u8 *addr2) -{ - u64 u1 = ether_addr_to_u64(addr1); - u64 u2 = ether_addr_to_u64(addr2); - - return u1 > u2; -} - /** * is_etherdev_addr - Tell if given Ethernet address belongs to the device. * @dev: Pointer to a device structure -- cgit v1.2.3 From 1b028f784e8c341e762c264f70dc0ca1418c8b7a Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 6 Mar 2017 17:17:19 +0300 Subject: x86/mm: Introduce mmap_compat_base() for 32-bit mmap() mmap() uses a base address, from which it starts to look for free space for allocation. The base address is stored in mm->mmap_base, which is calculated during exec(). The address depends on the task's size, the rlimit set for the stack, and ASLR randomization. The base depends on the task size and the number of random bits, which are different for 64-bit and 32-bit applications. Because the base address is fixed, an mmap() from a compat (32-bit) syscall issued by a 64-bit task will return an address which is based on the 64-bit base address and does not fit into the 32-bit address space (4GB). The returned pointer is truncated to 32 bits, which results in an invalid address. To solve this, store a separate compat address base plus a compat legacy address base in mm_struct. These bases are calculated at exec() time and can be used later to serve the 32-bit compat mmap() issued by 64-bit applications. As a consequence of this change, 32-bit applications issuing a 64-bit syscall (after doing a long jump) will now get a 64-bit mapping. Before this change, 32-bit applications always got a 32-bit mapping. [ tglx: Massaged changelog and added a comment ] Signed-off-by: Dmitry Safonov Cc: 0x7f454c46@gmail.com Cc: linux-mm@kvack.org Cc: Andy Lutomirski Cc: Cyrill Gorcunov Cc: Borislav Petkov Cc: "Kirill A. Shutemov" Link: http://lkml.kernel.org/r/20170306141721.9188-4-dsafonov@virtuozzo.com Signed-off-by: Thomas Gleixner --- arch/Kconfig | 7 +++++++ arch/x86/Kconfig | 1 + arch/x86/include/asm/elf.h | 3 +++ arch/x86/kernel/sys_x86_64.c | 23 ++++++++++++++++++---- arch/x86/mm/mmap.c | 47 ++++++++++++++++++++++++++++++------------ include/linux/mm_types.h | 5 +++++ 6 files changed, 69 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index cd211a14a88f..c4d6833aacd9 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -700,6 +700,13 @@ config ARCH_MMAP_RND_COMPAT_BITS This value can be changed after boot using the /proc/sys/vm/mmap_rnd_compat_bits tunable +config HAVE_ARCH_COMPAT_MMAP_BASES + bool + help + This allows 64bit applications to invoke 32-bit mmap() syscall + and vice-versa 32-bit applications to call 64-bit mmap(). + Required for applications doing different bitness syscalls. 
+ config HAVE_COPY_THREAD_TLS bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc98d5a294ee..2bab9d093b51 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -106,6 +106,7 @@ config X86 select HAVE_ARCH_KMEMCHECK select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT + select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index b908141cf0c4..ac5be5ba8527 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -303,6 +303,9 @@ static inline int mmap_is_ia32(void) test_thread_flag(TIF_ADDR32)); } +extern unsigned long tasksize_32bit(void); +extern unsigned long tasksize_64bit(void); + #ifdef CONFIG_X86_32 #define __STACK_RND_MASK(is32bit) (0x7ff) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 50215a4b9347..c54817baabc7 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -17,6 +17,8 @@ #include #include +#include +#include #include #include @@ -98,6 +100,18 @@ out: return error; } +static unsigned long get_mmap_base(int is_legacy) +{ + struct mm_struct *mm = current->mm; + +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + if (in_compat_syscall()) + return is_legacy ? mm->mmap_compat_legacy_base + : mm->mmap_compat_base; +#endif + return is_legacy ? mm->mmap_legacy_base : mm->mmap_base; +} + static void find_start_end(unsigned long flags, unsigned long *begin, unsigned long *end) { @@ -114,10 +128,11 @@ static void find_start_end(unsigned long flags, unsigned long *begin, if (current->flags & PF_RANDOMIZE) { *begin = randomize_page(*begin, 0x02000000); } - } else { - *begin = current->mm->mmap_legacy_base; - *end = TASK_SIZE; + return; } + + *begin = get_mmap_base(1); + *end = in_compat_syscall() ? tasksize_32bit() : tasksize_64bit(); } unsigned long @@ -191,7 +206,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; - info.high_limit = mm->mmap_base; + info.high_limit = get_mmap_base(0); info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; if (filp) { diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 1e9cb945dca1..529ab79800af 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -36,11 +36,16 @@ struct va_alignment __read_mostly va_align = { .flags = -1, }; -static inline unsigned long tasksize_32bit(void) +unsigned long tasksize_32bit(void) { return IA32_PAGE_OFFSET; } +unsigned long tasksize_64bit(void) +{ + return TASK_SIZE_MAX; +} + static unsigned long stack_maxrandom_size(unsigned long task_size) { unsigned long max = 0; @@ -81,6 +86,8 @@ static unsigned long arch_rnd(unsigned int rndbits) unsigned long arch_mmap_rnd(void) { + if (!(current->flags & PF_RANDOMIZE)) + return 0; return arch_rnd(mmap_is_ia32() ? 
mmap32_rnd_bits : mmap64_rnd_bits); } @@ -114,22 +121,36 @@ static unsigned long mmap_legacy_base(unsigned long rnd, * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm) +static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, + unsigned long random_factor, unsigned long task_size) { - unsigned long random_factor = 0UL; - - if (current->flags & PF_RANDOMIZE) - random_factor = arch_mmap_rnd(); - - mm->mmap_legacy_base = mmap_legacy_base(random_factor, TASK_SIZE); + *legacy_base = mmap_legacy_base(random_factor, task_size); + if (mmap_is_legacy()) + *base = *legacy_base; + else + *base = mmap_base(random_factor, task_size); +} - if (mmap_is_legacy()) { - mm->mmap_base = mm->mmap_legacy_base; +void arch_pick_mmap_layout(struct mm_struct *mm) +{ + if (mmap_is_legacy()) mm->get_unmapped_area = arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(random_factor, TASK_SIZE); + else mm->get_unmapped_area = arch_get_unmapped_area_topdown; - } + + arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, + arch_rnd(mmap64_rnd_bits), tasksize_64bit()); + +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + /* + * The mmap syscall mapping base decision depends solely on the + * syscall type (64-bit or compat). This applies for 64bit + * applications and 32bit applications. The 64bit syscall uses + * mmap_base, the compat syscall uses mmap_compat_base. + */ + arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, + arch_rnd(mmap32_rnd_bits), tasksize_32bit()); +#endif } const char *arch_vma_name(struct vm_area_struct *vma) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f60f45fe226f..45cdb27791a3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -367,6 +367,11 @@ struct mm_struct { #endif unsigned long mmap_base; /* base of mmap area */ unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + /* Base adresses for compatible mmap() */ + unsigned long mmap_compat_base; + unsigned long mmap_compat_legacy_base; +#endif unsigned long task_size; /* size of task vm space */ unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; -- cgit v1.2.3 From c6182ac96096f35c7216e4e6a3c64c7374dadeb7 Mon Sep 17 00:00:00 2001 From: George McCollister Date: Thu, 9 Mar 2017 08:14:43 -0600 Subject: regulator: pfuze100-regulator: add coin support Add support for the PF0200 coin cell/supercapacitor charger, which works as a current-limited voltage source via the LICELL pin. When VIN goes below a certain threshold, LICELL is used to provide power for VSNVS, which is usually used to hold up secure non-volatile storage and the real-time clock on the SoC. 
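[ Editor's note: for orientation, a consumer driver could pick up the new coin supply through the standard regulator consumer API. This is a minimal sketch, not part of the patch; the probing device and the "coin" supply mapping are assumptions. ]

#include <linux/platform_device.h>
#include <linux/regulator/consumer.h>

static int example_probe(struct platform_device *pdev)
{
	struct regulator *coin;
	int ret;

	/* assumes a coin-supply property on the consumer's DT node */
	coin = devm_regulator_get(&pdev->dev, "coin");
	if (IS_ERR(coin))
		return PTR_ERR(coin);

	/* charge the cell at 3.0 V, one of the pfuze100_coin table values */
	ret = regulator_set_voltage(coin, 3000000, 3000000);
	if (ret)
		return ret;

	return regulator_enable(coin);
}

Note that the device tree example in the diff below marks the coin regulator always-on, in which case the explicit enable is redundant.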
Signed-off-by: George McCollister Signed-off-by: Mark Brown --- .../devicetree/bindings/regulator/pfuze100.txt | 8 +++++++- drivers/regulator/pfuze100-regulator.c | 24 ++++++++++++++++++++++ include/linux/regulator/pfuze100.h | 1 + 3 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/regulator/pfuze100.txt b/Documentation/devicetree/bindings/regulator/pfuze100.txt index 9b40db88f637..444c47831a40 100644 --- a/Documentation/devicetree/bindings/regulator/pfuze100.txt +++ b/Documentation/devicetree/bindings/regulator/pfuze100.txt @@ -13,7 +13,7 @@ Required child node: --PFUZE100 sw1ab,sw1c,sw2,sw3a,sw3b,sw4,swbst,vsnvs,vrefddr,vgen1~vgen6 --PFUZE200 - sw1ab,sw2,sw3a,sw3b,swbst,vsnvs,vrefddr,vgen1~vgen6 + sw1ab,sw2,sw3a,sw3b,swbst,vsnvs,vrefddr,vgen1~vgen6,coin --PFUZE3000 sw1a,sw1b,sw2,sw3,swbst,vsnvs,vrefddr,vldo1,vldo2,vccsd,v33,vldo3,vldo4 @@ -205,6 +205,12 @@ Example 2: PFUZE200 regulator-max-microvolt = <3300000>; regulator-always-on; }; + + coin_reg: coin { + regulator-min-microvolt = <2500000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + }; }; }; diff --git a/drivers/regulator/pfuze100-regulator.c b/drivers/regulator/pfuze100-regulator.c index cb18b5c4f2db..716abcc834c8 100644 --- a/drivers/regulator/pfuze100-regulator.c +++ b/drivers/regulator/pfuze100-regulator.c @@ -40,6 +40,7 @@ #define PFUZE100_REVID 0x3 #define PFUZE100_FABID 0x4 +#define PFUZE100_COINVOL 0x1a #define PFUZE100_SW1ABVOL 0x20 #define PFUZE100_SW1CVOL 0x2e #define PFUZE100_SW2VOL 0x35 @@ -81,6 +82,10 @@ static const int pfuze100_vsnvs[] = { 1000000, 1100000, 1200000, 1300000, 1500000, 1800000, 3000000, }; +static const int pfuze100_coin[] = { + 2500000, 2700000, 2800000, 2900000, 3000000, 3100000, 3200000, 3300000, +}; + static const int pfuze3000_sw2lo[] = { 1500000, 1550000, 1600000, 1650000, 1700000, 1750000, 1800000, 1850000, }; @@ -230,6 +235,23 @@ static struct regulator_ops pfuze100_swb_regulator_ops = { .stby_mask = 0x20, \ } +#define PFUZE100_COIN_REG(_chip, _name, base, mask, voltages) \ + [_chip ## _ ## _name] = { \ + .desc = { \ + .name = #_name, \ + .n_voltages = ARRAY_SIZE(voltages), \ + .ops = &pfuze100_swb_regulator_ops, \ + .type = REGULATOR_VOLTAGE, \ + .id = _chip ## _ ## _name, \ + .owner = THIS_MODULE, \ + .volt_table = voltages, \ + .vsel_reg = (base), \ + .vsel_mask = (mask), \ + .enable_reg = (base), \ + .enable_mask = 0x8, \ + }, \ + } + #define PFUZE3000_VCC_REG(_chip, _name, base, min, max, step) { \ .desc = { \ .name = #_name, \ @@ -317,6 +339,7 @@ static struct pfuze_regulator pfuze200_regulators[] = { PFUZE100_VGEN_REG(PFUZE200, VGEN4, PFUZE100_VGEN4VOL, 1800000, 3300000, 100000), PFUZE100_VGEN_REG(PFUZE200, VGEN5, PFUZE100_VGEN5VOL, 1800000, 3300000, 100000), PFUZE100_VGEN_REG(PFUZE200, VGEN6, PFUZE100_VGEN6VOL, 1800000, 3300000, 100000), + PFUZE100_COIN_REG(PFUZE200, COIN, PFUZE100_COINVOL, 0x7, pfuze100_coin), }; static struct pfuze_regulator pfuze3000_regulators[] = { @@ -371,6 +394,7 @@ static struct of_regulator_match pfuze200_matches[] = { { .name = "vgen4", }, { .name = "vgen5", }, { .name = "vgen6", }, + { .name = "coin", }, }; /* PFUZE3000 */ diff --git a/include/linux/regulator/pfuze100.h b/include/linux/regulator/pfuze100.h index 70c6c66c5bcf..e0ccf46f66cf 100644 --- a/include/linux/regulator/pfuze100.h +++ b/include/linux/regulator/pfuze100.h @@ -48,6 +48,7 @@ #define PFUZE200_VGEN4 10 #define PFUZE200_VGEN5 11 #define PFUZE200_VGEN6 12 +#define PFUZE200_COIN 13 #define 
PFUZE3000_SW1A 0 #define PFUZE3000_SW1B 1 -- cgit v1.2.3 From e422267322cd319e2695a535e47c5b1feeac45eb Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 8 Mar 2017 02:11:36 +0530 Subject: perf: Add PERF_RECORD_NAMESPACES to include namespaces related info With the advent of container technologies like docker, which depend on namespaces for isolation, there is a need for tracing support for namespaces. This patch introduces a new PERF_RECORD_NAMESPACES event for recording namespaces related info. By recording info for every namespace, it is left to userspace to decide on the definition of a container and to trace containers by updating the perf tool accordingly. Each namespace has a combination of device and inode numbers. Though every namespace has the same device number currently, that may change in the future to avoid the need for a namespace of namespaces. Considering that possibility, record both device and inode numbers separately for each namespace. Signed-off-by: Hari Bathini Acked-by: Jiri Olsa Acked-by: Peter Zijlstra Cc: Alexander Shishkin Cc: Alexei Starovoitov Cc: Ananth N Mavinakayanahalli Cc: Aravinda Prasad Cc: Brendan Gregg Cc: Daniel Borkmann Cc: Eric Biederman Cc: Sargun Dhillon Cc: Steven Rostedt Link: http://lkml.kernel.org/r/148891929686.25309.2827618988917007768.stgit@hbathini.in.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 2 + include/uapi/linux/perf_event.h | 32 ++++++++- kernel/events/core.c | 139 ++++++++++++++++++++++++++++++++++++++++ kernel/fork.c | 2 + kernel/nsproxy.c | 3 + 5 files changed, 177 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 000fdb211c7d..f19a82362851 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1112,6 +1112,7 @@ extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); +extern void perf_event_namespaces(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); /* Callchains */ @@ -1315,6 +1316,7 @@ static inline int perf_unregister_guest_info_callbacks static inline void perf_event_mmap(struct vm_area_struct *vma) { } static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } +static inline void perf_event_namespaces(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index c66a485a24ac..bec0aad0e15c 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -344,7 +344,8 @@ struct perf_event_attr { use_clockid : 1, /* use @clockid for time fields */ context_switch : 1, /* context switch data */ write_backward : 1, /* Write ring buffer from end to beginning */ - __reserved_1 : 36; + namespaces : 1, /* include namespaces data */ + __reserved_1 : 35; union { __u32 wakeup_events; /* wakeup every n events */ @@ -610,6 +611,23 @@ struct perf_event_header { __u16 size; }; +struct perf_ns_link_info { + __u64 dev; + __u64 ino; +}; + +enum { + NET_NS_INDEX = 0, + UTS_NS_INDEX = 1, + IPC_NS_INDEX = 2, + PID_NS_INDEX = 3, + USER_NS_INDEX = 4, + MNT_NS_INDEX = 5, + CGROUP_NS_INDEX = 6, + + NR_NAMESPACES, /* number of available namespaces */ +};
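/*
 * Editor's illustration, not part of this patch: minimal user-space use of
 * the new attr.namespaces bit. It opens a dummy software event on @pid with
 * namespace records enabled; CAP_SYS_ADMIN is required by the permission
 * check this patch adds to perf_event_open(). perf_event_open() has no
 * glibc wrapper, hence the raw syscall(2).
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_dummy_with_namespaces(pid_t pid)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_SOFTWARE,
		.config		= PERF_COUNT_SW_DUMMY,
		.size		= sizeof(attr),
		.namespaces	= 1,	/* request PERF_RECORD_NAMESPACES */
		.sample_id_all	= 1,
	};

	/* pid: task to trace; cpu = -1: any CPU; no group, no flags */
	return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}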
+ enum perf_event_type { /* @@ -862,6 +880,18 @@ enum perf_event_type { */ PERF_RECORD_SWITCH_CPU_WIDE = 15, + /* + * struct { + * struct perf_event_header header; + * u32 pid; + * u32 tid; + * u64 nr_namespaces; + * { u64 dev, inode; } [nr_namespaces]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_NAMESPACES = 16, + PERF_RECORD_MAX, /* non-ABI */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 6f41548f2e32..16c877a121c8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -48,6 +48,8 @@ #include #include #include +#include +#include #include "internal.h" @@ -379,6 +381,7 @@ static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; +static atomic_t nr_namespaces_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; @@ -3991,6 +3994,8 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_mmap_events); if (event->attr.comm) atomic_dec(&nr_comm_events); + if (event->attr.namespaces) + atomic_dec(&nr_namespaces_events); if (event->attr.task) atomic_dec(&nr_task_events); if (event->attr.freq) @@ -6491,6 +6496,7 @@ static void perf_event_task(struct task_struct *task, void perf_event_fork(struct task_struct *task) { perf_event_task(task, NULL, 1); + perf_event_namespaces(task); } /* @@ -6592,6 +6598,132 @@ void perf_event_comm(struct task_struct *task, bool exec) perf_event_comm_event(&comm_event); } +/* + * namespaces tracking + */ + +struct perf_namespaces_event { + struct task_struct *task; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 nr_namespaces; + struct perf_ns_link_info link_info[NR_NAMESPACES]; + } event_id; +}; + +static int perf_event_namespaces_match(struct perf_event *event) +{ + return event->attr.namespaces; +} + +static void perf_event_namespaces_output(struct perf_event *event, + void *data) +{ + struct perf_namespaces_event *namespaces_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_namespaces_match(event)) + return; + + perf_event_header__init_id(&namespaces_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, event, + namespaces_event->event_id.header.size); + if (ret) + return; + + namespaces_event->event_id.pid = perf_event_pid(event, + namespaces_event->task); + namespaces_event->event_id.tid = perf_event_tid(event, + namespaces_event->task); + + perf_output_put(&handle, namespaces_event->event_id); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, + struct task_struct *task, + const struct proc_ns_operations *ns_ops) +{ + struct path ns_path; + struct inode *ns_inode; + void *error; + + error = ns_get_path(&ns_path, task, ns_ops); + if (!error) { + ns_inode = ns_path.dentry->d_inode; + ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev); + ns_link_info->ino = ns_inode->i_ino; + } +} + +void perf_event_namespaces(struct task_struct *task) +{ + struct perf_namespaces_event namespaces_event; + struct perf_ns_link_info *ns_link_info; + + if (!atomic_read(&nr_namespaces_events)) + return; + + namespaces_event = (struct perf_namespaces_event){ + .task = task, + .event_id = { + .header = { + .type = PERF_RECORD_NAMESPACES, + .misc = 0, + .size = 
sizeof(namespaces_event.event_id), + }, + /* .pid */ + /* .tid */ + .nr_namespaces = NR_NAMESPACES, + /* .link_info[NR_NAMESPACES] */ + }, + }; + + ns_link_info = namespaces_event.event_id.link_info; + + perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX], + task, &mntns_operations); + +#ifdef CONFIG_USER_NS + perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX], + task, &userns_operations); +#endif +#ifdef CONFIG_NET_NS + perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX], + task, &netns_operations); +#endif +#ifdef CONFIG_UTS_NS + perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX], + task, &utsns_operations); +#endif +#ifdef CONFIG_IPC_NS + perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX], + task, &ipcns_operations); +#endif +#ifdef CONFIG_PID_NS + perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX], + task, &pidns_operations); +#endif +#ifdef CONFIG_CGROUPS + perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX], + task, &cgroupns_operations); +#endif + + perf_iterate_sb(perf_event_namespaces_output, + &namespaces_event, + NULL); +} + /* * mmap tracking */ @@ -9146,6 +9278,8 @@ static void account_event(struct perf_event *event) atomic_inc(&nr_mmap_events); if (event->attr.comm) atomic_inc(&nr_comm_events); + if (event->attr.namespaces) + atomic_inc(&nr_namespaces_events); if (event->attr.task) atomic_inc(&nr_task_events); if (event->attr.freq) @@ -9691,6 +9825,11 @@ SYSCALL_DEFINE5(perf_event_open, return -EACCES; } + if (attr.namespaces) { + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + } + if (attr.freq) { if (attr.sample_freq > sysctl_perf_event_sample_rate) return -EINVAL; diff --git a/kernel/fork.c b/kernel/fork.c index 6c463c80e93d..afa2947286cd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2352,6 +2352,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) } } + perf_event_namespaces(current); + bad_unshare_cleanup_cred: if (new_cred) put_cred(new_cred); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 782102e59eed..f6c5d330059a 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -26,6 +26,7 @@ #include #include #include +#include static struct kmem_cache *nsproxy_cachep; @@ -262,6 +263,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype) goto out; } switch_task_namespaces(tsk, new_nsproxy); + + perf_event_namespaces(tsk); out: fput(file); return err; -- cgit v1.2.3 From be086e7c53f1fac51eed14523b28f2214b548dd2 Mon Sep 17 00:00:00 2001 From: "Mintz, Yuval" Date: Sat, 11 Mar 2017 18:39:18 +0200 Subject: qed*: Utilize Firmware 8.15.3.0 This patch moves the qed* drivers to the newer firmware. This solves several firmware bugs, mostly related [but not limited] to various init/deinit issues in various offloaded protocols. It also introduces a major 4-Cached SGE change in firmware, which can be seen in the storage drivers' changes. In addition, this firmware is required for supporting the new QL41xxx series of adapters; while this patch doesn't add the actual support, the firmware contains the necessary initialization & firmware logic to operate such adapters [actual support would be added later on]. Changes from Previous versions: ------------------------------- - V2 - fix kbuild-test robot warnings Signed-off-by: Tomer Tayar Signed-off-by: Ram Amrani Signed-off-by: Manish Rangankar Signed-off-by: Chad Dupuis Signed-off-by: Yuval Mintz Signed-off-by: David S. 
Miller --- drivers/infiniband/hw/qedr/main.c | 5 +- drivers/infiniband/hw/qedr/qedr.h | 3 +- drivers/infiniband/hw/qedr/qedr_cm.c | 3 - drivers/infiniband/hw/qedr/qedr_hsi.h | 56 - drivers/infiniband/hw/qedr/verbs.c | 3 +- drivers/net/ethernet/qlogic/qed/qed.h | 2 +- drivers/net/ethernet/qlogic/qed/qed_cxt.c | 5 +- drivers/net/ethernet/qlogic/qed/qed_debug.c | 1564 ++++++++++++-------- drivers/net/ethernet/qlogic/qed/qed_hsi.h | 764 +++++++++- .../net/ethernet/qlogic/qed/qed_init_fw_funcs.c | 56 +- drivers/net/ethernet/qlogic/qed/qed_init_ops.c | 2 +- drivers/net/ethernet/qlogic/qed/qed_ll2.c | 48 +- drivers/net/ethernet/qlogic/qed/qed_reg_addr.h | 30 + drivers/net/ethernet/qlogic/qed/qed_roce.c | 220 ++- drivers/net/ethernet/qlogic/qed/qed_roce.h | 11 +- drivers/net/ethernet/qlogic/qed/qed_spq.c | 13 +- drivers/net/ethernet/qlogic/qede/qede.h | 2 +- drivers/scsi/qedf/Makefile | 2 +- drivers/scsi/qedf/drv_fcoe_fw_funcs.c | 190 +++ drivers/scsi/qedf/drv_fcoe_fw_funcs.h | 93 ++ drivers/scsi/qedf/drv_scsi_fw_funcs.c | 44 + drivers/scsi/qedf/drv_scsi_fw_funcs.h | 85 ++ drivers/scsi/qedf/qedf.h | 23 +- drivers/scsi/qedf/qedf_els.c | 25 +- drivers/scsi/qedf/qedf_io.c | 670 +++------ drivers/scsi/qedi/Makefile | 2 +- drivers/scsi/qedi/qedi_fw.c | 1068 ++++++------- drivers/scsi/qedi/qedi_fw_api.c | 781 ++++++++++ drivers/scsi/qedi/qedi_fw_iscsi.h | 117 ++ drivers/scsi/qedi/qedi_fw_scsi.h | 55 + drivers/scsi/qedi/qedi_iscsi.c | 12 +- drivers/scsi/qedi/qedi_iscsi.h | 2 +- drivers/scsi/qedi/qedi_version.h | 4 +- include/linux/qed/common_hsi.h | 30 +- include/linux/qed/eth_common.h | 3 + include/linux/qed/fcoe_common.h | 180 +-- include/linux/qed/iscsi_common.h | 241 ++- include/linux/qed/roce_common.h | 17 + include/linux/qed/storage_common.h | 30 +- include/linux/qed/tcp_common.h | 1 + 40 files changed, 4234 insertions(+), 2228 deletions(-) delete mode 100644 drivers/infiniband/hw/qedr/qedr_hsi.h create mode 100644 drivers/scsi/qedf/drv_fcoe_fw_funcs.c create mode 100644 drivers/scsi/qedf/drv_fcoe_fw_funcs.h create mode 100644 drivers/scsi/qedf/drv_scsi_fw_funcs.c create mode 100644 drivers/scsi/qedf/drv_scsi_fw_funcs.h create mode 100644 drivers/scsi/qedi/qedi_fw_api.c create mode 100644 drivers/scsi/qedi/qedi_fw_iscsi.h create mode 100644 drivers/scsi/qedi/qedi_fw_scsi.h (limited to 'include/linux') diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index b9b47e5cc8b3..ced0461d6e9f 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -587,9 +587,8 @@ void qedr_affiliated_event(void *context, u8 e_code, void *fw_handle) #define EVENT_TYPE_CQ 1 #define EVENT_TYPE_QP 2 struct qedr_dev *dev = (struct qedr_dev *)context; - union event_ring_data *data = fw_handle; - u64 roce_handle64 = ((u64)data->roce_handle.hi << 32) + - data->roce_handle.lo; + struct regpair *async_handle = (struct regpair *)fw_handle; + u64 roce_handle64 = ((u64) async_handle->hi << 32) + async_handle->lo; u8 event_type = EVENT_TYPE_NOT_DEFINED; struct ib_event event; struct ib_cq *ibcq; diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h index bb32e4792ec9..5cb9195513bd 100644 --- a/drivers/infiniband/hw/qedr/qedr.h +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -38,7 +38,8 @@ #include #include #include -#include "qedr_hsi.h" +#include +#include "qedr_hsi_rdma.h" #define QEDR_MODULE_VERSION "8.10.10.0" #define QEDR_NODE_DESC "QLogic 579xx RoCE HCA" diff --git a/drivers/infiniband/hw/qedr/qedr_cm.c b/drivers/infiniband/hw/qedr/qedr_cm.c 
index 699632893dd9..a6280ce3e2a5 100644 --- a/drivers/infiniband/hw/qedr/qedr_cm.c +++ b/drivers/infiniband/hw/qedr/qedr_cm.c @@ -43,14 +43,11 @@ #include #include -#include "qedr_hsi.h" #include #include #include "qedr.h" -#include "qedr_hsi.h" #include "verbs.h" #include -#include "qedr_hsi.h" #include "qedr_cm.h" void qedr_inc_sw_gsi_cons(struct qedr_qp_hwq_info *info) diff --git a/drivers/infiniband/hw/qedr/qedr_hsi.h b/drivers/infiniband/hw/qedr/qedr_hsi.h deleted file mode 100644 index 66d27521373f..000000000000 --- a/drivers/infiniband/hw/qedr/qedr_hsi.h +++ /dev/null @@ -1,56 +0,0 @@ -/* QLogic qedr NIC Driver - * Copyright (c) 2015-2016 QLogic Corporation - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and /or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef __QED_HSI_ROCE__ -#define __QED_HSI_ROCE__ - -#include -#include -#include "qedr_hsi_rdma.h" - -/* Affiliated asynchronous events / errors enumeration */ -enum roce_async_events_type { - ROCE_ASYNC_EVENT_NONE = 0, - ROCE_ASYNC_EVENT_COMM_EST = 1, - ROCE_ASYNC_EVENT_SQ_DRAINED, - ROCE_ASYNC_EVENT_SRQ_LIMIT, - ROCE_ASYNC_EVENT_LAST_WQE_REACHED, - ROCE_ASYNC_EVENT_CQ_ERR, - ROCE_ASYNC_EVENT_LOCAL_INVALID_REQUEST_ERR, - ROCE_ASYNC_EVENT_LOCAL_CATASTROPHIC_ERR, - ROCE_ASYNC_EVENT_LOCAL_ACCESS_ERR, - ROCE_ASYNC_EVENT_QP_CATASTROPHIC_ERR, - ROCE_ASYNC_EVENT_CQ_OVERFLOW_ERR, - ROCE_ASYNC_EVENT_SRQ_EMPTY, - MAX_ROCE_ASYNC_EVENTS_TYPE -}; - -#endif /* __QED_HSI_ROCE__ */ diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 6b3bb32803bd..2091902848e6 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -43,7 +43,8 @@ #include #include -#include "qedr_hsi.h" +#include +#include "qedr_hsi_rdma.h" #include #include "qedr.h" #include "verbs.h" diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h index 00c17fa6545b..be99092d7209 100644 --- a/drivers/net/ethernet/qlogic/qed/qed.h +++ b/drivers/net/ethernet/qlogic/qed/qed.h @@ -51,7 +51,7 @@ #include "qed_hsi.h" extern const struct qed_common_ops qed_common_ops_pass; -#define DRV_MODULE_VERSION "8.10.10.20" +#define DRV_MODULE_VERSION "8.10.10.21" #define MAX_HWFNS_PER_DEVICE (4) #define NAME_SIZE 16 diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c index d42d03df751a..89d210a54335 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c +++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c @@ -71,8 +71,7 @@ #define TM_ALIGN BIT(TM_SHIFT) #define TM_ELEM_SIZE 4 -/* For RoCE we configure to 64K to cover for RoCE max tasks 256K purpose. */ -#define ILT_DEFAULT_HW_P_SIZE (IS_ENABLED(CONFIG_QED_RDMA) ? 
4 : 3) +#define ILT_DEFAULT_HW_P_SIZE 4 #define ILT_PAGE_IN_BYTES(hw_p_size) (1U << ((hw_p_size) + 12)) #define ILT_CFG_REG(cli, reg) PSWRQ2_REG_ ## cli ## _ ## reg ## _RT_OFFSET @@ -1126,7 +1125,7 @@ int qed_cxt_mngr_alloc(struct qed_hwfn *p_hwfn) clients[ILT_CLI_TSDM].first.reg = ILT_CFG_REG(TSDM, FIRST_ILT); clients[ILT_CLI_TSDM].last.reg = ILT_CFG_REG(TSDM, LAST_ILT); clients[ILT_CLI_TSDM].p_size.reg = ILT_CFG_REG(TSDM, P_SIZE); - /* default ILT page size for all clients is 32K */ + /* default ILT page size for all clients is 64K */ for (i = 0; i < ILT_CLI_MAX; i++) p_mngr->clients[i].p_size.val = ILT_DEFAULT_HW_P_SIZE; diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c index 68f19ca57f96..5e81e8a7a109 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_debug.c +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c @@ -17,7 +17,6 @@ /* Chip IDs enum */ enum chip_ids { - CHIP_RESERVED, CHIP_BB_B0, CHIP_K2, MAX_CHIP_IDS @@ -40,6 +39,7 @@ enum mem_groups { MEM_GROUP_BTB_RAM, MEM_GROUP_RDIF_CTX, MEM_GROUP_TDIF_CTX, + MEM_GROUP_CFC_MEM, MEM_GROUP_CONN_CFC_MEM, MEM_GROUP_TASK_CFC_MEM, MEM_GROUP_CAU_PI, @@ -72,6 +72,7 @@ static const char * const s_mem_group_names[] = { "BTB_RAM", "RDIF_CTX", "TDIF_CTX", + "CFC_MEM", "CONN_CFC_MEM", "TASK_CFC_MEM", "CAU_PI", @@ -185,13 +186,16 @@ struct dbg_array { u32 size_in_dwords; }; +struct chip_platform_defs { + u8 num_ports; + u8 num_pfs; + u8 num_vfs; +}; + /* Chip constant definitions */ struct chip_defs { const char *name; - struct { - u8 num_ports; - u8 num_pfs; - } per_platform[MAX_PLATFORM_IDS]; + struct chip_platform_defs per_platform[MAX_PLATFORM_IDS]; }; /* Platform constant definitions */ @@ -405,22 +409,23 @@ struct phy_defs { /***************************** Constant Arrays *******************************/ /* Debug arrays */ -static struct dbg_array s_dbg_arrays[MAX_BIN_DBG_BUFFER_TYPE] = { {NULL} }; +static struct dbg_array s_dbg_arrays[MAX_BIN_DBG_BUFFER_TYPE] = { {0} }; /* Chip constant definitions array */ static struct chip_defs s_chip_defs[MAX_CHIP_IDS] = { - { "reserved", { {0, 0}, {0, 0}, {0, 0}, {0, 0} } }, { "bb_b0", - { {MAX_NUM_PORTS_BB, MAX_NUM_PFS_BB}, {0, 0}, {0, 0}, {0, 0} } }, - { "k2", { {MAX_NUM_PORTS_K2, MAX_NUM_PFS_K2}, {0, 0}, {0, 0}, {0, 0} } } + { {MAX_NUM_PORTS_BB, MAX_NUM_PFS_BB, MAX_NUM_VFS_BB}, {0, 0, 0}, + {0, 0, 0}, {0, 0, 0} } }, + { "k2", + { {MAX_NUM_PORTS_K2, MAX_NUM_PFS_K2, MAX_NUM_VFS_K2}, {0, 0, 0}, + {0, 0, 0}, {0, 0, 0} } } }; /* Storm constant definitions array */ static struct storm_defs s_storm_defs[] = { /* Tstorm */ {'T', BLOCK_TSEM, - {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, - DBG_BUS_CLIENT_RBCT}, true, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, true, TSEM_REG_FAST_MEMORY, TSEM_REG_DBG_FRAME_MODE, TSEM_REG_SLOW_DBG_ACTIVE, TSEM_REG_SLOW_DBG_MODE, TSEM_REG_DBG_MODE1_CFG, @@ -432,8 +437,7 @@ static struct storm_defs s_storm_defs[] = { 4, TCM_REG_SM_TASK_CTX}, /* Mstorm */ {'M', BLOCK_MSEM, - {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, - DBG_BUS_CLIENT_RBCM}, false, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, false, MSEM_REG_FAST_MEMORY, MSEM_REG_DBG_FRAME_MODE, MSEM_REG_SLOW_DBG_ACTIVE, MSEM_REG_SLOW_DBG_MODE, MSEM_REG_DBG_MODE1_CFG, @@ -445,8 +449,7 @@ static struct storm_defs s_storm_defs[] = { 7, MCM_REG_SM_TASK_CTX}, /* Ustorm */ {'U', BLOCK_USEM, - {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, - DBG_BUS_CLIENT_RBCU}, false, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, false, USEM_REG_FAST_MEMORY, USEM_REG_DBG_FRAME_MODE, USEM_REG_SLOW_DBG_ACTIVE, 
USEM_REG_SLOW_DBG_MODE, USEM_REG_DBG_MODE1_CFG, @@ -458,8 +461,7 @@ static struct storm_defs s_storm_defs[] = { 3, UCM_REG_SM_TASK_CTX}, /* Xstorm */ {'X', BLOCK_XSEM, - {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, - DBG_BUS_CLIENT_RBCX}, false, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, false, XSEM_REG_FAST_MEMORY, XSEM_REG_DBG_FRAME_MODE, XSEM_REG_SLOW_DBG_ACTIVE, XSEM_REG_SLOW_DBG_MODE, XSEM_REG_DBG_MODE1_CFG, @@ -471,8 +473,7 @@ static struct storm_defs s_storm_defs[] = { 0, 0}, /* Ystorm */ {'Y', BLOCK_YSEM, - {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, - DBG_BUS_CLIENT_RBCY}, false, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCY}, false, YSEM_REG_FAST_MEMORY, YSEM_REG_DBG_FRAME_MODE, YSEM_REG_SLOW_DBG_ACTIVE, YSEM_REG_SLOW_DBG_MODE, YSEM_REG_DBG_MODE1_CFG, @@ -484,8 +485,7 @@ static struct storm_defs s_storm_defs[] = { 12, YCM_REG_SM_TASK_CTX}, /* Pstorm */ {'P', BLOCK_PSEM, - {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, - DBG_BUS_CLIENT_RBCS}, true, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, true, PSEM_REG_FAST_MEMORY, PSEM_REG_DBG_FRAME_MODE, PSEM_REG_SLOW_DBG_ACTIVE, PSEM_REG_SLOW_DBG_MODE, PSEM_REG_DBG_MODE1_CFG, @@ -499,8 +499,9 @@ static struct storm_defs s_storm_defs[] = { /* Block definitions array */ static struct block_defs block_grc_defs = { - "grc", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCN, DBG_BUS_CLIENT_RBCN, DBG_BUS_CLIENT_RBCN}, + "grc", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCN, DBG_BUS_CLIENT_RBCN}, GRC_REG_DBG_SELECT, GRC_REG_DBG_DWORD_ENABLE, GRC_REG_DBG_SHIFT, GRC_REG_DBG_FORCE_VALID, GRC_REG_DBG_FORCE_FRAME, @@ -508,29 +509,30 @@ static struct block_defs block_grc_defs = { }; static struct block_defs block_miscs_defs = { - "miscs", {false, false, false}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + "miscs", {false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, 0, 0, 0, 0, 0, false, false, MAX_DBG_RESET_REGS, 0 }; static struct block_defs block_misc_defs = { - "misc", {false, false, false}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + "misc", {false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, 0, 0, 0, 0, 0, false, false, MAX_DBG_RESET_REGS, 0 }; static struct block_defs block_dbu_defs = { - "dbu", {false, false, false}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + "dbu", {false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, 0, 0, 0, 0, 0, false, false, MAX_DBG_RESET_REGS, 0 }; static struct block_defs block_pglue_b_defs = { - "pglue_b", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCH, DBG_BUS_CLIENT_RBCH, DBG_BUS_CLIENT_RBCH}, + "pglue_b", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCH, DBG_BUS_CLIENT_RBCH}, PGLUE_B_REG_DBG_SELECT, PGLUE_B_REG_DBG_DWORD_ENABLE, PGLUE_B_REG_DBG_SHIFT, PGLUE_B_REG_DBG_FORCE_VALID, PGLUE_B_REG_DBG_FORCE_FRAME, @@ -538,8 +540,9 @@ static struct block_defs block_pglue_b_defs = { }; static struct block_defs block_cnig_defs = { - "cnig", {false, false, true}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCW}, + "cnig", + {false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCW}, CNIG_REG_DBG_SELECT_K2, CNIG_REG_DBG_DWORD_ENABLE_K2, CNIG_REG_DBG_SHIFT_K2, CNIG_REG_DBG_FORCE_VALID_K2, CNIG_REG_DBG_FORCE_FRAME_K2, @@ -547,15 +550,16 @@ static struct block_defs block_cnig_defs = { }; static struct block_defs block_cpmu_defs = { - "cpmu", {false, false, false}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, 
MAX_DBG_BUS_CLIENTS}, + "cpmu", {false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, 0, 0, 0, 0, 0, true, false, DBG_RESET_REG_MISCS_PL_HV, 8 }; static struct block_defs block_ncsi_defs = { - "ncsi", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ}, + "ncsi", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ}, NCSI_REG_DBG_SELECT, NCSI_REG_DBG_DWORD_ENABLE, NCSI_REG_DBG_SHIFT, NCSI_REG_DBG_FORCE_VALID, NCSI_REG_DBG_FORCE_FRAME, @@ -563,15 +567,16 @@ static struct block_defs block_ncsi_defs = { }; static struct block_defs block_opte_defs = { - "opte", {false, false, false}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + "opte", {false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, 0, 0, 0, 0, 0, true, false, DBG_RESET_REG_MISCS_PL_HV, 4 }; static struct block_defs block_bmb_defs = { - "bmb", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCB}, + "bmb", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCB}, BMB_REG_DBG_SELECT, BMB_REG_DBG_DWORD_ENABLE, BMB_REG_DBG_SHIFT, BMB_REG_DBG_FORCE_VALID, BMB_REG_DBG_FORCE_FRAME, @@ -579,8 +584,9 @@ static struct block_defs block_bmb_defs = { }; static struct block_defs block_pcie_defs = { - "pcie", {false, false, true}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCH}, + "pcie", + {false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCH}, PCIE_REG_DBG_COMMON_SELECT, PCIE_REG_DBG_COMMON_DWORD_ENABLE, PCIE_REG_DBG_COMMON_SHIFT, PCIE_REG_DBG_COMMON_FORCE_VALID, PCIE_REG_DBG_COMMON_FORCE_FRAME, @@ -588,15 +594,16 @@ static struct block_defs block_pcie_defs = { }; static struct block_defs block_mcp_defs = { - "mcp", {false, false, false}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + "mcp", {false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, 0, 0, 0, 0, 0, false, false, MAX_DBG_RESET_REGS, 0 }; static struct block_defs block_mcp2_defs = { - "mcp2", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ}, + "mcp2", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCZ, DBG_BUS_CLIENT_RBCZ}, MCP2_REG_DBG_SELECT, MCP2_REG_DBG_DWORD_ENABLE, MCP2_REG_DBG_SHIFT, MCP2_REG_DBG_FORCE_VALID, MCP2_REG_DBG_FORCE_FRAME, @@ -604,8 +611,9 @@ static struct block_defs block_mcp2_defs = { }; static struct block_defs block_pswhst_defs = { - "pswhst", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + "pswhst", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, PSWHST_REG_DBG_SELECT, PSWHST_REG_DBG_DWORD_ENABLE, PSWHST_REG_DBG_SHIFT, PSWHST_REG_DBG_FORCE_VALID, PSWHST_REG_DBG_FORCE_FRAME, @@ -613,8 +621,9 @@ static struct block_defs block_pswhst_defs = { }; static struct block_defs block_pswhst2_defs = { - "pswhst2", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + "pswhst2", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, PSWHST2_REG_DBG_SELECT, PSWHST2_REG_DBG_DWORD_ENABLE, PSWHST2_REG_DBG_SHIFT, PSWHST2_REG_DBG_FORCE_VALID, PSWHST2_REG_DBG_FORCE_FRAME, @@ -622,8 +631,9 @@ static struct block_defs block_pswhst2_defs = { }; static struct block_defs block_pswrd_defs = { - "pswrd", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + "pswrd", + {true, true}, false, 0, + 
{DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, PSWRD_REG_DBG_SELECT, PSWRD_REG_DBG_DWORD_ENABLE, PSWRD_REG_DBG_SHIFT, PSWRD_REG_DBG_FORCE_VALID, PSWRD_REG_DBG_FORCE_FRAME, @@ -631,8 +641,9 @@ static struct block_defs block_pswrd_defs = { }; static struct block_defs block_pswrd2_defs = { - "pswrd2", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + "pswrd2", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, PSWRD2_REG_DBG_SELECT, PSWRD2_REG_DBG_DWORD_ENABLE, PSWRD2_REG_DBG_SHIFT, PSWRD2_REG_DBG_FORCE_VALID, PSWRD2_REG_DBG_FORCE_FRAME, @@ -640,8 +651,9 @@ static struct block_defs block_pswrd2_defs = { }; static struct block_defs block_pswwr_defs = { - "pswwr", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + "pswwr", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, PSWWR_REG_DBG_SELECT, PSWWR_REG_DBG_DWORD_ENABLE, PSWWR_REG_DBG_SHIFT, PSWWR_REG_DBG_FORCE_VALID, PSWWR_REG_DBG_FORCE_FRAME, @@ -649,15 +661,16 @@ static struct block_defs block_pswwr_defs = { }; static struct block_defs block_pswwr2_defs = { - "pswwr2", {false, false, false}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + "pswwr2", {false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, 0, 0, 0, 0, 0, true, false, DBG_RESET_REG_MISC_PL_HV, 3 }; static struct block_defs block_pswrq_defs = { - "pswrq", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + "pswrq", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, PSWRQ_REG_DBG_SELECT, PSWRQ_REG_DBG_DWORD_ENABLE, PSWRQ_REG_DBG_SHIFT, PSWRQ_REG_DBG_FORCE_VALID, PSWRQ_REG_DBG_FORCE_FRAME, @@ -665,8 +678,9 @@ static struct block_defs block_pswrq_defs = { }; static struct block_defs block_pswrq2_defs = { - "pswrq2", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + "pswrq2", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, PSWRQ2_REG_DBG_SELECT, PSWRQ2_REG_DBG_DWORD_ENABLE, PSWRQ2_REG_DBG_SHIFT, PSWRQ2_REG_DBG_FORCE_VALID, PSWRQ2_REG_DBG_FORCE_FRAME, @@ -674,8 +688,9 @@ static struct block_defs block_pswrq2_defs = { }; static struct block_defs block_pglcs_defs = { - "pglcs", {false, false, true}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCH}, + "pglcs", + {false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCH}, PGLCS_REG_DBG_SELECT, PGLCS_REG_DBG_DWORD_ENABLE, PGLCS_REG_DBG_SHIFT, PGLCS_REG_DBG_FORCE_VALID, PGLCS_REG_DBG_FORCE_FRAME, @@ -683,8 +698,9 @@ static struct block_defs block_pglcs_defs = { }; static struct block_defs block_ptu_defs = { - "ptu", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + "ptu", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, PTU_REG_DBG_SELECT, PTU_REG_DBG_DWORD_ENABLE, PTU_REG_DBG_SHIFT, PTU_REG_DBG_FORCE_VALID, PTU_REG_DBG_FORCE_FRAME, @@ -692,8 +708,9 @@ static struct block_defs block_ptu_defs = { }; static struct block_defs block_dmae_defs = { - "dmae", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + "dmae", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, DMAE_REG_DBG_SELECT, DMAE_REG_DBG_DWORD_ENABLE, DMAE_REG_DBG_SHIFT, DMAE_REG_DBG_FORCE_VALID, DMAE_REG_DBG_FORCE_FRAME, @@ -701,8 +718,9 @@ static struct block_defs block_dmae_defs = { }; 
static struct block_defs block_tcm_defs = { - "tcm", {true, true, true}, true, DBG_TSTORM_ID, - {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, + "tcm", + {true, true}, true, DBG_TSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, TCM_REG_DBG_SELECT, TCM_REG_DBG_DWORD_ENABLE, TCM_REG_DBG_SHIFT, TCM_REG_DBG_FORCE_VALID, TCM_REG_DBG_FORCE_FRAME, @@ -710,8 +728,9 @@ static struct block_defs block_tcm_defs = { }; static struct block_defs block_mcm_defs = { - "mcm", {true, true, true}, true, DBG_MSTORM_ID, - {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + "mcm", + {true, true}, true, DBG_MSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, MCM_REG_DBG_SELECT, MCM_REG_DBG_DWORD_ENABLE, MCM_REG_DBG_SHIFT, MCM_REG_DBG_FORCE_VALID, MCM_REG_DBG_FORCE_FRAME, @@ -719,8 +738,9 @@ static struct block_defs block_mcm_defs = { }; static struct block_defs block_ucm_defs = { - "ucm", {true, true, true}, true, DBG_USTORM_ID, - {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, + "ucm", + {true, true}, true, DBG_USTORM_ID, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, UCM_REG_DBG_SELECT, UCM_REG_DBG_DWORD_ENABLE, UCM_REG_DBG_SHIFT, UCM_REG_DBG_FORCE_VALID, UCM_REG_DBG_FORCE_FRAME, @@ -728,8 +748,9 @@ static struct block_defs block_ucm_defs = { }; static struct block_defs block_xcm_defs = { - "xcm", {true, true, true}, true, DBG_XSTORM_ID, - {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, + "xcm", + {true, true}, true, DBG_XSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, XCM_REG_DBG_SELECT, XCM_REG_DBG_DWORD_ENABLE, XCM_REG_DBG_SHIFT, XCM_REG_DBG_FORCE_VALID, XCM_REG_DBG_FORCE_FRAME, @@ -737,8 +758,9 @@ static struct block_defs block_xcm_defs = { }; static struct block_defs block_ycm_defs = { - "ycm", {true, true, true}, true, DBG_YSTORM_ID, - {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCY}, + "ycm", + {true, true}, true, DBG_YSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCY}, YCM_REG_DBG_SELECT, YCM_REG_DBG_DWORD_ENABLE, YCM_REG_DBG_SHIFT, YCM_REG_DBG_FORCE_VALID, YCM_REG_DBG_FORCE_FRAME, @@ -746,8 +768,9 @@ static struct block_defs block_ycm_defs = { }; static struct block_defs block_pcm_defs = { - "pcm", {true, true, true}, true, DBG_PSTORM_ID, - {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, + "pcm", + {true, true}, true, DBG_PSTORM_ID, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, PCM_REG_DBG_SELECT, PCM_REG_DBG_DWORD_ENABLE, PCM_REG_DBG_SHIFT, PCM_REG_DBG_FORCE_VALID, PCM_REG_DBG_FORCE_FRAME, @@ -755,8 +778,9 @@ static struct block_defs block_pcm_defs = { }; static struct block_defs block_qm_defs = { - "qm", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCQ}, + "qm", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCQ}, QM_REG_DBG_SELECT, QM_REG_DBG_DWORD_ENABLE, QM_REG_DBG_SHIFT, QM_REG_DBG_FORCE_VALID, QM_REG_DBG_FORCE_FRAME, @@ -764,8 +788,9 @@ static struct block_defs block_qm_defs = { }; static struct block_defs block_tm_defs = { - "tm", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, + "tm", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, TM_REG_DBG_SELECT, TM_REG_DBG_DWORD_ENABLE, TM_REG_DBG_SHIFT, TM_REG_DBG_FORCE_VALID, TM_REG_DBG_FORCE_FRAME, @@ -773,8 +798,9 @@ static struct block_defs block_tm_defs = { }; static struct block_defs block_dorq_defs = { - "dorq", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, 
DBG_BUS_CLIENT_RBCY}, + "dorq", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCY}, DORQ_REG_DBG_SELECT, DORQ_REG_DBG_DWORD_ENABLE, DORQ_REG_DBG_SHIFT, DORQ_REG_DBG_FORCE_VALID, DORQ_REG_DBG_FORCE_FRAME, @@ -782,8 +808,9 @@ static struct block_defs block_dorq_defs = { }; static struct block_defs block_brb_defs = { - "brb", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR}, + "brb", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR}, BRB_REG_DBG_SELECT, BRB_REG_DBG_DWORD_ENABLE, BRB_REG_DBG_SHIFT, BRB_REG_DBG_FORCE_VALID, BRB_REG_DBG_FORCE_FRAME, @@ -791,8 +818,9 @@ static struct block_defs block_brb_defs = { }; static struct block_defs block_src_defs = { - "src", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF}, + "src", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF}, SRC_REG_DBG_SELECT, SRC_REG_DBG_DWORD_ENABLE, SRC_REG_DBG_SHIFT, SRC_REG_DBG_FORCE_VALID, SRC_REG_DBG_FORCE_FRAME, @@ -800,8 +828,9 @@ static struct block_defs block_src_defs = { }; static struct block_defs block_prs_defs = { - "prs", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR}, + "prs", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR}, PRS_REG_DBG_SELECT, PRS_REG_DBG_DWORD_ENABLE, PRS_REG_DBG_SHIFT, PRS_REG_DBG_FORCE_VALID, PRS_REG_DBG_FORCE_FRAME, @@ -809,8 +838,9 @@ static struct block_defs block_prs_defs = { }; static struct block_defs block_tsdm_defs = { - "tsdm", {true, true, true}, true, DBG_TSTORM_ID, - {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, + "tsdm", + {true, true}, true, DBG_TSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, TSDM_REG_DBG_SELECT, TSDM_REG_DBG_DWORD_ENABLE, TSDM_REG_DBG_SHIFT, TSDM_REG_DBG_FORCE_VALID, TSDM_REG_DBG_FORCE_FRAME, @@ -818,8 +848,9 @@ static struct block_defs block_tsdm_defs = { }; static struct block_defs block_msdm_defs = { - "msdm", {true, true, true}, true, DBG_MSTORM_ID, - {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + "msdm", + {true, true}, true, DBG_MSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, MSDM_REG_DBG_SELECT, MSDM_REG_DBG_DWORD_ENABLE, MSDM_REG_DBG_SHIFT, MSDM_REG_DBG_FORCE_VALID, MSDM_REG_DBG_FORCE_FRAME, @@ -827,8 +858,9 @@ static struct block_defs block_msdm_defs = { }; static struct block_defs block_usdm_defs = { - "usdm", {true, true, true}, true, DBG_USTORM_ID, - {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, + "usdm", + {true, true}, true, DBG_USTORM_ID, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, USDM_REG_DBG_SELECT, USDM_REG_DBG_DWORD_ENABLE, USDM_REG_DBG_SHIFT, USDM_REG_DBG_FORCE_VALID, USDM_REG_DBG_FORCE_FRAME, @@ -836,8 +868,9 @@ static struct block_defs block_usdm_defs = { }; static struct block_defs block_xsdm_defs = { - "xsdm", {true, true, true}, true, DBG_XSTORM_ID, - {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, + "xsdm", + {true, true}, true, DBG_XSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, XSDM_REG_DBG_SELECT, XSDM_REG_DBG_DWORD_ENABLE, XSDM_REG_DBG_SHIFT, XSDM_REG_DBG_FORCE_VALID, XSDM_REG_DBG_FORCE_FRAME, @@ -845,8 +878,9 @@ static struct block_defs block_xsdm_defs = { }; static struct block_defs block_ysdm_defs = { - "ysdm", {true, true, true}, true, DBG_YSTORM_ID, - {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCY}, + "ysdm", + {true, true}, true, DBG_YSTORM_ID, + {DBG_BUS_CLIENT_RBCX, 
DBG_BUS_CLIENT_RBCY}, YSDM_REG_DBG_SELECT, YSDM_REG_DBG_DWORD_ENABLE, YSDM_REG_DBG_SHIFT, YSDM_REG_DBG_FORCE_VALID, YSDM_REG_DBG_FORCE_FRAME, @@ -854,8 +888,9 @@ static struct block_defs block_ysdm_defs = { }; static struct block_defs block_psdm_defs = { - "psdm", {true, true, true}, true, DBG_PSTORM_ID, - {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, + "psdm", + {true, true}, true, DBG_PSTORM_ID, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, PSDM_REG_DBG_SELECT, PSDM_REG_DBG_DWORD_ENABLE, PSDM_REG_DBG_SHIFT, PSDM_REG_DBG_FORCE_VALID, PSDM_REG_DBG_FORCE_FRAME, @@ -863,8 +898,9 @@ static struct block_defs block_psdm_defs = { }; static struct block_defs block_tsem_defs = { - "tsem", {true, true, true}, true, DBG_TSTORM_ID, - {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, + "tsem", + {true, true}, true, DBG_TSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, TSEM_REG_DBG_SELECT, TSEM_REG_DBG_DWORD_ENABLE, TSEM_REG_DBG_SHIFT, TSEM_REG_DBG_FORCE_VALID, TSEM_REG_DBG_FORCE_FRAME, @@ -872,8 +908,9 @@ static struct block_defs block_tsem_defs = { }; static struct block_defs block_msem_defs = { - "msem", {true, true, true}, true, DBG_MSTORM_ID, - {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + "msem", + {true, true}, true, DBG_MSTORM_ID, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, MSEM_REG_DBG_SELECT, MSEM_REG_DBG_DWORD_ENABLE, MSEM_REG_DBG_SHIFT, MSEM_REG_DBG_FORCE_VALID, MSEM_REG_DBG_FORCE_FRAME, @@ -881,8 +918,9 @@ static struct block_defs block_msem_defs = { }; static struct block_defs block_usem_defs = { - "usem", {true, true, true}, true, DBG_USTORM_ID, - {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, + "usem", + {true, true}, true, DBG_USTORM_ID, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, USEM_REG_DBG_SELECT, USEM_REG_DBG_DWORD_ENABLE, USEM_REG_DBG_SHIFT, USEM_REG_DBG_FORCE_VALID, USEM_REG_DBG_FORCE_FRAME, @@ -890,8 +928,9 @@ static struct block_defs block_usem_defs = { }; static struct block_defs block_xsem_defs = { - "xsem", {true, true, true}, true, DBG_XSTORM_ID, - {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, + "xsem", + {true, true}, true, DBG_XSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, XSEM_REG_DBG_SELECT, XSEM_REG_DBG_DWORD_ENABLE, XSEM_REG_DBG_SHIFT, XSEM_REG_DBG_FORCE_VALID, XSEM_REG_DBG_FORCE_FRAME, @@ -899,8 +938,9 @@ static struct block_defs block_xsem_defs = { }; static struct block_defs block_ysem_defs = { - "ysem", {true, true, true}, true, DBG_YSTORM_ID, - {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCY}, + "ysem", + {true, true}, true, DBG_YSTORM_ID, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCY}, YSEM_REG_DBG_SELECT, YSEM_REG_DBG_DWORD_ENABLE, YSEM_REG_DBG_SHIFT, YSEM_REG_DBG_FORCE_VALID, YSEM_REG_DBG_FORCE_FRAME, @@ -908,8 +948,9 @@ static struct block_defs block_ysem_defs = { }; static struct block_defs block_psem_defs = { - "psem", {true, true, true}, true, DBG_PSTORM_ID, - {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, + "psem", + {true, true}, true, DBG_PSTORM_ID, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, PSEM_REG_DBG_SELECT, PSEM_REG_DBG_DWORD_ENABLE, PSEM_REG_DBG_SHIFT, PSEM_REG_DBG_FORCE_VALID, PSEM_REG_DBG_FORCE_FRAME, @@ -917,8 +958,9 @@ static struct block_defs block_psem_defs = { }; static struct block_defs block_rss_defs = { - "rss", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, + "rss", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT}, 
RSS_REG_DBG_SELECT, RSS_REG_DBG_DWORD_ENABLE, RSS_REG_DBG_SHIFT, RSS_REG_DBG_FORCE_VALID, RSS_REG_DBG_FORCE_FRAME, @@ -926,8 +968,9 @@ static struct block_defs block_rss_defs = { }; static struct block_defs block_tmld_defs = { - "tmld", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + "tmld", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, TMLD_REG_DBG_SELECT, TMLD_REG_DBG_DWORD_ENABLE, TMLD_REG_DBG_SHIFT, TMLD_REG_DBG_FORCE_VALID, TMLD_REG_DBG_FORCE_FRAME, @@ -935,8 +978,9 @@ static struct block_defs block_tmld_defs = { }; static struct block_defs block_muld_defs = { - "muld", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, + "muld", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, MULD_REG_DBG_SELECT, MULD_REG_DBG_DWORD_ENABLE, MULD_REG_DBG_SHIFT, MULD_REG_DBG_FORCE_VALID, MULD_REG_DBG_FORCE_FRAME, @@ -944,8 +988,9 @@ static struct block_defs block_muld_defs = { }; static struct block_defs block_yuld_defs = { - "yuld", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, + "yuld", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCU, DBG_BUS_CLIENT_RBCU}, YULD_REG_DBG_SELECT, YULD_REG_DBG_DWORD_ENABLE, YULD_REG_DBG_SHIFT, YULD_REG_DBG_FORCE_VALID, YULD_REG_DBG_FORCE_FRAME, @@ -953,8 +998,9 @@ static struct block_defs block_yuld_defs = { }; static struct block_defs block_xyld_defs = { - "xyld", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, + "xyld", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCX, DBG_BUS_CLIENT_RBCX}, XYLD_REG_DBG_SELECT, XYLD_REG_DBG_DWORD_ENABLE, XYLD_REG_DBG_SHIFT, XYLD_REG_DBG_FORCE_VALID, XYLD_REG_DBG_FORCE_FRAME, @@ -962,8 +1008,9 @@ static struct block_defs block_xyld_defs = { }; static struct block_defs block_prm_defs = { - "prm", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + "prm", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, PRM_REG_DBG_SELECT, PRM_REG_DBG_DWORD_ENABLE, PRM_REG_DBG_SHIFT, PRM_REG_DBG_FORCE_VALID, PRM_REG_DBG_FORCE_FRAME, @@ -971,8 +1018,9 @@ static struct block_defs block_prm_defs = { }; static struct block_defs block_pbf_pb1_defs = { - "pbf_pb1", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCV}, + "pbf_pb1", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCV}, PBF_PB1_REG_DBG_SELECT, PBF_PB1_REG_DBG_DWORD_ENABLE, PBF_PB1_REG_DBG_SHIFT, PBF_PB1_REG_DBG_FORCE_VALID, PBF_PB1_REG_DBG_FORCE_FRAME, @@ -981,8 +1029,9 @@ static struct block_defs block_pbf_pb1_defs = { }; static struct block_defs block_pbf_pb2_defs = { - "pbf_pb2", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCV}, + "pbf_pb2", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCV}, PBF_PB2_REG_DBG_SELECT, PBF_PB2_REG_DBG_DWORD_ENABLE, PBF_PB2_REG_DBG_SHIFT, PBF_PB2_REG_DBG_FORCE_VALID, PBF_PB2_REG_DBG_FORCE_FRAME, @@ -991,8 +1040,9 @@ static struct block_defs block_pbf_pb2_defs = { }; static struct block_defs block_rpb_defs = { - "rpb", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + "rpb", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, RPB_REG_DBG_SELECT, RPB_REG_DBG_DWORD_ENABLE, RPB_REG_DBG_SHIFT, RPB_REG_DBG_FORCE_VALID, RPB_REG_DBG_FORCE_FRAME, @@ -1000,8 +1050,9 @@ 
static struct block_defs block_rpb_defs = { }; static struct block_defs block_btb_defs = { - "btb", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCV}, + "btb", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCR, DBG_BUS_CLIENT_RBCV}, BTB_REG_DBG_SELECT, BTB_REG_DBG_DWORD_ENABLE, BTB_REG_DBG_SHIFT, BTB_REG_DBG_FORCE_VALID, BTB_REG_DBG_FORCE_FRAME, @@ -1009,8 +1060,9 @@ static struct block_defs block_btb_defs = { }; static struct block_defs block_pbf_defs = { - "pbf", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCV}, + "pbf", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCV}, PBF_REG_DBG_SELECT, PBF_REG_DBG_DWORD_ENABLE, PBF_REG_DBG_SHIFT, PBF_REG_DBG_FORCE_VALID, PBF_REG_DBG_FORCE_FRAME, @@ -1018,8 +1070,9 @@ static struct block_defs block_pbf_defs = { }; static struct block_defs block_rdif_defs = { - "rdif", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, + "rdif", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCT, DBG_BUS_CLIENT_RBCM}, RDIF_REG_DBG_SELECT, RDIF_REG_DBG_DWORD_ENABLE, RDIF_REG_DBG_SHIFT, RDIF_REG_DBG_FORCE_VALID, RDIF_REG_DBG_FORCE_FRAME, @@ -1027,8 +1080,9 @@ static struct block_defs block_rdif_defs = { }; static struct block_defs block_tdif_defs = { - "tdif", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, + "tdif", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCS, DBG_BUS_CLIENT_RBCS}, TDIF_REG_DBG_SELECT, TDIF_REG_DBG_DWORD_ENABLE, TDIF_REG_DBG_SHIFT, TDIF_REG_DBG_FORCE_VALID, TDIF_REG_DBG_FORCE_FRAME, @@ -1036,8 +1090,9 @@ static struct block_defs block_tdif_defs = { }; static struct block_defs block_cdu_defs = { - "cdu", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF}, + "cdu", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF}, CDU_REG_DBG_SELECT, CDU_REG_DBG_DWORD_ENABLE, CDU_REG_DBG_SHIFT, CDU_REG_DBG_FORCE_VALID, CDU_REG_DBG_FORCE_FRAME, @@ -1045,8 +1100,9 @@ static struct block_defs block_cdu_defs = { }; static struct block_defs block_ccfc_defs = { - "ccfc", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF}, + "ccfc", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF}, CCFC_REG_DBG_SELECT, CCFC_REG_DBG_DWORD_ENABLE, CCFC_REG_DBG_SHIFT, CCFC_REG_DBG_FORCE_VALID, CCFC_REG_DBG_FORCE_FRAME, @@ -1054,8 +1110,9 @@ static struct block_defs block_ccfc_defs = { }; static struct block_defs block_tcfc_defs = { - "tcfc", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF}, + "tcfc", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCF, DBG_BUS_CLIENT_RBCF}, TCFC_REG_DBG_SELECT, TCFC_REG_DBG_DWORD_ENABLE, TCFC_REG_DBG_SHIFT, TCFC_REG_DBG_FORCE_VALID, TCFC_REG_DBG_FORCE_FRAME, @@ -1063,8 +1120,9 @@ static struct block_defs block_tcfc_defs = { }; static struct block_defs block_igu_defs = { - "igu", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + "igu", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, IGU_REG_DBG_SELECT, IGU_REG_DBG_DWORD_ENABLE, IGU_REG_DBG_SHIFT, IGU_REG_DBG_FORCE_VALID, IGU_REG_DBG_FORCE_FRAME, @@ -1072,8 +1130,9 @@ static struct block_defs block_igu_defs = { }; static struct block_defs block_cau_defs = { - "cau", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, + 
"cau", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCP, DBG_BUS_CLIENT_RBCP}, CAU_REG_DBG_SELECT, CAU_REG_DBG_DWORD_ENABLE, CAU_REG_DBG_SHIFT, CAU_REG_DBG_FORCE_VALID, CAU_REG_DBG_FORCE_FRAME, @@ -1081,8 +1140,9 @@ static struct block_defs block_cau_defs = { }; static struct block_defs block_umac_defs = { - "umac", {false, false, true}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCZ}, + "umac", + {false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCZ}, UMAC_REG_DBG_SELECT, UMAC_REG_DBG_DWORD_ENABLE, UMAC_REG_DBG_SHIFT, UMAC_REG_DBG_FORCE_VALID, UMAC_REG_DBG_FORCE_FRAME, @@ -1090,22 +1150,23 @@ static struct block_defs block_umac_defs = { }; static struct block_defs block_xmac_defs = { - "xmac", {false, false, false}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + "xmac", {false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, 0, 0, 0, 0, 0, false, false, MAX_DBG_RESET_REGS, 0 }; static struct block_defs block_dbg_defs = { - "dbg", {false, false, false}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + "dbg", {false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, 0, 0, 0, 0, 0, true, true, DBG_RESET_REG_MISC_PL_PDA_VAUX, 3 }; static struct block_defs block_nig_defs = { - "nig", {true, true, true}, false, 0, - {DBG_BUS_CLIENT_RBCN, DBG_BUS_CLIENT_RBCN, DBG_BUS_CLIENT_RBCN}, + "nig", + {true, true}, false, 0, + {DBG_BUS_CLIENT_RBCN, DBG_BUS_CLIENT_RBCN}, NIG_REG_DBG_SELECT, NIG_REG_DBG_DWORD_ENABLE, NIG_REG_DBG_SHIFT, NIG_REG_DBG_FORCE_VALID, NIG_REG_DBG_FORCE_FRAME, @@ -1113,8 +1174,9 @@ static struct block_defs block_nig_defs = { }; static struct block_defs block_wol_defs = { - "wol", {false, false, true}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCZ}, + "wol", + {false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCZ}, WOL_REG_DBG_SELECT, WOL_REG_DBG_DWORD_ENABLE, WOL_REG_DBG_SHIFT, WOL_REG_DBG_FORCE_VALID, WOL_REG_DBG_FORCE_FRAME, @@ -1122,8 +1184,9 @@ static struct block_defs block_wol_defs = { }; static struct block_defs block_bmbn_defs = { - "bmbn", {false, false, true}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCB}, + "bmbn", + {false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCB}, BMBN_REG_DBG_SELECT, BMBN_REG_DBG_DWORD_ENABLE, BMBN_REG_DBG_SHIFT, BMBN_REG_DBG_FORCE_VALID, BMBN_REG_DBG_FORCE_FRAME, @@ -1131,15 +1194,16 @@ static struct block_defs block_bmbn_defs = { }; static struct block_defs block_ipc_defs = { - "ipc", {false, false, false}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + "ipc", {false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, 0, 0, 0, 0, 0, true, false, DBG_RESET_REG_MISCS_PL_UA, 8 }; static struct block_defs block_nwm_defs = { - "nwm", {false, false, true}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCW}, + "nwm", + {false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCW}, NWM_REG_DBG_SELECT, NWM_REG_DBG_DWORD_ENABLE, NWM_REG_DBG_SHIFT, NWM_REG_DBG_FORCE_VALID, NWM_REG_DBG_FORCE_FRAME, @@ -1147,22 +1211,29 @@ static struct block_defs block_nwm_defs = { }; static struct block_defs block_nws_defs = { - "nws", {false, false, false}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, - 0, 0, 0, 0, 0, + "nws", + {false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCW}, + NWS_REG_DBG_SELECT, 
NWS_REG_DBG_DWORD_ENABLE, + NWS_REG_DBG_SHIFT, NWS_REG_DBG_FORCE_VALID, + NWS_REG_DBG_FORCE_FRAME, true, false, DBG_RESET_REG_MISCS_PL_HV, 12 }; static struct block_defs block_ms_defs = { - "ms", {false, false, false}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, - 0, 0, 0, 0, 0, + "ms", + {false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCZ}, + MS_REG_DBG_SELECT, MS_REG_DBG_DWORD_ENABLE, + MS_REG_DBG_SHIFT, MS_REG_DBG_FORCE_VALID, + MS_REG_DBG_FORCE_FRAME, true, false, DBG_RESET_REG_MISCS_PL_HV, 13 }; static struct block_defs block_phy_pcie_defs = { - "phy_pcie", {false, false, true}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCH}, + "phy_pcie", + {false, true}, false, 0, + {MAX_DBG_BUS_CLIENTS, DBG_BUS_CLIENT_RBCH}, PCIE_REG_DBG_COMMON_SELECT, PCIE_REG_DBG_COMMON_DWORD_ENABLE, PCIE_REG_DBG_COMMON_SHIFT, PCIE_REG_DBG_COMMON_FORCE_VALID, PCIE_REG_DBG_COMMON_FORCE_FRAME, @@ -1170,22 +1241,57 @@ static struct block_defs block_phy_pcie_defs = { }; static struct block_defs block_led_defs = { - "led", {false, false, false}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + "led", {false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, false, DBG_RESET_REG_MISCS_PL_HV, 14 +}; + +static struct block_defs block_avs_wrap_defs = { + "avs_wrap", {false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + true, false, DBG_RESET_REG_MISCS_PL_UA, 11 +}; + +static struct block_defs block_rgfs_defs = { + "rgfs", {false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, 0, 0, 0, 0, 0, - true, true, DBG_RESET_REG_MISCS_PL_HV, 14 + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_tgfs_defs = { + "tgfs", {false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_ptld_defs = { + "ptld", {false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 +}; + +static struct block_defs block_ypld_defs = { + "ypld", {false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + 0, 0, 0, 0, 0, + false, false, MAX_DBG_RESET_REGS, 0 }; static struct block_defs block_misc_aeu_defs = { - "misc_aeu", {false, false, false}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + "misc_aeu", {false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, 0, 0, 0, 0, 0, false, false, MAX_DBG_RESET_REGS, 0 }; static struct block_defs block_bar0_map_defs = { - "bar0_map", {false, false, false}, false, 0, - {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, + "bar0_map", {false, false}, false, 0, + {MAX_DBG_BUS_CLIENTS, MAX_DBG_BUS_CLIENTS}, 0, 0, 0, 0, 0, false, false, MAX_DBG_RESET_REGS, 0 }; @@ -1269,6 +1375,11 @@ static struct block_defs *s_block_defs[MAX_BLOCK_ID] = { &block_ms_defs, &block_phy_pcie_defs, &block_led_defs, + &block_avs_wrap_defs, + &block_rgfs_defs, + &block_tgfs_defs, + &block_ptld_defs, + &block_ypld_defs, &block_misc_aeu_defs, &block_bar0_map_defs, }; @@ -1281,65 +1392,67 @@ static struct platform_defs s_platform_defs[] = { }; static struct grc_param_defs s_grc_param_defs[] = { - {{1, 1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_TSTORM */ - {{1, 1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_MSTORM */ - {{1, 1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_USTORM 
*/ - {{1, 1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_XSTORM */ - {{1, 1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_YSTORM */ - {{1, 1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_PSTORM */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_REGS */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_RAM */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_PBUF */ - {{0, 0, 0}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_IOR */ - {{0, 0, 0}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_VFC */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_CM_CTX */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_ILT */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_RSS */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_CAU */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_QM */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_MCP */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_RESERVED */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_CFC */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_IGU */ - {{0, 0, 0}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_BRB */ - {{0, 0, 0}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_BTB */ - {{0, 0, 0}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_BMB */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_NIG */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_MULD */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_PRS */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_DMAE */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_TM */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_SDM */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_DIF */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_STATIC */ - {{0, 0, 0}, 0, 1, false, 0, 0}, /* DBG_GRC_PARAM_UNSTALL */ - {{MAX_LCIDS, MAX_LCIDS, MAX_LCIDS}, 1, MAX_LCIDS, false, MAX_LCIDS, + {{1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_TSTORM */ + {{1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_MSTORM */ + {{1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_USTORM */ + {{1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_XSTORM */ + {{1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_YSTORM */ + {{1, 1}, 0, 1, false, 1, 1}, /* DBG_GRC_PARAM_DUMP_PSTORM */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_REGS */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_RAM */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_PBUF */ + {{0, 0}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_IOR */ + {{0, 0}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_VFC */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_CM_CTX */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_ILT */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_RSS */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_CAU */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_QM */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_MCP */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_RESERVED */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_CFC */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_IGU */ + {{0, 0}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_BRB */ + {{0, 0}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_BTB */ + {{0, 0}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_BMB */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_NIG */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_MULD */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_PRS */ + {{1, 1}, 0, 1, false, 0, 1}, /* 
DBG_GRC_PARAM_DUMP_DMAE */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_TM */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_SDM */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_DIF */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_STATIC */ + {{0, 0}, 0, 1, false, 0, 0}, /* DBG_GRC_PARAM_UNSTALL */ + {{MAX_LCIDS, MAX_LCIDS}, 1, MAX_LCIDS, false, MAX_LCIDS, MAX_LCIDS}, /* DBG_GRC_PARAM_NUM_LCIDS */ - {{MAX_LTIDS, MAX_LTIDS, MAX_LTIDS}, 1, MAX_LTIDS, false, MAX_LTIDS, + {{MAX_LTIDS, MAX_LTIDS}, 1, MAX_LTIDS, false, MAX_LTIDS, MAX_LTIDS}, /* DBG_GRC_PARAM_NUM_LTIDS */ - {{0, 0, 0}, 0, 1, true, 0, 0}, /* DBG_GRC_PARAM_EXCLUDE_ALL */ - {{0, 0, 0}, 0, 1, true, 0, 0}, /* DBG_GRC_PARAM_CRASH */ - {{0, 0, 0}, 0, 1, false, 1, 0}, /* DBG_GRC_PARAM_PARITY_SAFE */ - {{1, 1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_CM */ - {{1, 1, 1}, 0, 1, false, 0, 1} /* DBG_GRC_PARAM_DUMP_PHY */ + {{0, 0}, 0, 1, true, 0, 0}, /* DBG_GRC_PARAM_EXCLUDE_ALL */ + {{0, 0}, 0, 1, true, 0, 0}, /* DBG_GRC_PARAM_CRASH */ + {{0, 0}, 0, 1, false, 1, 0}, /* DBG_GRC_PARAM_PARITY_SAFE */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_CM */ + {{1, 1}, 0, 1, false, 0, 1}, /* DBG_GRC_PARAM_DUMP_PHY */ + {{0, 0}, 0, 1, false, 0, 0}, /* DBG_GRC_PARAM_NO_MCP */ + {{0, 0}, 0, 1, false, 0, 0} /* DBG_GRC_PARAM_NO_FW_VER */ }; static struct rss_mem_defs s_rss_mem_defs[] = { { "rss_mem_cid", "rss_cid", 0, - {256, 256, 320}, - {32, 32, 32} }, + {256, 320}, + {32, 32} }, { "rss_mem_key_msb", "rss_key", 1024, - {128, 128, 208}, - {256, 256, 256} }, + {128, 208}, + {256, 256} }, { "rss_mem_key_lsb", "rss_key", 2048, - {128, 128, 208}, - {64, 64, 64} }, + {128, 208}, + {64, 64} }, { "rss_mem_info", "rss_info", 3072, - {128, 128, 208}, - {16, 16, 16} }, + {128, 208}, + {16, 16} }, { "rss_mem_ind", "rss_ind", 4096, - {(128 * 128), (128 * 128), (128 * 208)}, - {16, 16, 16} } + {(128 * 128), (128 * 208)}, + {16, 16} } }; static struct vfc_ram_defs s_vfc_ram_defs[] = { @@ -1352,32 +1465,32 @@ static struct vfc_ram_defs s_vfc_ram_defs[] = { static struct big_ram_defs s_big_ram_defs[] = { { "BRB", MEM_GROUP_BRB_MEM, MEM_GROUP_BRB_RAM, DBG_GRC_PARAM_DUMP_BRB, BRB_REG_BIG_RAM_ADDRESS, BRB_REG_BIG_RAM_DATA, - {4800, 4800, 5632} }, + {4800, 5632} }, { "BTB", MEM_GROUP_BTB_MEM, MEM_GROUP_BTB_RAM, DBG_GRC_PARAM_DUMP_BTB, BTB_REG_BIG_RAM_ADDRESS, BTB_REG_BIG_RAM_DATA, - {2880, 2880, 3680} }, + {2880, 3680} }, { "BMB", MEM_GROUP_BMB_MEM, MEM_GROUP_BMB_RAM, DBG_GRC_PARAM_DUMP_BMB, BMB_REG_BIG_RAM_ADDRESS, BMB_REG_BIG_RAM_DATA, - {1152, 1152, 1152} } + {1152, 1152} } }; static struct reset_reg_defs s_reset_regs_defs[] = { { MISCS_REG_RESET_PL_UA, 0x0, - {true, true, true} }, /* DBG_RESET_REG_MISCS_PL_UA */ + {true, true} }, /* DBG_RESET_REG_MISCS_PL_UA */ { MISCS_REG_RESET_PL_HV, 0x0, - {true, true, true} }, /* DBG_RESET_REG_MISCS_PL_HV */ + {true, true} }, /* DBG_RESET_REG_MISCS_PL_HV */ { MISCS_REG_RESET_PL_HV_2, 0x0, - {false, false, true} }, /* DBG_RESET_REG_MISCS_PL_HV_2 */ + {false, true} }, /* DBG_RESET_REG_MISCS_PL_HV_2 */ { MISC_REG_RESET_PL_UA, 0x0, - {true, true, true} }, /* DBG_RESET_REG_MISC_PL_UA */ + {true, true} }, /* DBG_RESET_REG_MISC_PL_UA */ { MISC_REG_RESET_PL_HV, 0x0, - {true, true, true} }, /* DBG_RESET_REG_MISC_PL_HV */ + {true, true} }, /* DBG_RESET_REG_MISC_PL_HV */ { MISC_REG_RESET_PL_PDA_VMAIN_1, 0x4404040, - {true, true, true} }, /* DBG_RESET_REG_MISC_PL_PDA_VMAIN_1 */ + {true, true} }, /* DBG_RESET_REG_MISC_PL_PDA_VMAIN_1 */ { MISC_REG_RESET_PL_PDA_VMAIN_2, 0x7c00007, - {true, true, true} }, /* 
DBG_RESET_REG_MISC_PL_PDA_VMAIN_2 */ + {true, true} }, /* DBG_RESET_REG_MISC_PL_PDA_VMAIN_2 */ { MISC_REG_RESET_PL_PDA_VAUX, 0x2, - {true, true, true} }, /* DBG_RESET_REG_MISC_PL_PDA_VAUX */ + {true, true} }, /* DBG_RESET_REG_MISC_PL_PDA_VAUX */ }; static struct phy_defs s_phy_defs[] = { @@ -1410,6 +1523,26 @@ static u32 qed_read_unaligned_dword(u8 *buf) return dword; } +/* Returns the value of the specified GRC param */ +static u32 qed_grc_get_param(struct qed_hwfn *p_hwfn, + enum dbg_grc_params grc_param) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + + return dev_data->grc.param_val[grc_param]; +} + +/* Initializes the GRC parameters */ +static void qed_dbg_grc_init_params(struct qed_hwfn *p_hwfn) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + + if (!dev_data->grc.params_initialized) { + qed_dbg_grc_set_params_default(p_hwfn); + dev_data->grc.params_initialized = 1; + } +} + /* Initializes debug data for the specified device */ static enum dbg_status qed_dbg_dev_init(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) @@ -1431,6 +1564,10 @@ static enum dbg_status qed_dbg_dev_init(struct qed_hwfn *p_hwfn, dev_data->platform_id = PLATFORM_ASIC; dev_data->mode_enable[MODE_ASIC] = 1; + + /* Initializes the GRC parameters */ + qed_dbg_grc_init_params(p_hwfn); + dev_data->initialized = true; return DBG_STATUS_OK; } @@ -1561,7 +1698,7 @@ static u32 qed_dump_fw_ver_param(struct qed_hwfn *p_hwfn, int printed_chars; u32 offset = 0; - if (dump) { + if (dump && !qed_grc_get_param(p_hwfn, DBG_GRC_PARAM_NO_FW_VER)) { /* Read FW image/version from PRAM in a non-reset SEMI */ bool found = false; u8 storm_id; @@ -1622,7 +1759,7 @@ static u32 qed_dump_mfw_ver_param(struct qed_hwfn *p_hwfn, { char mfw_ver_str[16] = EMPTY_FW_VERSION_STR; - if (dump) { + if (dump && !qed_grc_get_param(p_hwfn, DBG_GRC_PARAM_NO_FW_VER)) { u32 global_section_offsize, global_section_addr, mfw_ver; u32 public_data_addr, global_section_offsize_addr; int printed_chars; @@ -1683,15 +1820,13 @@ static u32 qed_dump_common_global_params(struct qed_hwfn *p_hwfn, bool dump, u8 num_specific_global_params) { + u8 num_params = NUM_COMMON_GLOBAL_PARAMS + num_specific_global_params; struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; u32 offset = 0; /* Find platform string and dump global params section header */ offset += qed_dump_section_hdr(dump_buf + offset, - dump, - "global_params", - NUM_COMMON_GLOBAL_PARAMS + - num_specific_global_params); + dump, "global_params", num_params); /* Store params */ offset += qed_dump_fw_ver_param(p_hwfn, p_ptt, dump_buf + offset, dump); @@ -1815,37 +1950,6 @@ static bool qed_is_mode_match(struct qed_hwfn *p_hwfn, u16 *modes_buf_offset) } } -/* Returns the value of the specified GRC param */ -static u32 qed_grc_get_param(struct qed_hwfn *p_hwfn, - enum dbg_grc_params grc_param) -{ - struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; - - return dev_data->grc.param_val[grc_param]; -} - -/* Clear all GRC params */ -static void qed_dbg_grc_clear_params(struct qed_hwfn *p_hwfn) -{ - struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; - u32 i; - - for (i = 0; i < MAX_DBG_GRC_PARAMS; i++) - dev_data->grc.param_set_by_user[i] = 0; -} - -/* Assign default GRC param values */ -static void qed_dbg_grc_set_params_default(struct qed_hwfn *p_hwfn) -{ - struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; - u32 i; - - for (i = 0; i < MAX_DBG_GRC_PARAMS; i++) - if (!dev_data->grc.param_set_by_user[i]) - dev_data->grc.param_val[i] = - s_grc_param_defs[i].default_val[dev_data->chip_id]; -} - /* 
Returns true if the specified entity (indicated by GRC param) should be * included in the dump, false otherwise. */ @@ -1971,7 +2075,7 @@ static void qed_grc_unreset_blocks(struct qed_hwfn *p_hwfn, } } -/* Returns the attention name offsets of the specified block */ +/* Returns the attention block data of the specified block */ static const struct dbg_attn_block_type_data * qed_get_block_attn_data(enum block_id block_id, enum dbg_attn_type attn_type) { @@ -2040,7 +2144,7 @@ static void qed_grc_clear_all_prty(struct qed_hwfn *p_hwfn, * The following parameters are dumped: * - 'count' = num_dumped_entries * - 'split' = split_type - * - 'id'i = split_id (dumped only if split_id >= 0) + * - 'id' = split_id (dumped only if split_id >= 0) * - 'param_name' = param_val (user param, dumped only if param_name != NULL and * param_val != NULL) */ @@ -2069,21 +2173,81 @@ static u32 qed_grc_dump_regs_hdr(u32 *dump_buf, return offset; } -/* Dumps GRC register/memory. Returns the dumped size in dwords. */ +/* Dumps the GRC registers in the specified address range. + * Returns the dumped size in dwords. + */ +static u32 qed_grc_dump_addr_range(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, + bool dump, u32 addr, u32 len) +{ + u32 byte_addr = DWORDS_TO_BYTES(addr), offset = 0, i; + + if (dump) + for (i = 0; i < len; i++, byte_addr += BYTES_IN_DWORD, offset++) + *(dump_buf + offset) = qed_rd(p_hwfn, p_ptt, byte_addr); + else + offset += len; + return offset; +} + +/* Dumps GRC registers sequence header. Returns the dumped size in dwords. */ +static u32 qed_grc_dump_reg_entry_hdr(u32 *dump_buf, bool dump, u32 addr, + u32 len) +{ + if (dump) + *dump_buf = addr | (len << REG_DUMP_LEN_SHIFT); + return 1; +} + +/* Dumps GRC registers sequence. Returns the dumped size in dwords. */ static u32 qed_grc_dump_reg_entry(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, u32 *dump_buf, bool dump, u32 addr, u32 len) { - u32 offset = 0, i; + u32 offset = 0; + + offset += qed_grc_dump_reg_entry_hdr(dump_buf, dump, addr, len); + offset += qed_grc_dump_addr_range(p_hwfn, + p_ptt, + dump_buf + offset, dump, addr, len); + return offset; +} + +/* Dumps GRC registers sequence with skip cycle. + * Returns the dumped size in dwords. 
+ */ +static u32 qed_grc_dump_reg_entry_skip(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u32 *dump_buf, + bool dump, u32 addr, u32 total_len, + u32 read_len, u32 skip_len) +{ + u32 offset = 0, reg_offset = 0; + offset += qed_grc_dump_reg_entry_hdr(dump_buf, dump, addr, total_len); if (dump) { - *(dump_buf + offset++) = addr | (len << REG_DUMP_LEN_SHIFT); - for (i = 0; i < len; i++, addr++, offset++) - *(dump_buf + offset) = qed_rd(p_hwfn, - p_ptt, - DWORDS_TO_BYTES(addr)); + while (reg_offset < total_len) { + u32 curr_len = min_t(u32, + read_len, + total_len - reg_offset); + offset += qed_grc_dump_addr_range(p_hwfn, + p_ptt, + dump_buf + offset, + dump, addr, curr_len); + reg_offset += curr_len; + addr += curr_len; + if (reg_offset < total_len) { + curr_len = min_t(u32, + skip_len, + total_len - skip_len); + memset(dump_buf + offset, 0, + DWORDS_TO_BYTES(curr_len)); + offset += curr_len; + reg_offset += curr_len; + addr += curr_len; + } + } } else { - offset += len + 1; + offset += total_len; } return offset; @@ -2124,14 +2288,17 @@ static u32 qed_grc_dump_regs_entries(struct qed_hwfn *p_hwfn, const struct dbg_dump_reg *reg = (const struct dbg_dump_reg *) &input_regs_arr.ptr[input_offset]; + u32 addr, len; + addr = GET_FIELD(reg->data, + DBG_DUMP_REG_ADDRESS); + len = GET_FIELD(reg->data, DBG_DUMP_REG_LENGTH); offset += - qed_grc_dump_reg_entry(p_hwfn, p_ptt, - dump_buf + offset, dump, - GET_FIELD(reg->data, - DBG_DUMP_REG_ADDRESS), - GET_FIELD(reg->data, - DBG_DUMP_REG_LENGTH)); + qed_grc_dump_reg_entry(p_hwfn, p_ptt, + dump_buf + offset, + dump, + addr, + len); (*num_dumped_reg_entries)++; } } else { @@ -2194,8 +2361,14 @@ static u32 qed_grc_dump_registers(struct qed_hwfn *p_hwfn, const char *param_name, const char *param_val) { struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + struct chip_platform_defs *p_platform_defs; u32 offset = 0, input_offset = 0; - u8 port_id, pf_id; + struct chip_defs *p_chip_defs; + u8 port_id, pf_id, vf_id; + u16 fid; + + p_chip_defs = &s_chip_defs[dev_data->chip_id]; + p_platform_defs = &p_chip_defs->per_platform[dev_data->platform_id]; if (dump) DP_VERBOSE(p_hwfn, QED_MSG_DEBUG, "Dumping registers...\n"); @@ -2214,7 +2387,6 @@ static u32 qed_grc_dump_registers(struct qed_hwfn *p_hwfn, switch (split_type_id) { case SPLIT_TYPE_NONE: - case SPLIT_TYPE_VF: offset += qed_grc_dump_split_data(p_hwfn, p_ptt, curr_input_regs_arr, @@ -2227,10 +2399,7 @@ static u32 qed_grc_dump_registers(struct qed_hwfn *p_hwfn, param_val); break; case SPLIT_TYPE_PORT: - for (port_id = 0; - port_id < - s_chip_defs[dev_data->chip_id]. - per_platform[dev_data->platform_id].num_ports; + for (port_id = 0; port_id < p_platform_defs->num_ports; port_id++) { if (dump) qed_port_pretend(p_hwfn, p_ptt, @@ -2247,20 +2416,48 @@ static u32 qed_grc_dump_registers(struct qed_hwfn *p_hwfn, break; case SPLIT_TYPE_PF: case SPLIT_TYPE_PORT_PF: - for (pf_id = 0; - pf_id < - s_chip_defs[dev_data->chip_id]. 
- per_platform[dev_data->platform_id].num_pfs; + for (pf_id = 0; pf_id < p_platform_defs->num_pfs; pf_id++) { - if (dump) - qed_fid_pretend(p_hwfn, p_ptt, pf_id); - offset += qed_grc_dump_split_data(p_hwfn, - p_ptt, - curr_input_regs_arr, - dump_buf + offset, - dump, block_enable, - "pf", pf_id, param_name, - param_val); + u8 pfid_shift = + PXP_PRETEND_CONCRETE_FID_PFID_SHIFT; + + if (dump) { + fid = pf_id << pfid_shift; + qed_fid_pretend(p_hwfn, p_ptt, fid); + } + + offset += + qed_grc_dump_split_data(p_hwfn, p_ptt, + curr_input_regs_arr, + dump_buf + offset, + dump, block_enable, + "pf", pf_id, + param_name, + param_val); + } + break; + case SPLIT_TYPE_VF: + for (vf_id = 0; vf_id < p_platform_defs->num_vfs; + vf_id++) { + u8 vfvalid_shift = + PXP_PRETEND_CONCRETE_FID_VFVALID_SHIFT; + u8 vfid_shift = + PXP_PRETEND_CONCRETE_FID_VFID_SHIFT; + + if (dump) { + fid = BIT(vfvalid_shift) | + (vf_id << vfid_shift); + qed_fid_pretend(p_hwfn, p_ptt, fid); + } + + offset += + qed_grc_dump_split_data(p_hwfn, p_ptt, + curr_input_regs_arr, + dump_buf + offset, + dump, block_enable, + "vf", vf_id, + param_name, + param_val); } break; default: @@ -2271,8 +2468,11 @@ static u32 qed_grc_dump_registers(struct qed_hwfn *p_hwfn, } /* Pretend to original PF */ - if (dump) - qed_fid_pretend(p_hwfn, p_ptt, p_hwfn->rel_pf_id); + if (dump) { + fid = p_hwfn->rel_pf_id << PXP_PRETEND_CONCRETE_FID_PFID_SHIFT; + qed_fid_pretend(p_hwfn, p_ptt, fid); + } + return offset; } @@ -2291,13 +2491,14 @@ static u32 qed_grc_dump_reset_regs(struct qed_hwfn *p_hwfn, /* Write reset registers */ for (i = 0; i < MAX_DBG_RESET_REGS; i++) { if (s_reset_regs_defs[i].exists[dev_data->chip_id]) { + u32 addr = BYTES_TO_DWORDS(s_reset_regs_defs[i].addr); + offset += qed_grc_dump_reg_entry(p_hwfn, p_ptt, dump_buf + offset, dump, - BYTES_TO_DWORDS - (s_reset_regs_defs - [i].addr), 1); + addr, + 1); num_regs++; } } @@ -2339,6 +2540,7 @@ static u32 qed_grc_dump_modified_regs(struct qed_hwfn *p_hwfn, &attn_reg_arr[reg_idx]; u16 modes_buf_offset; bool eval_mode; + u32 addr; /* Check mode */ eval_mode = GET_FIELD(reg_data->mode.data, @@ -2349,19 +2551,23 @@ static u32 qed_grc_dump_modified_regs(struct qed_hwfn *p_hwfn, if (!eval_mode || qed_is_mode_match(p_hwfn, &modes_buf_offset)) { /* Mode match - read and dump registers */ - offset += qed_grc_dump_reg_entry(p_hwfn, - p_ptt, - dump_buf + offset, - dump, - reg_data->mask_address, - 1); - offset += qed_grc_dump_reg_entry(p_hwfn, - p_ptt, - dump_buf + offset, - dump, - GET_FIELD(reg_data->data, - DBG_ATTN_REG_STS_ADDRESS), - 1); + addr = reg_data->mask_address; + offset += + qed_grc_dump_reg_entry(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + addr, + 1); + addr = GET_FIELD(reg_data->data, + DBG_ATTN_REG_STS_ADDRESS); + offset += + qed_grc_dump_reg_entry(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + addr, + 1); num_reg_entries += 2; } } @@ -2369,18 +2575,21 @@ static u32 qed_grc_dump_modified_regs(struct qed_hwfn *p_hwfn, /* Write storm stall status registers */ for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { + u32 addr; + if (dev_data->block_in_reset[s_storm_defs[storm_id].block_id] && dump) continue; + addr = + BYTES_TO_DWORDS(s_storm_defs[storm_id].sem_fast_mem_addr + + SEM_FAST_REG_STALLED); offset += qed_grc_dump_reg_entry(p_hwfn, - p_ptt, - dump_buf + offset, - dump, - BYTES_TO_DWORDS(s_storm_defs[storm_id]. 
- sem_fast_mem_addr + - SEM_FAST_REG_STALLED), - 1); + p_ptt, + dump_buf + offset, + dump, + addr, + 1); num_reg_entries++; } @@ -2392,11 +2601,47 @@ static u32 qed_grc_dump_modified_regs(struct qed_hwfn *p_hwfn, return offset; } +/* Dumps registers that can't be represented in the debug arrays */ +static u32 qed_grc_dump_special_regs(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + u32 *dump_buf, bool dump) +{ + u32 offset = 0, addr; + + offset += qed_grc_dump_regs_hdr(dump_buf, + dump, 2, "eng", -1, NULL, NULL); + + /* Dump R/TDIF_REG_DEBUG_ERROR_INFO_SIZE (every 8'th register should be + * skipped). + */ + addr = BYTES_TO_DWORDS(RDIF_REG_DEBUG_ERROR_INFO); + offset += qed_grc_dump_reg_entry_skip(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + addr, + RDIF_REG_DEBUG_ERROR_INFO_SIZE, + 7, + 1); + addr = BYTES_TO_DWORDS(TDIF_REG_DEBUG_ERROR_INFO); + offset += + qed_grc_dump_reg_entry_skip(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + addr, + TDIF_REG_DEBUG_ERROR_INFO_SIZE, + 7, + 1); + + return offset; +} + /* Dumps a GRC memory header (section and params). * The following parameters are dumped: * name - name is dumped only if it's not NULL. - * addr - byte_addr is dumped only if name is NULL. - * len - dword_len is always dumped. + * addr - addr is dumped only if name is NULL. + * len - len is always dumped. * width - bit_width is dumped if it's not zero. * packed - packed=1 is dumped if it's not false. * mem_group - mem_group is always dumped. @@ -2408,8 +2653,8 @@ static u32 qed_grc_dump_mem_hdr(struct qed_hwfn *p_hwfn, u32 *dump_buf, bool dump, const char *name, - u32 byte_addr, - u32 dword_len, + u32 addr, + u32 len, u32 bit_width, bool packed, const char *mem_group, @@ -2419,7 +2664,7 @@ static u32 qed_grc_dump_mem_hdr(struct qed_hwfn *p_hwfn, u32 offset = 0; char buf[64]; - if (!dword_len) + if (!len) DP_NOTICE(p_hwfn, "Unexpected GRC Dump error: dumped memory size must be non-zero\n"); if (bit_width) @@ -2446,20 +2691,21 @@ static u32 qed_grc_dump_mem_hdr(struct qed_hwfn *p_hwfn, DP_VERBOSE(p_hwfn, QED_MSG_DEBUG, "Dumping %d registers from %s...\n", - dword_len, buf); + len, buf); } else { /* Dump address */ offset += qed_dump_num_param(dump_buf + offset, - dump, "addr", byte_addr); - if (dump && dword_len > 64) + dump, "addr", + DWORDS_TO_BYTES(addr)); + if (dump && len > 64) DP_VERBOSE(p_hwfn, QED_MSG_DEBUG, "Dumping %d registers from address 0x%x...\n", - dword_len, byte_addr); + len, (u32)DWORDS_TO_BYTES(addr)); } /* Dump len */ - offset += qed_dump_num_param(dump_buf + offset, dump, "len", dword_len); + offset += qed_dump_num_param(dump_buf + offset, dump, "len", len); /* Dump bit width */ if (bit_width) @@ -2492,8 +2738,8 @@ static u32 qed_grc_dump_mem(struct qed_hwfn *p_hwfn, u32 *dump_buf, bool dump, const char *name, - u32 byte_addr, - u32 dword_len, + u32 addr, + u32 len, u32 bit_width, bool packed, const char *mem_group, @@ -2505,21 +2751,14 @@ static u32 qed_grc_dump_mem(struct qed_hwfn *p_hwfn, dump_buf + offset, dump, name, - byte_addr, - dword_len, + addr, + len, bit_width, packed, mem_group, is_storm, storm_letter); - if (dump) { - u32 i; - - for (i = 0; i < dword_len; - i++, byte_addr += BYTES_IN_DWORD, offset++) - *(dump_buf + offset) = qed_rd(p_hwfn, p_ptt, byte_addr); - } else { - offset += dword_len; - } - + offset += qed_grc_dump_addr_range(p_hwfn, + p_ptt, + dump_buf + offset, dump, addr, len); return offset; } @@ -2575,25 +2814,41 @@ static u32 qed_grc_dump_mem_entries(struct qed_hwfn *p_hwfn, if (qed_grc_is_mem_included(p_hwfn, (enum 
block_id)cond_hdr->block_id, mem_group_id)) { - u32 mem_byte_addr = - DWORDS_TO_BYTES(GET_FIELD(mem->dword0, - DBG_DUMP_MEM_ADDRESS)); + u32 mem_addr = GET_FIELD(mem->dword0, + DBG_DUMP_MEM_ADDRESS); u32 mem_len = GET_FIELD(mem->dword1, DBG_DUMP_MEM_LENGTH); + enum dbg_grc_params grc_param; char storm_letter = 'a'; bool is_storm = false; /* Update memory length for CCFC/TCFC memories * according to number of LCIDs/LTIDs. */ - if (mem_group_id == MEM_GROUP_CONN_CFC_MEM) + if (mem_group_id == MEM_GROUP_CONN_CFC_MEM) { + if (mem_len % MAX_LCIDS != 0) { + DP_NOTICE(p_hwfn, + "Invalid CCFC connection memory size\n"); + return 0; + } + + grc_param = DBG_GRC_PARAM_NUM_LCIDS; mem_len = qed_grc_get_param(p_hwfn, - DBG_GRC_PARAM_NUM_LCIDS) - * (mem_len / MAX_LCIDS); - else if (mem_group_id == MEM_GROUP_TASK_CFC_MEM) + grc_param) * + (mem_len / MAX_LCIDS); + } else if (mem_group_id == + MEM_GROUP_TASK_CFC_MEM) { + if (mem_len % MAX_LTIDS != 0) { + DP_NOTICE(p_hwfn, + "Invalid TCFC task memory size\n"); + return 0; + } + + grc_param = DBG_GRC_PARAM_NUM_LTIDS; mem_len = qed_grc_get_param(p_hwfn, - DBG_GRC_PARAM_NUM_LTIDS) - * (mem_len / MAX_LTIDS); + grc_param) * + (mem_len / MAX_LTIDS); + } /* If memory is associated with Storm, update * Storm details. @@ -2610,7 +2865,7 @@ static u32 qed_grc_dump_mem_entries(struct qed_hwfn *p_hwfn, /* Dump memory */ offset += qed_grc_dump_mem(p_hwfn, p_ptt, dump_buf + offset, dump, NULL, - mem_byte_addr, mem_len, 0, + mem_addr, mem_len, 0, false, s_mem_group_names[mem_group_id], is_storm, storm_letter); @@ -2799,29 +3054,31 @@ static u32 qed_grc_dump_iors(struct qed_hwfn *p_hwfn, u32 offset = 0; for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { - if (qed_grc_is_storm_included(p_hwfn, - (enum dbg_storms)storm_id)) { - for (set_id = 0; set_id < NUM_IOR_SETS; set_id++) { - u32 addr = - s_storm_defs[storm_id].sem_fast_mem_addr + - SEM_FAST_REG_STORM_REG_FILE + - DWORDS_TO_BYTES(IOR_SET_OFFSET(set_id)); + struct storm_defs *storm = &s_storm_defs[storm_id]; - buf[strlen(buf) - 1] = '0' + set_id; - offset += qed_grc_dump_mem(p_hwfn, - p_ptt, - dump_buf + offset, - dump, - buf, - addr, - IORS_PER_SET, - 32, - false, - "ior", - true, - s_storm_defs - [storm_id].letter); - } + if (!qed_grc_is_storm_included(p_hwfn, + (enum dbg_storms)storm_id)) + continue; + + for (set_id = 0; set_id < NUM_IOR_SETS; set_id++) { + u32 dwords, addr; + + dwords = storm->sem_fast_mem_addr + + SEM_FAST_REG_STORM_REG_FILE; + addr = BYTES_TO_DWORDS(dwords) + IOR_SET_OFFSET(set_id); + buf[strlen(buf) - 1] = '0' + set_id; + offset += qed_grc_dump_mem(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + buf, + addr, + IORS_PER_SET, + 32, + false, + "ior", + true, + storm->letter); } } @@ -2990,34 +3247,39 @@ static u32 qed_grc_dump_rss(struct qed_hwfn *p_hwfn, struct rss_mem_defs *rss_defs = &s_rss_mem_defs[rss_mem_id]; u32 num_entries = rss_defs->num_entries[dev_data->chip_id]; u32 entry_width = rss_defs->entry_width[dev_data->chip_id]; - u32 total_size = (num_entries * entry_width) / 32; + u32 total_dwords = (num_entries * entry_width) / 32; + u32 size = RSS_REG_RSS_RAM_DATA_SIZE; bool packed = (entry_width == 16); - u32 addr = rss_defs->addr; - u32 i, j; + u32 rss_addr = rss_defs->addr; + u32 i, addr; offset += qed_grc_dump_mem_hdr(p_hwfn, dump_buf + offset, dump, rss_defs->mem_name, - addr, - total_size, + 0, + total_dwords, entry_width, packed, rss_defs->type_name, false, 0); if (!dump) { - offset += total_size; + offset += total_dwords; continue; } /* Dump RSS data */ - for (i = 0; i < 
BYTES_TO_DWORDS(total_size); i++, addr++) { - qed_wr(p_hwfn, p_ptt, RSS_REG_RSS_RAM_ADDR, addr); - for (j = 0; j < BYTES_IN_DWORD; j++, offset++) - *(dump_buf + offset) = - qed_rd(p_hwfn, p_ptt, - RSS_REG_RSS_RAM_DATA + - DWORDS_TO_BYTES(j)); + for (i = 0; i < total_dwords; + i += RSS_REG_RSS_RAM_DATA_SIZE, rss_addr++) { + addr = BYTES_TO_DWORDS(RSS_REG_RSS_RAM_DATA); + qed_wr(p_hwfn, p_ptt, RSS_REG_RSS_RAM_ADDR, rss_addr); + offset += qed_grc_dump_addr_range(p_hwfn, + p_ptt, + dump_buf + + offset, + dump, + addr, + size); } } @@ -3030,19 +3292,19 @@ static u32 qed_grc_dump_big_ram(struct qed_hwfn *p_hwfn, u32 *dump_buf, bool dump, u8 big_ram_id) { struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 total_blocks, ram_size, offset = 0, i; char mem_name[12] = "???_BIG_RAM"; char type_name[8] = "???_RAM"; - u32 ram_size, total_blocks; - u32 offset = 0, i, j; + struct big_ram_defs *big_ram; - total_blocks = - s_big_ram_defs[big_ram_id].num_of_blocks[dev_data->chip_id]; + big_ram = &s_big_ram_defs[big_ram_id]; + total_blocks = big_ram->num_of_blocks[dev_data->chip_id]; ram_size = total_blocks * BIG_RAM_BLOCK_SIZE_DWORDS; - strncpy(type_name, s_big_ram_defs[big_ram_id].instance_name, - strlen(s_big_ram_defs[big_ram_id].instance_name)); - strncpy(mem_name, s_big_ram_defs[big_ram_id].instance_name, - strlen(s_big_ram_defs[big_ram_id].instance_name)); + strncpy(type_name, big_ram->instance_name, + strlen(big_ram->instance_name)); + strncpy(mem_name, big_ram->instance_name, + strlen(big_ram->instance_name)); /* Dump memory header */ offset += qed_grc_dump_mem_hdr(p_hwfn, @@ -3059,13 +3321,17 @@ static u32 qed_grc_dump_big_ram(struct qed_hwfn *p_hwfn, /* Read and dump Big RAM data */ for (i = 0; i < total_blocks / 2; i++) { - qed_wr(p_hwfn, p_ptt, s_big_ram_defs[big_ram_id].addr_reg_addr, - i); - for (j = 0; j < 2 * BIG_RAM_BLOCK_SIZE_DWORDS; j++, offset++) - *(dump_buf + offset) = qed_rd(p_hwfn, p_ptt, - s_big_ram_defs[big_ram_id]. 
- data_reg_addr + - DWORDS_TO_BYTES(j)); + u32 addr, len; + + qed_wr(p_hwfn, p_ptt, big_ram->addr_reg_addr, i); + addr = BYTES_TO_DWORDS(big_ram->data_reg_addr); + len = 2 * BIG_RAM_BLOCK_SIZE_DWORDS; + offset += qed_grc_dump_addr_range(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + addr, + len); } return offset; @@ -3075,11 +3341,11 @@ static u32 qed_grc_dump_mcp(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) { bool block_enable[MAX_BLOCK_ID] = { 0 }; + u32 offset = 0, addr; bool halted = false; - u32 offset = 0; /* Halt MCP */ - if (dump) { + if (dump && !qed_grc_get_param(p_hwfn, DBG_GRC_PARAM_NO_MCP)) { halted = !qed_mcp_halt(p_hwfn, p_ptt); if (!halted) DP_NOTICE(p_hwfn, "MCP halt failed!\n"); @@ -3091,7 +3357,7 @@ static u32 qed_grc_dump_mcp(struct qed_hwfn *p_hwfn, dump_buf + offset, dump, NULL, - MCP_REG_SCRATCH, + BYTES_TO_DWORDS(MCP_REG_SCRATCH), MCP_REG_SCRATCH_SIZE, 0, false, "MCP", false, 0); @@ -3101,7 +3367,7 @@ static u32 qed_grc_dump_mcp(struct qed_hwfn *p_hwfn, dump_buf + offset, dump, NULL, - MCP_REG_CPU_REG_FILE, + BYTES_TO_DWORDS(MCP_REG_CPU_REG_FILE), MCP_REG_CPU_REG_FILE_SIZE, 0, false, "MCP", false, 0); @@ -3115,12 +3381,13 @@ static u32 qed_grc_dump_mcp(struct qed_hwfn *p_hwfn, /* Dump required non-MCP registers */ offset += qed_grc_dump_regs_hdr(dump_buf + offset, dump, 1, "eng", -1, "block", "MCP"); + addr = BYTES_TO_DWORDS(MISC_REG_SHARED_MEM_ADDR); offset += qed_grc_dump_reg_entry(p_hwfn, p_ptt, dump_buf + offset, dump, - BYTES_TO_DWORDS - (MISC_REG_SHARED_MEM_ADDR), 1); + addr, + 1); /* Release MCP */ if (halted && qed_mcp_resume(p_hwfn, p_ptt)) @@ -3212,7 +3479,7 @@ static u32 qed_grc_dump_static_debug(struct qed_hwfn *p_hwfn, { u32 block_dwords = NUM_DBG_BUS_LINES * STATIC_DEBUG_LINE_DWORDS; struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; - u32 offset = 0, block_id, line_id, addr, i; + u32 offset = 0, block_id, line_id; struct block_defs *p_block_defs; if (dump) { @@ -3255,6 +3522,8 @@ static u32 qed_grc_dump_static_debug(struct qed_hwfn *p_hwfn, if (dump && !dev_data->block_in_reset[block_id]) { u8 dbg_client_id = p_block_defs->dbg_client_id[dev_data->chip_id]; + u32 addr = BYTES_TO_DWORDS(DBG_REG_CALENDAR_OUT_DATA); + u32 len = STATIC_DEBUG_LINE_DWORDS; /* Enable block's client */ qed_bus_enable_clients(p_hwfn, p_ptt, @@ -3270,11 +3539,13 @@ static u32 qed_grc_dump_static_debug(struct qed_hwfn *p_hwfn, 0xf, 0, 0, 0); /* Read debug line info */ - for (i = 0, addr = DBG_REG_CALENDAR_OUT_DATA; - i < STATIC_DEBUG_LINE_DWORDS; - i++, offset++, addr += BYTES_IN_DWORD) - dump_buf[offset] = qed_rd(p_hwfn, p_ptt, - addr); + offset += + qed_grc_dump_addr_range(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + addr, + len); } /* Disable block's client and debug output */ @@ -3311,14 +3582,8 @@ static enum dbg_status qed_grc_dump(struct qed_hwfn *p_hwfn, u8 i, port_mode = 0; u32 offset = 0; - /* Check if emulation platform */ *num_dumped_dwords = 0; - /* Fill GRC parameters that were not set by the user with their default - * value. 
- */ - qed_dbg_grc_set_params_default(p_hwfn); - /* Find port mode */ if (dump) { switch (qed_rd(p_hwfn, p_ptt, MISC_REG_PORT_MODE)) { @@ -3370,15 +3635,14 @@ static enum dbg_status qed_grc_dump(struct qed_hwfn *p_hwfn, } /* Disable all parities using MFW command */ - if (dump) { + if (dump && !qed_grc_get_param(p_hwfn, DBG_GRC_PARAM_NO_MCP)) { parities_masked = !qed_mcp_mask_parities(p_hwfn, p_ptt, 1); if (!parities_masked) { + DP_NOTICE(p_hwfn, + "Failed to mask parities using MFW\n"); if (qed_grc_get_param (p_hwfn, DBG_GRC_PARAM_PARITY_SAFE)) return DBG_STATUS_MCP_COULD_NOT_MASK_PRTY; - else - DP_NOTICE(p_hwfn, - "Failed to mask parities using MFW\n"); } } @@ -3409,6 +3673,11 @@ static enum dbg_status qed_grc_dump(struct qed_hwfn *p_hwfn, offset, dump, block_enable, NULL, NULL); + + /* Dump special registers */ + offset += qed_grc_dump_special_regs(p_hwfn, + p_ptt, + dump_buf + offset, dump); } /* Dump memories */ @@ -3583,9 +3852,9 @@ static u32 qed_idle_chk_dump_failure(struct qed_hwfn *p_hwfn, } if (mode_match) { - u32 grc_addr = - DWORDS_TO_BYTES(GET_FIELD(reg->data, - DBG_IDLE_CHK_INFO_REG_ADDRESS)); + u32 addr = + GET_FIELD(reg->data, + DBG_IDLE_CHK_INFO_REG_ADDRESS); /* Write register header */ struct dbg_idle_chk_result_reg_hdr *reg_hdr = @@ -3597,16 +3866,19 @@ static u32 qed_idle_chk_dump_failure(struct qed_hwfn *p_hwfn, memset(reg_hdr, 0, sizeof(*reg_hdr)); reg_hdr->size = reg->size; SET_FIELD(reg_hdr->data, - DBG_IDLE_CHK_RESULT_REG_HDR_REG_ID, - rule->num_cond_regs + reg_id); + DBG_IDLE_CHK_RESULT_REG_HDR_REG_ID, + rule->num_cond_regs + reg_id); /* Write register values */ - for (i = 0; i < reg->size; - i++, offset++, grc_addr += 4) - dump_buf[offset] = - qed_rd(p_hwfn, p_ptt, grc_addr); - } + offset += + qed_grc_dump_addr_range(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + addr, + reg->size); } + } } return offset; @@ -3621,7 +3893,7 @@ qed_idle_chk_dump_rule_entries(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, { struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; u32 cond_reg_values[IDLE_CHK_MAX_ENTRIES_SIZE]; - u32 i, j, offset = 0; + u32 i, offset = 0; u16 entry_id; u8 reg_id; @@ -3664,73 +3936,83 @@ qed_idle_chk_dump_rule_entries(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, if (!check_rule && dump) continue; + if (!dump) { + u32 entry_dump_size = + qed_idle_chk_dump_failure(p_hwfn, + p_ptt, + dump_buf + offset, + false, + rule->rule_id, + rule, + 0, + NULL); + + offset += num_reg_entries * entry_dump_size; + (*num_failing_rules) += num_reg_entries; + continue; + } + /* Go over all register entries (number of entries is the same * for all condition registers). */ for (entry_id = 0; entry_id < num_reg_entries; entry_id++) { /* Read current entry of all condition registers */ - if (dump) { - u32 next_reg_offset = 0; - - for (reg_id = 0; - reg_id < rule->num_cond_regs; - reg_id++) { - const struct dbg_idle_chk_cond_reg - *reg = &cond_regs[reg_id]; - - /* Find GRC address (if it's a memory, - * the address of the specific entry is - * calculated). - */ - u32 grc_addr = - DWORDS_TO_BYTES( - GET_FIELD(reg->data, - DBG_IDLE_CHK_COND_REG_ADDRESS)); - - if (reg->num_entries > 1 || - reg->start_entry > 0) { - u32 padded_entry_size = - reg->entry_size > 1 ? 
- roundup_pow_of_two - (reg->entry_size) : 1; - - grc_addr += - DWORDS_TO_BYTES( - (reg->start_entry + - entry_id) - * padded_entry_size); - } + u32 next_reg_offset = 0; - /* Read registers */ - if (next_reg_offset + reg->entry_size >= - IDLE_CHK_MAX_ENTRIES_SIZE) { - DP_NOTICE(p_hwfn, - "idle check registers entry is too large\n"); - return 0; - } + for (reg_id = 0; reg_id < rule->num_cond_regs; + reg_id++) { + const struct dbg_idle_chk_cond_reg *reg = + &cond_regs[reg_id]; - for (j = 0; j < reg->entry_size; - j++, next_reg_offset++, - grc_addr += 4) - cond_reg_values[next_reg_offset] = - qed_rd(p_hwfn, p_ptt, grc_addr); + /* Find GRC address (if it's a memory, the + * address of the specific entry is calculated). + */ + u32 addr = + GET_FIELD(reg->data, + DBG_IDLE_CHK_COND_REG_ADDRESS); + + if (reg->num_entries > 1 || + reg->start_entry > 0) { + u32 padded_entry_size = + reg->entry_size > 1 ? + roundup_pow_of_two(reg->entry_size) : + 1; + + addr += (reg->start_entry + entry_id) * + padded_entry_size; } + + /* Read registers */ + if (next_reg_offset + reg->entry_size >= + IDLE_CHK_MAX_ENTRIES_SIZE) { + DP_NOTICE(p_hwfn, + "idle check registers entry is too large\n"); + return 0; + } + + next_reg_offset += + qed_grc_dump_addr_range(p_hwfn, + p_ptt, + cond_reg_values + + next_reg_offset, + dump, addr, + reg->entry_size); } /* Call rule's condition function - a return value of * true indicates failure. */ if ((*cond_arr[rule->cond_id])(cond_reg_values, - imm_values) || !dump) { + imm_values)) { offset += - qed_idle_chk_dump_failure(p_hwfn, - p_ptt, - dump_buf + offset, - dump, - rule->rule_id, - rule, - entry_id, - cond_reg_values); + qed_idle_chk_dump_failure(p_hwfn, - p_ptt, + p_ptt, + dump_buf + offset, + dump, + rule->rule_id, + rule, + entry_id, + cond_reg_values); (*num_failing_rules)++; break; } @@ -3818,13 +4100,18 @@ static enum dbg_status qed_find_nvram_image(struct qed_hwfn *p_hwfn, struct mcp_file_att file_att; /* Call NVRAM get file command */ - if (qed_mcp_nvm_rd_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_NVM_GET_FILE_ATT, - image_type, &ret_mcp_resp, &ret_mcp_param, - &ret_txn_size, (u32 *)&file_att) != 0) - return DBG_STATUS_NVRAM_GET_IMAGE_FAILED; + int nvm_result = qed_mcp_nvm_rd_cmd(p_hwfn, + p_ptt, + DRV_MSG_CODE_NVM_GET_FILE_ATT, + image_type, + &ret_mcp_resp, + &ret_mcp_param, + &ret_txn_size, + (u32 *)&file_att); /* Check response */ - if ((ret_mcp_resp & FW_MSG_CODE_MASK) != FW_MSG_CODE_NVM_OK) + if (nvm_result || + (ret_mcp_resp & FW_MSG_CODE_MASK) != FW_MSG_CODE_NVM_OK) return DBG_STATUS_NVRAM_GET_IMAGE_FAILED; /* Update return values */ @@ -3944,7 +4231,6 @@ static enum dbg_status qed_mcp_trace_get_meta_info(struct qed_hwfn *p_hwfn, u32 running_mfw_addr = MCP_REG_SCRATCH + SECTION_OFFSET(spad_trace_offsize) + QED_SECTION_SIZE(spad_trace_offsize) + trace_data_size_bytes; - enum dbg_status status; u32 nvram_image_type; *running_bundle_id = qed_rd(p_hwfn, p_ptt, running_mfw_addr); @@ -3955,30 +4241,12 @@ ... nvram_image_type = (*running_bundle_id == DIR_ID_1) ? NVM_TYPE_MFW_TRACE1 : NVM_TYPE_MFW_TRACE2; - status = qed_find_nvram_image(p_hwfn, - p_ptt, - nvram_image_type, - trace_meta_offset_bytes, - trace_meta_size_bytes); - - return status; -} - -/* Reads the MCP Trace data from the specified GRC address into the specified - * buffer.
- */ -static void qed_mcp_trace_read_data(struct qed_hwfn *p_hwfn, - struct qed_ptt *p_ptt, - u32 grc_addr, u32 size_in_dwords, u32 *buf) -{ - u32 i; - DP_VERBOSE(p_hwfn, - QED_MSG_DEBUG, - "mcp_trace_read_data: reading trace data of size %d dwords from GRC address 0x%x\n", - size_in_dwords, grc_addr); - for (i = 0; i < size_in_dwords; i++, grc_addr += BYTES_IN_DWORD) - buf[i] = qed_rd(p_hwfn, p_ptt, grc_addr); + return qed_find_nvram_image(p_hwfn, + p_ptt, + nvram_image_type, + trace_meta_offset_bytes, + trace_meta_size_bytes); } /* Reads the MCP Trace meta data (from NVRAM or buffer) into the specified @@ -4034,11 +4302,14 @@ static enum dbg_status qed_mcp_trace_dump(struct qed_hwfn *p_hwfn, bool dump, u32 *num_dumped_dwords) { u32 trace_data_grc_addr, trace_data_size_bytes, trace_data_size_dwords; - u32 trace_meta_size_dwords, running_bundle_id, offset = 0; - u32 trace_meta_offset_bytes, trace_meta_size_bytes; + u32 trace_meta_size_dwords = 0, running_bundle_id, offset = 0; + u32 trace_meta_offset_bytes = 0, trace_meta_size_bytes = 0; enum dbg_status status; + bool mcp_access; int halted = 0; + mcp_access = !qed_grc_get_param(p_hwfn, DBG_GRC_PARAM_NO_MCP); + *num_dumped_dwords = 0; /* Get trace data info */ @@ -4060,7 +4331,7 @@ static enum dbg_status qed_mcp_trace_dump(struct qed_hwfn *p_hwfn, * consistent if halt fails, MCP trace is taken anyway, with a small * risk that it may be corrupt. */ - if (dump) { + if (dump && mcp_access) { halted = !qed_mcp_halt(p_hwfn, p_ptt); if (!halted) DP_NOTICE(p_hwfn, "MCP halt failed!\n"); @@ -4078,13 +4349,12 @@ static enum dbg_status qed_mcp_trace_dump(struct qed_hwfn *p_hwfn, dump, "size", trace_data_size_dwords); /* Read trace data from scratchpad into dump buffer */ - if (dump) - qed_mcp_trace_read_data(p_hwfn, - p_ptt, - trace_data_grc_addr, - trace_data_size_dwords, - dump_buf + offset); - offset += trace_data_size_dwords; + offset += qed_grc_dump_addr_range(p_hwfn, + p_ptt, + dump_buf + offset, + dump, + BYTES_TO_DWORDS(trace_data_grc_addr), + trace_data_size_dwords); /* Resume MCP (only if halt succeeded) */ if (halted && qed_mcp_resume(p_hwfn, p_ptt) != 0) @@ -4095,38 +4365,38 @@ static enum dbg_status qed_mcp_trace_dump(struct qed_hwfn *p_hwfn, dump, "mcp_trace_meta", 1); /* Read trace meta info */ - status = qed_mcp_trace_get_meta_info(p_hwfn, - p_ptt, - trace_data_size_bytes, - &running_bundle_id, - &trace_meta_offset_bytes, - &trace_meta_size_bytes); - if (status != DBG_STATUS_OK) - return status; + if (mcp_access) { + status = qed_mcp_trace_get_meta_info(p_hwfn, + p_ptt, + trace_data_size_bytes, + &running_bundle_id, + &trace_meta_offset_bytes, + &trace_meta_size_bytes); + if (status == DBG_STATUS_OK) + trace_meta_size_dwords = + BYTES_TO_DWORDS(trace_meta_size_bytes); + } - /* Dump trace meta size param (trace_meta_size_bytes is always - * dword-aligned). 
- */ - trace_meta_size_dwords = BYTES_TO_DWORDS(trace_meta_size_bytes); - offset += qed_dump_num_param(dump_buf + offset, dump, "size", - trace_meta_size_dwords); + /* Dump trace meta size param */ + offset += qed_dump_num_param(dump_buf + offset, + dump, "size", trace_meta_size_dwords); /* Read trace meta image into dump buffer */ - if (dump) { + if (dump && trace_meta_size_dwords) status = qed_mcp_trace_read_meta(p_hwfn, - p_ptt, - trace_meta_offset_bytes, - trace_meta_size_bytes, - dump_buf + offset); - if (status != DBG_STATUS_OK) - return status; - } - - offset += trace_meta_size_dwords; + p_ptt, + trace_meta_offset_bytes, + trace_meta_size_bytes, + dump_buf + offset); + if (status == DBG_STATUS_OK) + offset += trace_meta_size_dwords; *num_dumped_dwords = offset; - return DBG_STATUS_OK; + /* If no mcp access, indicate that the dump doesn't contain the meta + * data from NVRAM. + */ + return mcp_access ? status : DBG_STATUS_NVRAM_GET_IMAGE_FAILED; } /* Dump GRC FIFO */ @@ -4311,9 +4581,10 @@ static u32 qed_fw_asserts_dump(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, u32 *dump_buf, bool dump) { struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + struct fw_asserts_ram_section *asserts; char storm_letter_str[2] = "?"; struct fw_info fw_info; - u32 offset = 0, i; + u32 offset = 0; u8 storm_id; /* Dump global params */ @@ -4323,8 +4594,8 @@ static u32 qed_fw_asserts_dump(struct qed_hwfn *p_hwfn, offset += qed_dump_str_param(dump_buf + offset, dump, "dump-type", "fw-asserts"); for (storm_id = 0; storm_id < MAX_DBG_STORMS; storm_id++) { - u32 fw_asserts_section_addr, next_list_idx_addr, next_list_idx, - last_list_idx, element_addr; + u32 fw_asserts_section_addr, next_list_idx_addr, next_list_idx; + u32 last_list_idx, addr; if (dev_data->block_in_reset[s_storm_defs[storm_id].block_id]) continue; @@ -4332,6 +4603,8 @@ static u32 qed_fw_asserts_dump(struct qed_hwfn *p_hwfn, /* Read FW info for the current Storm */ qed_read_fw_info(p_hwfn, p_ptt, storm_id, &fw_info); + asserts = &fw_info.fw_asserts_section; + /* Dump FW Asserts section header and params */ storm_letter_str[0] = s_storm_defs[storm_id].letter; offset += qed_dump_section_hdr(dump_buf + offset, dump, @@ -4339,12 +4612,10 @@ static u32 qed_fw_asserts_dump(struct qed_hwfn *p_hwfn, offset += qed_dump_str_param(dump_buf + offset, dump, "storm", storm_letter_str); offset += qed_dump_num_param(dump_buf + offset, dump, "size", - fw_info.fw_asserts_section. - list_element_dword_size); + asserts->list_element_dword_size); if (!dump) { - offset += fw_info.fw_asserts_section. - list_element_dword_size; + offset += asserts->list_element_dword_size; continue; } @@ -4352,28 +4623,22 @@ static u32 qed_fw_asserts_dump(struct qed_hwfn *p_hwfn, fw_asserts_section_addr = s_storm_defs[storm_id].sem_fast_mem_addr + SEM_FAST_REG_INT_RAM + - RAM_LINES_TO_BYTES(fw_info.fw_asserts_section. - section_ram_line_offset); + RAM_LINES_TO_BYTES(asserts->section_ram_line_offset); next_list_idx_addr = fw_asserts_section_addr + - DWORDS_TO_BYTES(fw_info.fw_asserts_section. - list_next_index_dword_offset); + DWORDS_TO_BYTES(asserts->list_next_index_dword_offset); next_list_idx = qed_rd(p_hwfn, p_ptt, next_list_idx_addr); last_list_idx = (next_list_idx > 0 ? next_list_idx - : fw_info.fw_asserts_section.list_num_elements) - - 1; - element_addr = - fw_asserts_section_addr + - DWORDS_TO_BYTES(fw_info.fw_asserts_section. - list_dword_offset) + - last_list_idx * - DWORDS_TO_BYTES(fw_info.fw_asserts_section. 
- list_element_dword_size); - for (i = 0; - i < fw_info.fw_asserts_section.list_element_dword_size; - i++, offset++, element_addr += BYTES_IN_DWORD) - dump_buf[offset] = qed_rd(p_hwfn, p_ptt, element_addr); + : asserts->list_num_elements) - 1; + addr = BYTES_TO_DWORDS(fw_asserts_section_addr) + + asserts->list_dword_offset + + last_list_idx * asserts->list_element_dword_size; + offset += + qed_grc_dump_addr_range(p_hwfn, p_ptt, + dump_buf + offset, + dump, addr, + asserts->list_element_dword_size); } /* Dump last section */ @@ -4386,13 +4651,10 @@ static u32 qed_fw_asserts_dump(struct qed_hwfn *p_hwfn, enum dbg_status qed_dbg_set_bin_ptr(const u8 * const bin_ptr) { /* Convert binary data to debug arrays */ - u32 num_of_buffers = *(u32 *)bin_ptr; - struct bin_buffer_hdr *buf_array; + struct bin_buffer_hdr *buf_array = (struct bin_buffer_hdr *)bin_ptr; u8 buf_id; - buf_array = (struct bin_buffer_hdr *)((u32 *)bin_ptr + 1); - - for (buf_id = 0; buf_id < num_of_buffers; buf_id++) { + for (buf_id = 0; buf_id < MAX_BIN_DBG_BUFFER_TYPE; buf_id++) { s_dbg_arrays[buf_id].ptr = (u32 *)(bin_ptr + buf_array[buf_id].offset); s_dbg_arrays[buf_id].size_in_dwords = @@ -4402,6 +4664,17 @@ enum dbg_status qed_dbg_set_bin_ptr(const u8 * const bin_ptr) return DBG_STATUS_OK; } +/* Assign default GRC param values */ +void qed_dbg_grc_set_params_default(struct qed_hwfn *p_hwfn) +{ + struct dbg_tools_data *dev_data = &p_hwfn->dbg_info; + u32 i; + + for (i = 0; i < MAX_DBG_GRC_PARAMS; i++) + dev_data->grc.param_val[i] = + s_grc_param_defs[i].default_val[dev_data->chip_id]; +} + enum dbg_status qed_dbg_grc_get_dump_buf_size(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, u32 *buf_size) @@ -4441,8 +4714,9 @@ enum dbg_status qed_dbg_grc_dump(struct qed_hwfn *p_hwfn, /* GRC Dump */ status = qed_grc_dump(p_hwfn, p_ptt, dump_buf, true, num_dumped_dwords); - /* Clear all GRC params */ - qed_dbg_grc_clear_params(p_hwfn); + /* Revert GRC params to their default */ + qed_dbg_grc_set_params_default(p_hwfn); + return status; } @@ -4495,6 +4769,10 @@ enum dbg_status qed_dbg_idle_chk_dump(struct qed_hwfn *p_hwfn, /* Idle Check Dump */ *num_dumped_dwords = qed_idle_chk_dump(p_hwfn, p_ptt, dump_buf, true); + + /* Revert GRC params to their default */ + qed_dbg_grc_set_params_default(p_hwfn); + return DBG_STATUS_OK; } @@ -4519,11 +4797,15 @@ enum dbg_status qed_dbg_mcp_trace_dump(struct qed_hwfn *p_hwfn, u32 needed_buf_size_in_dwords; enum dbg_status status; - status = qed_dbg_mcp_trace_get_dump_buf_size(p_hwfn, p_ptt, + /* validate buffer size */ + status = + qed_dbg_mcp_trace_get_dump_buf_size(p_hwfn, p_ptt, &needed_buf_size_in_dwords); - if (status != DBG_STATUS_OK) + if (status != DBG_STATUS_OK && + status != DBG_STATUS_NVRAM_GET_IMAGE_FAILED) return status; + if (buf_size_in_dwords < needed_buf_size_in_dwords) return DBG_STATUS_DUMP_BUF_TOO_SMALL; @@ -4531,8 +4813,13 @@ enum dbg_status qed_dbg_mcp_trace_dump(struct qed_hwfn *p_hwfn, qed_update_blocks_reset_state(p_hwfn, p_ptt); /* Perform dump */ - return qed_mcp_trace_dump(p_hwfn, - p_ptt, dump_buf, true, num_dumped_dwords); + status = qed_mcp_trace_dump(p_hwfn, + p_ptt, dump_buf, true, num_dumped_dwords); + + /* Revert GRC params to their default */ + qed_dbg_grc_set_params_default(p_hwfn); + + return status; } enum dbg_status qed_dbg_reg_fifo_get_dump_buf_size(struct qed_hwfn *p_hwfn, @@ -4567,8 +4854,14 @@ enum dbg_status qed_dbg_reg_fifo_dump(struct qed_hwfn *p_hwfn, /* Update reset state */ qed_update_blocks_reset_state(p_hwfn, p_ptt); - return qed_reg_fifo_dump(p_hwfn, 
- p_ptt, dump_buf, true, num_dumped_dwords); + + status = qed_reg_fifo_dump(p_hwfn, + p_ptt, dump_buf, true, num_dumped_dwords); + + /* Revert GRC params to their default */ + qed_dbg_grc_set_params_default(p_hwfn); + + return status; } enum dbg_status qed_dbg_igu_fifo_get_dump_buf_size(struct qed_hwfn *p_hwfn, @@ -4603,8 +4896,13 @@ enum dbg_status qed_dbg_igu_fifo_dump(struct qed_hwfn *p_hwfn, /* Update reset state */ qed_update_blocks_reset_state(p_hwfn, p_ptt); - return qed_igu_fifo_dump(p_hwfn, - p_ptt, dump_buf, true, num_dumped_dwords); + + status = qed_igu_fifo_dump(p_hwfn, + p_ptt, dump_buf, true, num_dumped_dwords); + /* Revert GRC params to their default */ + qed_dbg_grc_set_params_default(p_hwfn); + + return status; } enum dbg_status @@ -4641,9 +4939,16 @@ enum dbg_status qed_dbg_protection_override_dump(struct qed_hwfn *p_hwfn, /* Update reset state */ qed_update_blocks_reset_state(p_hwfn, p_ptt); - return qed_protection_override_dump(p_hwfn, - p_ptt, - dump_buf, true, num_dumped_dwords); + + status = qed_protection_override_dump(p_hwfn, + p_ptt, + dump_buf, + true, num_dumped_dwords); + + /* Revert GRC params to their default */ + qed_dbg_grc_set_params_default(p_hwfn); + + return status; } enum dbg_status qed_dbg_fw_asserts_get_dump_buf_size(struct qed_hwfn *p_hwfn, @@ -5045,13 +5350,10 @@ static char s_temp_buf[MAX_MSG_LEN]; enum dbg_status qed_dbg_user_set_bin_ptr(const u8 * const bin_ptr) { /* Convert binary data to debug arrays */ - u32 num_of_buffers = *(u32 *)bin_ptr; - struct bin_buffer_hdr *buf_array; + struct bin_buffer_hdr *buf_array = (struct bin_buffer_hdr *)bin_ptr; u8 buf_id; - buf_array = (struct bin_buffer_hdr *)((u32 *)bin_ptr + 1); - - for (buf_id = 0; buf_id < num_of_buffers; buf_id++) { + for (buf_id = 0; buf_id < MAX_BIN_DBG_BUFFER_TYPE; buf_id++) { s_dbg_arrays[buf_id].ptr = (u32 *)(bin_ptr + buf_array[buf_id].offset); s_dbg_arrays[buf_id].size_in_dwords = @@ -5874,16 +6176,16 @@ static enum dbg_status qed_parse_reg_fifo_dump(struct qed_hwfn *p_hwfn, results_offset += sprintf(qed_get_buf_ptr(results_buf, results_offset), - "raw: 0x%016llx, address: 0x%07llx, access: %-5s, pf: %2lld, vf: %s, port: %lld, privilege: %-3s, protection: %-12s, master: %-4s, errors: ", + "raw: 0x%016llx, address: 0x%07x, access: %-5s, pf: %2d, vf: %s, port: %d, privilege: %-3s, protection: %-12s, master: %-4s, errors: ", elements[i].data, - GET_FIELD(elements[i].data, + (u32)GET_FIELD(elements[i].data, REG_FIFO_ELEMENT_ADDRESS) * REG_FIFO_ELEMENT_ADDR_FACTOR, s_access_strs[GET_FIELD(elements[i].data, REG_FIFO_ELEMENT_ACCESS)], - GET_FIELD(elements[i].data, - REG_FIFO_ELEMENT_PF), vf_str, - GET_FIELD(elements[i].data, + (u32)GET_FIELD(elements[i].data, + REG_FIFO_ELEMENT_PF), vf_str, + (u32)GET_FIELD(elements[i].data, REG_FIFO_ELEMENT_PORT), s_privilege_strs[GET_FIELD(elements[i]. 
data, @@ -6189,13 +6491,13 @@ qed_parse_protection_override_dump(struct qed_hwfn *p_hwfn, results_offset += sprintf(qed_get_buf_ptr(results_buf, results_offset), - "window %2d, address: 0x%07x, size: %7lld regs, read: %lld, write: %lld, read protection: %-12s, write protection: %-12s\n", + "window %2d, address: 0x%07x, size: %7d regs, read: %d, write: %d, read protection: %-12s, write protection: %-12s\n", i, address, - GET_FIELD(elements[i].data, + (u32)GET_FIELD(elements[i].data, PROTECTION_OVERRIDE_ELEMENT_WINDOW_SIZE), - GET_FIELD(elements[i].data, + (u32)GET_FIELD(elements[i].data, PROTECTION_OVERRIDE_ELEMENT_READ), - GET_FIELD(elements[i].data, + (u32)GET_FIELD(elements[i].data, PROTECTION_OVERRIDE_ELEMENT_WRITE), s_protection_strs[GET_FIELD(elements[i].data, PROTECTION_OVERRIDE_ELEMENT_READ_PROTECTION)], @@ -6508,7 +6810,7 @@ static enum dbg_status qed_dbg_dump(struct qed_hwfn *p_hwfn, */ rc = qed_features_lookup[feature_idx].get_size(p_hwfn, p_ptt, &buf_size_dwords); - if (rc != DBG_STATUS_OK) + if (rc != DBG_STATUS_OK && rc != DBG_STATUS_NVRAM_GET_IMAGE_FAILED) return rc; feature->buf_size = buf_size_dwords * sizeof(u32); feature->dump_buf = vmalloc(feature->buf_size); diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h index 37c2bfb663bb..88ca78a297a0 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h +++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h @@ -574,6 +574,7 @@ enum core_event_opcode { CORE_EVENT_TX_QUEUE_STOP, CORE_EVENT_RX_QUEUE_START, CORE_EVENT_RX_QUEUE_STOP, + CORE_EVENT_RX_QUEUE_FLUSH, MAX_CORE_EVENT_OPCODE }; @@ -625,6 +626,7 @@ enum core_ramrod_cmd_id { CORE_RAMROD_TX_QUEUE_START, CORE_RAMROD_RX_QUEUE_STOP, CORE_RAMROD_TX_QUEUE_STOP, + CORE_RAMROD_RX_QUEUE_FLUSH, MAX_CORE_RAMROD_CMD_ID }; @@ -698,7 +700,8 @@ struct core_rx_slow_path_cqe { u8 type; u8 ramrod_cmd_id; __le16 echo; - __le32 reserved1[7]; + struct core_rx_cqe_opaque_data opaque_data; + __le32 reserved1[5]; }; union core_rx_cqe_union { @@ -735,45 +738,46 @@ struct core_rx_stop_ramrod_data { __le16 reserved2[2]; }; -struct core_tx_bd_flags { - u8 as_bitfield; -#define CORE_TX_BD_FLAGS_FORCE_VLAN_MODE_MASK 0x1 -#define CORE_TX_BD_FLAGS_FORCE_VLAN_MODE_SHIFT 0 -#define CORE_TX_BD_FLAGS_VLAN_INSERTION_MASK 0x1 -#define CORE_TX_BD_FLAGS_VLAN_INSERTION_SHIFT 1 -#define CORE_TX_BD_FLAGS_START_BD_MASK 0x1 -#define CORE_TX_BD_FLAGS_START_BD_SHIFT 2 -#define CORE_TX_BD_FLAGS_IP_CSUM_MASK 0x1 -#define CORE_TX_BD_FLAGS_IP_CSUM_SHIFT 3 -#define CORE_TX_BD_FLAGS_L4_CSUM_MASK 0x1 -#define CORE_TX_BD_FLAGS_L4_CSUM_SHIFT 4 -#define CORE_TX_BD_FLAGS_IPV6_EXT_MASK 0x1 -#define CORE_TX_BD_FLAGS_IPV6_EXT_SHIFT 5 -#define CORE_TX_BD_FLAGS_L4_PROTOCOL_MASK 0x1 -#define CORE_TX_BD_FLAGS_L4_PROTOCOL_SHIFT 6 -#define CORE_TX_BD_FLAGS_L4_PSEUDO_CSUM_MODE_MASK 0x1 -#define CORE_TX_BD_FLAGS_L4_PSEUDO_CSUM_MODE_SHIFT 7 +struct core_tx_bd_data { + __le16 as_bitfield; +#define CORE_TX_BD_DATA_FORCE_VLAN_MODE_MASK 0x1 +#define CORE_TX_BD_DATA_FORCE_VLAN_MODE_SHIFT 0 +#define CORE_TX_BD_DATA_VLAN_INSERTION_MASK 0x1 +#define CORE_TX_BD_DATA_VLAN_INSERTION_SHIFT 1 +#define CORE_TX_BD_DATA_START_BD_MASK 0x1 +#define CORE_TX_BD_DATA_START_BD_SHIFT 2 +#define CORE_TX_BD_DATA_IP_CSUM_MASK 0x1 +#define CORE_TX_BD_DATA_IP_CSUM_SHIFT 3 +#define CORE_TX_BD_DATA_L4_CSUM_MASK 0x1 +#define CORE_TX_BD_DATA_L4_CSUM_SHIFT 4 +#define CORE_TX_BD_DATA_IPV6_EXT_MASK 0x1 +#define CORE_TX_BD_DATA_IPV6_EXT_SHIFT 5 +#define CORE_TX_BD_DATA_L4_PROTOCOL_MASK 0x1 +#define CORE_TX_BD_DATA_L4_PROTOCOL_SHIFT 6 
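
An aside before the remaining CORE_TX_BD_DATA_* pairs, which continue right below: these mask/shift pairs follow the accessor convention used throughout the qed HSI headers. Fields are never poked directly; call sites go through token-pasting SET_FIELD()/GET_FIELD() helpers that combine a value with its name##_MASK and name##_SHIFT. The self-contained sketch here is modeled on the driver's helpers rather than copied from them, and reuses two of the pairs from this structure:

/* Sketch of the MASK/SHIFT accessor convention used by these headers.
 * SET_FIELD/GET_FIELD are modeled on the qed driver's helpers.
 */
#include <stdint.h>
#include <stdio.h>

#define CORE_TX_BD_DATA_START_BD_MASK	0x1
#define CORE_TX_BD_DATA_START_BD_SHIFT	2
#define CORE_TX_BD_DATA_NBDS_MASK	0xF
#define CORE_TX_BD_DATA_NBDS_SHIFT	8

/* name##_MASK and name##_SHIFT are pasted from the field name */
#define SET_FIELD(value, name, flag)					\
	do {								\
		(value) &= (uint16_t)~(name##_MASK << name##_SHIFT);	\
		(value) |= (uint16_t)(((flag) & name##_MASK)		\
				      << name##_SHIFT);			\
	} while (0)

#define GET_FIELD(value, name) (((value) >> name##_SHIFT) & name##_MASK)

int main(void)
{
	uint16_t bd_data = 0;

	SET_FIELD(bd_data, CORE_TX_BD_DATA_START_BD, 1);
	SET_FIELD(bd_data, CORE_TX_BD_DATA_NBDS, 3);

	/* prints: bd_data=0x0304 start_bd=1 nbds=3 */
	printf("bd_data=0x%04x start_bd=%u nbds=%u\n", bd_data,
	       GET_FIELD(bd_data, CORE_TX_BD_DATA_START_BD),
	       GET_FIELD(bd_data, CORE_TX_BD_DATA_NBDS));
	return 0;
}

Keeping the masks and shifts centralized next to the struct is what lets this patch widen the bitfield from a u8 (bd_flags) to a __le16 (bd_data) and add NBDS/ROCE_FLAV bits: the qed_ll2.c hunks later in this patch only rename fields, while the bit arithmetic stays in one place.
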
+#define CORE_TX_BD_DATA_L4_PSEUDO_CSUM_MODE_MASK 0x1 +#define CORE_TX_BD_DATA_L4_PSEUDO_CSUM_MODE_SHIFT 7 +#define CORE_TX_BD_DATA_NBDS_MASK 0xF +#define CORE_TX_BD_DATA_NBDS_SHIFT 8 +#define CORE_TX_BD_DATA_ROCE_FLAV_MASK 0x1 +#define CORE_TX_BD_DATA_ROCE_FLAV_SHIFT 12 +#define CORE_TX_BD_DATA_IP_LEN_MASK 0x1 +#define CORE_TX_BD_DATA_IP_LEN_SHIFT 13 +#define CORE_TX_BD_DATA_RESERVED0_MASK 0x3 +#define CORE_TX_BD_DATA_RESERVED0_SHIFT 14 }; struct core_tx_bd { struct regpair addr; __le16 nbytes; __le16 nw_vlan_or_lb_echo; - u8 bitfield0; -#define CORE_TX_BD_NBDS_MASK 0xF -#define CORE_TX_BD_NBDS_SHIFT 0 -#define CORE_TX_BD_ROCE_FLAV_MASK 0x1 -#define CORE_TX_BD_ROCE_FLAV_SHIFT 4 -#define CORE_TX_BD_RESERVED0_MASK 0x7 -#define CORE_TX_BD_RESERVED0_SHIFT 5 - struct core_tx_bd_flags bd_flags; + struct core_tx_bd_data bd_data; __le16 bitfield1; #define CORE_TX_BD_L4_HDR_OFFSET_W_MASK 0x3FFF #define CORE_TX_BD_L4_HDR_OFFSET_W_SHIFT 0 #define CORE_TX_BD_TX_DST_MASK 0x1 #define CORE_TX_BD_TX_DST_SHIFT 14 -#define CORE_TX_BD_RESERVED1_MASK 0x1 -#define CORE_TX_BD_RESERVED1_SHIFT 15 +#define CORE_TX_BD_RESERVED_MASK 0x1 +#define CORE_TX_BD_RESERVED_SHIFT 15 }; enum core_tx_dest { @@ -800,6 +804,14 @@ struct core_tx_stop_ramrod_data { __le32 reserved0[2]; }; +enum dcb_dhcp_update_flag { + DONT_UPDATE_DCB_DHCP, + UPDATE_DCB, + UPDATE_DSCP, + UPDATE_DCB_DSCP, + MAX_DCB_DHCP_UPDATE_FLAG +}; + struct eth_mstorm_per_pf_stat { struct regpair gre_discard_pkts; struct regpair vxlan_discard_pkts; @@ -893,6 +905,12 @@ union event_ring_element { struct event_ring_next_addr next_addr; }; +enum fw_flow_ctrl_mode { + flow_ctrl_pause, + flow_ctrl_pfc, + MAX_FW_FLOW_CTRL_MODE +}; + /* Major and Minor hsi Versions */ struct hsi_fp_ver_struct { u8 minor_ver_arr[2]; @@ -921,6 +939,7 @@ enum malicious_vf_error_id { ETH_EDPM_OUT_OF_SYNC, ETH_TUNN_IPV6_EXT_NBD_ERR, ETH_CONTROL_PACKET_VIOLATION, + ETH_ANTI_SPOOFING_ERR, MAX_MALICIOUS_VF_ERROR_ID }; @@ -1106,8 +1125,9 @@ struct tstorm_per_port_stat { struct regpair ll2_mac_filter_discard; struct regpair ll2_conn_disabled_discard; struct regpair iscsi_irregular_pkt; - struct regpair reserved; + struct regpair fcoe_irregular_pkt; struct regpair roce_irregular_pkt; + struct regpair reserved; struct regpair eth_irregular_pkt; struct regpair reserved1; struct regpair preroce_irregular_pkt; @@ -1648,6 +1668,11 @@ enum block_addr { GRCBASE_MS = 0x6a0000, GRCBASE_PHY_PCIE = 0x620000, GRCBASE_LED = 0x6b8000, + GRCBASE_AVS_WRAP = 0x6b0000, + GRCBASE_RGFS = 0x19d0000, + GRCBASE_TGFS = 0x19e0000, + GRCBASE_PTLD = 0x19f0000, + GRCBASE_YPLD = 0x1a10000, GRCBASE_MISC_AEU = 0x8000, GRCBASE_BAR0_MAP = 0x1c00000, MAX_BLOCK_ADDR @@ -1732,6 +1757,11 @@ enum block_id { BLOCK_MS, BLOCK_PHY_PCIE, BLOCK_LED, + BLOCK_AVS_WRAP, + BLOCK_RGFS, + BLOCK_TGFS, + BLOCK_PTLD, + BLOCK_YPLD, BLOCK_MISC_AEU, BLOCK_BAR0_MAP, MAX_BLOCK_ID @@ -1783,9 +1813,9 @@ struct dbg_attn_reg_result { __le32 data; #define DBG_ATTN_REG_RESULT_STS_ADDRESS_MASK 0xFFFFFF #define DBG_ATTN_REG_RESULT_STS_ADDRESS_SHIFT 0 -#define DBG_ATTN_REG_RESULT_NUM_ATTN_IDX_MASK 0xFF -#define DBG_ATTN_REG_RESULT_NUM_ATTN_IDX_SHIFT 24 - __le16 attn_idx_offset; +#define DBG_ATTN_REG_RESULT_NUM_REG_ATTN_MASK 0xFF +#define DBG_ATTN_REG_RESULT_NUM_REG_ATTN_SHIFT 24 + __le16 block_attn_offset; __le16 reserved; __le32 sts_val; __le32 mask_val; @@ -1815,12 +1845,12 @@ struct dbg_mode_hdr { /* Attention register */ struct dbg_attn_reg { struct dbg_mode_hdr mode; - __le16 attn_idx_offset; + __le16 block_attn_offset; __le32 data; #define 
DBG_ATTN_REG_STS_ADDRESS_MASK 0xFFFFFF #define DBG_ATTN_REG_STS_ADDRESS_SHIFT 0 -#define DBG_ATTN_REG_NUM_ATTN_IDX_MASK 0xFF -#define DBG_ATTN_REG_NUM_ATTN_IDX_SHIFT 24 +#define DBG_ATTN_REG_NUM_REG_ATTN_MASK 0xFF +#define DBG_ATTN_REG_NUM_REG_ATTN_SHIFT 24 __le32 sts_clr_address; __le32 mask_address; }; @@ -2001,6 +2031,20 @@ enum dbg_bus_clients { MAX_DBG_BUS_CLIENTS }; +enum dbg_bus_constraint_ops { + DBG_BUS_CONSTRAINT_OP_EQ, + DBG_BUS_CONSTRAINT_OP_NE, + DBG_BUS_CONSTRAINT_OP_LT, + DBG_BUS_CONSTRAINT_OP_LTC, + DBG_BUS_CONSTRAINT_OP_LE, + DBG_BUS_CONSTRAINT_OP_LEC, + DBG_BUS_CONSTRAINT_OP_GT, + DBG_BUS_CONSTRAINT_OP_GTC, + DBG_BUS_CONSTRAINT_OP_GE, + DBG_BUS_CONSTRAINT_OP_GEC, + MAX_DBG_BUS_CONSTRAINT_OPS +}; + /* Debug Bus memory address */ struct dbg_bus_mem_addr { __le32 lo; @@ -2092,10 +2136,18 @@ struct dbg_bus_data { * DBG_BUS_TARGET_ID_PCI. */ __le16 reserved; - struct dbg_bus_block_data blocks[80];/* Debug Bus data for each block */ + struct dbg_bus_block_data blocks[88];/* Debug Bus data for each block */ struct dbg_bus_storm_data storms[6]; /* Debug Bus data for each block */ }; +enum dbg_bus_filter_types { + DBG_BUS_FILTER_TYPE_OFF, + DBG_BUS_FILTER_TYPE_PRE, + DBG_BUS_FILTER_TYPE_POST, + DBG_BUS_FILTER_TYPE_ON, + MAX_DBG_BUS_FILTER_TYPES +}; + /* Debug bus frame modes */ enum dbg_bus_frame_modes { DBG_BUS_FRAME_MODE_0HW_4ST = 0, /* 0 HW dwords, 4 Storm dwords */ @@ -2104,6 +2156,40 @@ enum dbg_bus_frame_modes { MAX_DBG_BUS_FRAME_MODES }; +enum dbg_bus_input_types { + DBG_BUS_INPUT_TYPE_STORM, + DBG_BUS_INPUT_TYPE_BLOCK, + MAX_DBG_BUS_INPUT_TYPES +}; + +enum dbg_bus_other_engine_modes { + DBG_BUS_OTHER_ENGINE_MODE_NONE, + DBG_BUS_OTHER_ENGINE_MODE_DOUBLE_BW_TX, + DBG_BUS_OTHER_ENGINE_MODE_DOUBLE_BW_RX, + DBG_BUS_OTHER_ENGINE_MODE_CROSS_ENGINE_TX, + DBG_BUS_OTHER_ENGINE_MODE_CROSS_ENGINE_RX, + MAX_DBG_BUS_OTHER_ENGINE_MODES +}; + +enum dbg_bus_post_trigger_types { + DBG_BUS_POST_TRIGGER_RECORD, + DBG_BUS_POST_TRIGGER_DROP, + MAX_DBG_BUS_POST_TRIGGER_TYPES +}; + +enum dbg_bus_pre_trigger_types { + DBG_BUS_PRE_TRIGGER_START_FROM_ZERO, + DBG_BUS_PRE_TRIGGER_NUM_CHUNKS, + DBG_BUS_PRE_TRIGGER_DROP, + MAX_DBG_BUS_PRE_TRIGGER_TYPES +}; + +enum dbg_bus_semi_frame_modes { + DBG_BUS_SEMI_FRAME_MODE_0SLOW_4FAST = 0, + DBG_BUS_SEMI_FRAME_MODE_4SLOW_0FAST = 3, + MAX_DBG_BUS_SEMI_FRAME_MODES +}; + /* Debug bus states */ enum dbg_bus_states { DBG_BUS_STATE_IDLE, /* debug bus idle state (not recording) */ @@ -2115,6 +2201,19 @@ enum dbg_bus_states { MAX_DBG_BUS_STATES }; +enum dbg_bus_storm_modes { + DBG_BUS_STORM_MODE_PRINTF, + DBG_BUS_STORM_MODE_PRAM_ADDR, + DBG_BUS_STORM_MODE_DRA_RW, + DBG_BUS_STORM_MODE_DRA_W, + DBG_BUS_STORM_MODE_LD_ST_ADDR, + DBG_BUS_STORM_MODE_DRA_FSM, + DBG_BUS_STORM_MODE_RH, + DBG_BUS_STORM_MODE_FOC, + DBG_BUS_STORM_MODE_EXT_STORE, + MAX_DBG_BUS_STORM_MODES +}; + /* Debug bus target IDs */ enum dbg_bus_targets { /* records debug bus to DBG block internal buffer */ @@ -2128,13 +2227,10 @@ enum dbg_bus_targets { /* GRC Dump data */ struct dbg_grc_data { - __le32 param_val[40]; /* Value of each GRC parameter. Array size must - * match the enum dbg_grc_params. - */ - u8 param_set_by_user[40]; /* Indicates for each GRC parameter if it was - * set by the user (0/1). Array size must - * match the enum dbg_grc_params. 
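
The hunk resumes below with the replacement layout: the per-parameter param_set_by_user[] array is dropped in favor of a single params_initialized flag plus a wider param_val[48] array, which qed_dbg_grc_set_params_default() (added earlier in this patch) refills from the per-chip s_grc_param_defs[i].default_val[chip_id] table after every dump. A toy model of that revert-to-defaults pattern — the parameter/chip counts and table values here are hypothetical, since s_grc_param_defs' contents are not shown in this diff:

#include <stdint.h>

/* Hypothetical sizes; the driver derives these from its own enums. */
#define NUM_PARAMS	4
#define NUM_CHIPS	2

struct grc_param_def {
	uint32_t default_val[NUM_CHIPS];	/* one default per chip */
};

static const struct grc_param_def param_defs[NUM_PARAMS] = {
	{ { 1, 1 } },	/* same default on both chips */
	{ { 0, 1 } },	/* chip-dependent default */
	{ { 4, 8 } },
	{ { 0, 0 } },
};

struct grc_data {
	uint8_t params_initialized;
	uint32_t param_val[NUM_PARAMS];
};

/* Mirrors the shape of qed_dbg_grc_set_params_default() */
static void grc_set_params_default(struct grc_data *grc, uint8_t chip_id)
{
	uint32_t i;

	for (i = 0; i < NUM_PARAMS; i++)
		grc->param_val[i] = param_defs[i].default_val[chip_id];
	grc->params_initialized = 1;	/* assumption: flag set here */
}

Reverting to per-chip defaults after each dump, instead of the old qed_dbg_grc_clear_params(), means a user-tuned parameter influences exactly one dump and every entry point starts from a known state.
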
- */ + u8 params_initialized; + u8 reserved1; + __le16 reserved2; + __le32 param_val[48]; }; /* Debug GRC params */ @@ -2181,6 +2277,8 @@ enum dbg_grc_params { DBG_GRC_PARAM_PARITY_SAFE, DBG_GRC_PARAM_DUMP_CM, /* dump CM memories (0/1) */ DBG_GRC_PARAM_DUMP_PHY, /* dump PHY memories (0/1) */ + DBG_GRC_PARAM_NO_MCP, + DBG_GRC_PARAM_NO_FW_VER, MAX_DBG_GRC_PARAMS }; @@ -2280,7 +2378,7 @@ struct dbg_tools_data { struct dbg_bus_data bus; /* Debug Bus data */ struct idle_chk_data idle_chk; /* Idle Check data */ u8 mode_enable[40]; /* Indicates if a mode is enabled (0/1) */ - u8 block_in_reset[80]; /* Indicates if a block is in reset state (0/1). + u8 block_in_reset[88]; /* Indicates if a block is in reset state (0/1). */ u8 chip_id; /* Chip ID (from enum chip_ids) */ u8 platform_id; /* Platform ID (from enum platform_ids) */ @@ -2418,7 +2516,6 @@ enum init_modes { MODE_PORTS_PER_ENG_2, MODE_PORTS_PER_ENG_4, MODE_100G, - MODE_40G, MODE_RESERVED6, MAX_INIT_MODES }; @@ -2685,6 +2782,13 @@ struct iro { * @param bin_ptr - a pointer to the binary data with debug arrays. */ enum dbg_status qed_dbg_set_bin_ptr(const u8 * const bin_ptr); +/** + * @brief qed_dbg_grc_set_params_default - Reverts all GRC parameters to their + * default value. + * + * @param p_hwfn - HW device data + */ +void qed_dbg_grc_set_params_default(struct qed_hwfn *p_hwfn); /** * @brief qed_dbg_grc_get_dump_buf_size - Returns the required buffer size for * GRC Dump. @@ -3418,7 +3522,7 @@ void qed_set_geneve_enable(struct qed_hwfn *p_hwfn, #define MSTORM_TPA_TIMEOUT_US_SIZE (IRO[21].size) #define MSTORM_ETH_PF_STAT_OFFSET(pf_id) \ (IRO[22].base + ((pf_id) * IRO[22].m1)) -#define MSTORM_ETH_PF_STAT_SIZE (IRO[21].size) +#define MSTORM_ETH_PF_STAT_SIZE (IRO[22].size) #define USTORM_QUEUE_STAT_OFFSET(stat_counter_id) \ (IRO[23].base + ((stat_counter_id) * IRO[23].m1)) #define USTORM_QUEUE_STAT_SIZE (IRO[23].size) @@ -3482,7 +3586,7 @@ void qed_set_geneve_enable(struct qed_hwfn *p_hwfn, static const struct iro iro_arr[47] = { {0x0, 0x0, 0x0, 0x0, 0x8}, - {0x4cb0, 0x78, 0x0, 0x0, 0x78}, + {0x4cb0, 0x80, 0x0, 0x0, 0x80}, {0x6318, 0x20, 0x0, 0x0, 0x20}, {0xb00, 0x8, 0x0, 0x0, 0x4}, {0xa80, 0x8, 0x0, 0x0, 0x4}, @@ -3521,13 +3625,13 @@ static const struct iro iro_arr[47] = { {0xd888, 0x38, 0x0, 0x0, 0x24}, {0x12c38, 0x10, 0x0, 0x0, 0x8}, {0x11aa0, 0x38, 0x0, 0x0, 0x18}, - {0xa8c0, 0x30, 0x0, 0x0, 0x10}, - {0x86f8, 0x28, 0x0, 0x0, 0x18}, + {0xa8c0, 0x38, 0x0, 0x0, 0x10}, + {0x86f8, 0x30, 0x0, 0x0, 0x18}, {0x101f8, 0x10, 0x0, 0x0, 0x10}, {0xdd08, 0x48, 0x0, 0x0, 0x38}, {0x10660, 0x20, 0x0, 0x0, 0x20}, {0x2b80, 0x80, 0x0, 0x0, 0x10}, - {0x5000, 0x10, 0x0, 0x0, 0x10}, + {0x5020, 0x10, 0x0, 0x0, 0x10}, }; /* Runtime array offsets */ @@ -4595,6 +4699,12 @@ enum eth_ipv4_frag_type { MAX_ETH_IPV4_FRAG_TYPE }; +enum eth_ip_type { + ETH_IPV4, + ETH_IPV6, + MAX_ETH_IP_TYPE +}; + enum eth_ramrod_cmd_id { ETH_RAMROD_UNUSED, ETH_RAMROD_VPORT_START, @@ -4944,7 +5054,10 @@ struct vport_update_ramrod_data_cmn { u8 update_mtu_flg; __le16 mtu; - u8 reserved[2]; + u8 update_ctl_frame_checks_en_flg; + u8 ctl_frame_mac_check_en; + u8 ctl_frame_ethtype_check_en; + u8 reserved[15]; }; struct vport_update_ramrod_mcast { @@ -4962,6 +5075,492 @@ struct vport_update_ramrod_data { struct eth_vport_rss_config rss_config; }; +struct mstorm_eth_conn_ag_ctx { + u8 byte0; + u8 byte1; + u8 flags0; +#define MSTORM_ETH_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define MSTORM_ETH_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define MSTORM_ETH_CONN_AG_CTX_BIT1_MASK 0x1 +#define 
MSTORM_ETH_CONN_AG_CTX_BIT1_SHIFT 1 +#define MSTORM_ETH_CONN_AG_CTX_CF0_MASK 0x3 +#define MSTORM_ETH_CONN_AG_CTX_CF0_SHIFT 2 +#define MSTORM_ETH_CONN_AG_CTX_CF1_MASK 0x3 +#define MSTORM_ETH_CONN_AG_CTX_CF1_SHIFT 4 +#define MSTORM_ETH_CONN_AG_CTX_CF2_MASK 0x3 +#define MSTORM_ETH_CONN_AG_CTX_CF2_SHIFT 6 + u8 flags1; +#define MSTORM_ETH_CONN_AG_CTX_CF0EN_MASK 0x1 +#define MSTORM_ETH_CONN_AG_CTX_CF0EN_SHIFT 0 +#define MSTORM_ETH_CONN_AG_CTX_CF1EN_MASK 0x1 +#define MSTORM_ETH_CONN_AG_CTX_CF1EN_SHIFT 1 +#define MSTORM_ETH_CONN_AG_CTX_CF2EN_MASK 0x1 +#define MSTORM_ETH_CONN_AG_CTX_CF2EN_SHIFT 2 +#define MSTORM_ETH_CONN_AG_CTX_RULE0EN_MASK 0x1 +#define MSTORM_ETH_CONN_AG_CTX_RULE0EN_SHIFT 3 +#define MSTORM_ETH_CONN_AG_CTX_RULE1EN_MASK 0x1 +#define MSTORM_ETH_CONN_AG_CTX_RULE1EN_SHIFT 4 +#define MSTORM_ETH_CONN_AG_CTX_RULE2EN_MASK 0x1 +#define MSTORM_ETH_CONN_AG_CTX_RULE2EN_SHIFT 5 +#define MSTORM_ETH_CONN_AG_CTX_RULE3EN_MASK 0x1 +#define MSTORM_ETH_CONN_AG_CTX_RULE3EN_SHIFT 6 +#define MSTORM_ETH_CONN_AG_CTX_RULE4EN_MASK 0x1 +#define MSTORM_ETH_CONN_AG_CTX_RULE4EN_SHIFT 7 + __le16 word0; + __le16 word1; + __le32 reg0; + __le32 reg1; +}; + +struct xstorm_eth_conn_agctxdq_ext_ldpart { + u8 reserved0; + u8 eth_state; + u8 flags0; +#define XSTORMETHCONNAGCTXDQEXTLDPART_EXIST_IN_QM0_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_EXIST_IN_QM0_SHIFT 0 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED1_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED1_SHIFT 1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED2_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED2_SHIFT 2 +#define XSTORMETHCONNAGCTXDQEXTLDPART_EXIST_IN_QM3_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_EXIST_IN_QM3_SHIFT 3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED3_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED3_SHIFT 4 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED4_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED4_SHIFT 5 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED5_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED5_SHIFT 6 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED6_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED6_SHIFT 7 + u8 flags1; +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED7_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED7_SHIFT 0 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED8_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED8_SHIFT 1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED9_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED9_SHIFT 2 +#define XSTORMETHCONNAGCTXDQEXTLDPART_BIT11_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_BIT11_SHIFT 3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_BIT12_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_BIT12_SHIFT 4 +#define XSTORMETHCONNAGCTXDQEXTLDPART_BIT13_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_BIT13_SHIFT 5 +#define XSTORMETHCONNAGCTXDQEXTLDPART_TX_RULE_ACTIVE_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_TX_RULE_ACTIVE_SHIFT 6 +#define XSTORMETHCONNAGCTXDQEXTLDPART_DQ_CF_ACTIVE_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_DQ_CF_ACTIVE_SHIFT 7 + u8 flags2; +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF0_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF0_SHIFT 0 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF1_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF1_SHIFT 2 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF2_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF2_SHIFT 4 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF3_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF3_SHIFT 6 + u8 flags3; +#define 
XSTORMETHCONNAGCTXDQEXTLDPART_CF4_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF4_SHIFT 0 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF5_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF5_SHIFT 2 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF6_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF6_SHIFT 4 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF7_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF7_SHIFT 6 + u8 flags4; +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF8_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF8_SHIFT 0 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF9_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF9_SHIFT 2 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF10_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF10_SHIFT 4 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF11_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF11_SHIFT 6 + u8 flags5; +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF12_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF12_SHIFT 0 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF13_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF13_SHIFT 2 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF14_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF14_SHIFT 4 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF15_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF15_SHIFT 6 + u8 flags6; +#define XSTORMETHCONNAGCTXDQEXTLDPART_GO_TO_BD_CONS_CF_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_GO_TO_BD_CONS_CF_SHIFT 0 +#define XSTORMETHCONNAGCTXDQEXTLDPART_MULTI_UNICAST_CF_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_MULTI_UNICAST_CF_SHIFT 2 +#define XSTORMETHCONNAGCTXDQEXTLDPART_DQ_CF_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_DQ_CF_SHIFT 4 +#define XSTORMETHCONNAGCTXDQEXTLDPART_TERMINATE_CF_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_TERMINATE_CF_SHIFT 6 + u8 flags7; +#define XSTORMETHCONNAGCTXDQEXTLDPART_FLUSH_Q0_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_FLUSH_Q0_SHIFT 0 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED10_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED10_SHIFT 2 +#define XSTORMETHCONNAGCTXDQEXTLDPART_SLOW_PATH_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_SLOW_PATH_SHIFT 4 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF0EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF0EN_SHIFT 6 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF1EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF1EN_SHIFT 7 + u8 flags8; +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF2EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF2EN_SHIFT 0 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF3EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF3EN_SHIFT 1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF4EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF4EN_SHIFT 2 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF5EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF5EN_SHIFT 3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF6EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF6EN_SHIFT 4 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF7EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF7EN_SHIFT 5 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF8EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF8EN_SHIFT 6 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF9EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF9EN_SHIFT 7 + u8 flags9; +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF10EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF10EN_SHIFT 0 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF11EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF11EN_SHIFT 1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF12EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF12EN_SHIFT 2 +#define 
XSTORMETHCONNAGCTXDQEXTLDPART_CF13EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF13EN_SHIFT 3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF14EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF14EN_SHIFT 4 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF15EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_CF15EN_SHIFT 5 +#define XSTORMETHCONNAGCTXDQEXTLDPART_GO_TO_BD_CONS_CF_EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_GO_TO_BD_CONS_CF_EN_SHIFT 6 +#define XSTORMETHCONNAGCTXDQEXTLDPART_MULTI_UNICAST_CF_EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_MULTI_UNICAST_CF_EN_SHIFT 7 + u8 flags10; +#define XSTORMETHCONNAGCTXDQEXTLDPART_DQ_CF_EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_DQ_CF_EN_SHIFT 0 +#define XSTORMETHCONNAGCTXDQEXTLDPART_TERMINATE_CF_EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_TERMINATE_CF_EN_SHIFT 1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_FLUSH_Q0_EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_FLUSH_Q0_EN_SHIFT 2 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED11_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED11_SHIFT 3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_SLOW_PATH_EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_SLOW_PATH_EN_SHIFT 4 +#define XSTORMETHCONNAGCTXDQEXTLDPART_TPH_ENABLE_EN_RESERVED_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_TPH_ENABLE_EN_RESERVED_SHIFT 5 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED12_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED12_SHIFT 6 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED13_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED13_SHIFT 7 + u8 flags11; +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED14_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED14_SHIFT 0 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED15_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RESERVED15_SHIFT 1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_TX_DEC_RULE_EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_TX_DEC_RULE_EN_SHIFT 2 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE5EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE5EN_SHIFT 3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE6EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE6EN_SHIFT 4 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE7EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE7EN_SHIFT 5 +#define XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED1_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED1_SHIFT 6 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE9EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE9EN_SHIFT 7 + u8 flags12; +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE10EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE10EN_SHIFT 0 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE11EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE11EN_SHIFT 1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED2_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED2_SHIFT 2 +#define XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED3_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED3_SHIFT 3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE14EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE14EN_SHIFT 4 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE15EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE15EN_SHIFT 5 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE16EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE16EN_SHIFT 6 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE17EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE17EN_SHIFT 7 + u8 flags13; +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE18EN_MASK 0x1 +#define 
XSTORMETHCONNAGCTXDQEXTLDPART_RULE18EN_SHIFT 0 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE19EN_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_RULE19EN_SHIFT 1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED4_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED4_SHIFT 2 +#define XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED5_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED5_SHIFT 3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED6_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED6_SHIFT 4 +#define XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED7_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED7_SHIFT 5 +#define XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED8_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED8_SHIFT 6 +#define XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED9_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_A0_RESERVED9_SHIFT 7 + u8 flags14; +#define XSTORMETHCONNAGCTXDQEXTLDPART_EDPM_USE_EXT_HDR_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_EDPM_USE_EXT_HDR_SHIFT 0 +#define XSTORMETHCONNAGCTXDQEXTLDPART_EDPM_SEND_RAW_L3L4_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_EDPM_SEND_RAW_L3L4_SHIFT 1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_EDPM_INBAND_PROP_HDR_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_EDPM_INBAND_PROP_HDR_SHIFT 2 +#define XSTORMETHCONNAGCTXDQEXTLDPART_EDPM_SEND_EXT_TUNNEL_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_EDPM_SEND_EXT_TUNNEL_SHIFT 3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_L2_EDPM_ENABLE_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_L2_EDPM_ENABLE_SHIFT 4 +#define XSTORMETHCONNAGCTXDQEXTLDPART_ROCE_EDPM_ENABLE_MASK 0x1 +#define XSTORMETHCONNAGCTXDQEXTLDPART_ROCE_EDPM_ENABLE_SHIFT 5 +#define XSTORMETHCONNAGCTXDQEXTLDPART_TPH_ENABLE_MASK 0x3 +#define XSTORMETHCONNAGCTXDQEXTLDPART_TPH_ENABLE_SHIFT 6 + u8 edpm_event_id; + __le16 physical_q0; + __le16 quota; + __le16 edpm_num_bds; + __le16 tx_bd_cons; + __le16 tx_bd_prod; + __le16 tx_class; + __le16 conn_dpi; + u8 byte3; + u8 byte4; + u8 byte5; + u8 byte6; + __le32 reg0; + __le32 reg1; + __le32 reg2; + __le32 reg3; + __le32 reg4; +}; + +struct xstorm_eth_hw_conn_ag_ctx { + u8 reserved0; + u8 eth_state; + u8 flags0; +#define XSTORM_ETH_HW_CONN_AG_CTX_EXIST_IN_QM0_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_EXIST_IN_QM0_SHIFT 0 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED1_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED1_SHIFT 1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED2_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED2_SHIFT 2 +#define XSTORM_ETH_HW_CONN_AG_CTX_EXIST_IN_QM3_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_EXIST_IN_QM3_SHIFT 3 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED3_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED3_SHIFT 4 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED4_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED4_SHIFT 5 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED5_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED5_SHIFT 6 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED6_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED6_SHIFT 7 + u8 flags1; +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED7_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED7_SHIFT 0 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED8_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED8_SHIFT 1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED9_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED9_SHIFT 2 +#define XSTORM_ETH_HW_CONN_AG_CTX_BIT11_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_BIT11_SHIFT 3 +#define XSTORM_ETH_HW_CONN_AG_CTX_BIT12_MASK 0x1 +#define 
XSTORM_ETH_HW_CONN_AG_CTX_BIT12_SHIFT 4 +#define XSTORM_ETH_HW_CONN_AG_CTX_BIT13_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_BIT13_SHIFT 5 +#define XSTORM_ETH_HW_CONN_AG_CTX_TX_RULE_ACTIVE_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_TX_RULE_ACTIVE_SHIFT 6 +#define XSTORM_ETH_HW_CONN_AG_CTX_DQ_CF_ACTIVE_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_DQ_CF_ACTIVE_SHIFT 7 + u8 flags2; +#define XSTORM_ETH_HW_CONN_AG_CTX_CF0_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF0_SHIFT 0 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF1_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF1_SHIFT 2 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF2_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF2_SHIFT 4 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF3_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF3_SHIFT 6 + u8 flags3; +#define XSTORM_ETH_HW_CONN_AG_CTX_CF4_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF4_SHIFT 0 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF5_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF5_SHIFT 2 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF6_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF6_SHIFT 4 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF7_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF7_SHIFT 6 + u8 flags4; +#define XSTORM_ETH_HW_CONN_AG_CTX_CF8_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF8_SHIFT 0 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF9_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF9_SHIFT 2 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF10_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF10_SHIFT 4 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF11_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF11_SHIFT 6 + u8 flags5; +#define XSTORM_ETH_HW_CONN_AG_CTX_CF12_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF12_SHIFT 0 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF13_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF13_SHIFT 2 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF14_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF14_SHIFT 4 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF15_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF15_SHIFT 6 + u8 flags6; +#define XSTORM_ETH_HW_CONN_AG_CTX_GO_TO_BD_CONS_CF_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_GO_TO_BD_CONS_CF_SHIFT 0 +#define XSTORM_ETH_HW_CONN_AG_CTX_MULTI_UNICAST_CF_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_MULTI_UNICAST_CF_SHIFT 2 +#define XSTORM_ETH_HW_CONN_AG_CTX_DQ_CF_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_DQ_CF_SHIFT 4 +#define XSTORM_ETH_HW_CONN_AG_CTX_TERMINATE_CF_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_TERMINATE_CF_SHIFT 6 + u8 flags7; +#define XSTORM_ETH_HW_CONN_AG_CTX_FLUSH_Q0_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_FLUSH_Q0_SHIFT 0 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED10_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED10_SHIFT 2 +#define XSTORM_ETH_HW_CONN_AG_CTX_SLOW_PATH_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_SLOW_PATH_SHIFT 4 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF0EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF0EN_SHIFT 6 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF1EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF1EN_SHIFT 7 + u8 flags8; +#define XSTORM_ETH_HW_CONN_AG_CTX_CF2EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF2EN_SHIFT 0 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF3EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF3EN_SHIFT 1 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF4EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF4EN_SHIFT 2 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF5EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF5EN_SHIFT 3 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF6EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF6EN_SHIFT 4 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF7EN_MASK 0x1 +#define 
XSTORM_ETH_HW_CONN_AG_CTX_CF7EN_SHIFT 5 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF8EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF8EN_SHIFT 6 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF9EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF9EN_SHIFT 7 + u8 flags9; +#define XSTORM_ETH_HW_CONN_AG_CTX_CF10EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF10EN_SHIFT 0 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF11EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF11EN_SHIFT 1 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF12EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF12EN_SHIFT 2 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF13EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF13EN_SHIFT 3 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF14EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF14EN_SHIFT 4 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF15EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_CF15EN_SHIFT 5 +#define XSTORM_ETH_HW_CONN_AG_CTX_GO_TO_BD_CONS_CF_EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_GO_TO_BD_CONS_CF_EN_SHIFT 6 +#define XSTORM_ETH_HW_CONN_AG_CTX_MULTI_UNICAST_CF_EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_MULTI_UNICAST_CF_EN_SHIFT 7 + u8 flags10; +#define XSTORM_ETH_HW_CONN_AG_CTX_DQ_CF_EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_DQ_CF_EN_SHIFT 0 +#define XSTORM_ETH_HW_CONN_AG_CTX_TERMINATE_CF_EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_TERMINATE_CF_EN_SHIFT 1 +#define XSTORM_ETH_HW_CONN_AG_CTX_FLUSH_Q0_EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_FLUSH_Q0_EN_SHIFT 2 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED11_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED11_SHIFT 3 +#define XSTORM_ETH_HW_CONN_AG_CTX_SLOW_PATH_EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_SLOW_PATH_EN_SHIFT 4 +#define XSTORM_ETH_HW_CONN_AG_CTX_TPH_ENABLE_EN_RESERVED_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_TPH_ENABLE_EN_RESERVED_SHIFT 5 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED12_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED12_SHIFT 6 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED13_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED13_SHIFT 7 + u8 flags11; +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED14_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED14_SHIFT 0 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED15_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RESERVED15_SHIFT 1 +#define XSTORM_ETH_HW_CONN_AG_CTX_TX_DEC_RULE_EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_TX_DEC_RULE_EN_SHIFT 2 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE5EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE5EN_SHIFT 3 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE6EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE6EN_SHIFT 4 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE7EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE7EN_SHIFT 5 +#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED1_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED1_SHIFT 6 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE9EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE9EN_SHIFT 7 + u8 flags12; +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE10EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE10EN_SHIFT 0 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE11EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE11EN_SHIFT 1 +#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED2_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED2_SHIFT 2 +#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED3_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED3_SHIFT 3 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE14EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE14EN_SHIFT 4 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE15EN_MASK 0x1 +#define 
XSTORM_ETH_HW_CONN_AG_CTX_RULE15EN_SHIFT 5 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE16EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE16EN_SHIFT 6 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE17EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE17EN_SHIFT 7 + u8 flags13; +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE18EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE18EN_SHIFT 0 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE19EN_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_RULE19EN_SHIFT 1 +#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED4_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED4_SHIFT 2 +#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED5_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED5_SHIFT 3 +#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED6_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED6_SHIFT 4 +#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED7_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED7_SHIFT 5 +#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED8_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED8_SHIFT 6 +#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED9_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_A0_RESERVED9_SHIFT 7 + u8 flags14; +#define XSTORM_ETH_HW_CONN_AG_CTX_EDPM_USE_EXT_HDR_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_EDPM_USE_EXT_HDR_SHIFT 0 +#define XSTORM_ETH_HW_CONN_AG_CTX_EDPM_SEND_RAW_L3L4_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_EDPM_SEND_RAW_L3L4_SHIFT 1 +#define XSTORM_ETH_HW_CONN_AG_CTX_EDPM_INBAND_PROP_HDR_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_EDPM_INBAND_PROP_HDR_SHIFT 2 +#define XSTORM_ETH_HW_CONN_AG_CTX_EDPM_SEND_EXT_TUNNEL_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_EDPM_SEND_EXT_TUNNEL_SHIFT 3 +#define XSTORM_ETH_HW_CONN_AG_CTX_L2_EDPM_ENABLE_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_L2_EDPM_ENABLE_SHIFT 4 +#define XSTORM_ETH_HW_CONN_AG_CTX_ROCE_EDPM_ENABLE_MASK 0x1 +#define XSTORM_ETH_HW_CONN_AG_CTX_ROCE_EDPM_ENABLE_SHIFT 5 +#define XSTORM_ETH_HW_CONN_AG_CTX_TPH_ENABLE_MASK 0x3 +#define XSTORM_ETH_HW_CONN_AG_CTX_TPH_ENABLE_SHIFT 6 + u8 edpm_event_id; + __le16 physical_q0; + __le16 quota; + __le16 edpm_num_bds; + __le16 tx_bd_cons; + __le16 tx_bd_prod; + __le16 tx_class; + __le16 conn_dpi; +}; + struct mstorm_rdma_task_st_ctx { struct regpair temp[4]; }; @@ -6165,7 +6764,7 @@ struct ystorm_roce_conn_st_ctx { }; struct xstorm_roce_conn_st_ctx { - struct regpair temp[22]; + struct regpair temp[24]; }; struct tstorm_roce_conn_st_ctx { @@ -6220,7 +6819,7 @@ struct roce_create_qp_req_ramrod_data { __le16 mtu; __le16 pd; __le16 sq_num_pages; - __le16 reseved2; + __le16 low_latency_phy_queue; struct regpair sq_pbl_addr; struct regpair orq_pbl_addr; __le16 local_mac_addr[3]; @@ -6234,7 +6833,7 @@ struct roce_create_qp_req_ramrod_data { u8 stats_counter_id; u8 reserved3[7]; __le32 cq_cid; - __le16 physical_queue0; + __le16 regular_latency_phy_queue; __le16 dpi; }; @@ -6282,15 +6881,16 @@ struct roce_create_qp_resp_ramrod_data { __le32 dst_gid[4]; struct regpair qp_handle_for_cqe; struct regpair qp_handle_for_async; - __le32 reserved2[2]; + __le16 low_latency_phy_queue; + u8 reserved2[6]; __le32 cq_cid; - __le16 physical_queue0; + __le16 regular_latency_phy_queue; __le16 dpi; }; struct roce_destroy_qp_req_output_params { __le32 num_bound_mw; - __le32 reserved; + __le32 cq_prod; }; struct roce_destroy_qp_req_ramrod_data { @@ -6299,7 +6899,7 @@ struct roce_destroy_qp_req_ramrod_data { struct roce_destroy_qp_resp_output_params { __le32 num_invalidated_mw; - __le32 reserved; + __le32 cq_prod; }; struct roce_destroy_qp_resp_ramrod_data { @@ -7426,6 
+8026,7 @@ struct ystorm_fcoe_conn_st_ctx { u8 fcp_rsp_size; __le16 mss; struct regpair reserved; + __le16 min_frame_size; u8 protection_info_flags; #define YSTORM_FCOE_CONN_ST_CTX_SUPPORT_PROTECTION_MASK 0x1 #define YSTORM_FCOE_CONN_ST_CTX_SUPPORT_PROTECTION_SHIFT 0 @@ -7444,7 +8045,6 @@ struct ystorm_fcoe_conn_st_ctx { #define YSTORM_FCOE_CONN_ST_CTX_RSRV_MASK 0x3F #define YSTORM_FCOE_CONN_ST_CTX_RSRV_SHIFT 2 u8 fcp_xfer_size; - u8 reserved3[2]; }; struct fcoe_vlan_fields { @@ -8273,10 +8873,10 @@ struct xstorm_iscsi_conn_ag_ctx { #define XSTORM_ISCSI_CONN_AG_CTX_DQ_FLUSH_MASK 0x3 #define XSTORM_ISCSI_CONN_AG_CTX_DQ_FLUSH_SHIFT 6 u8 flags7; -#define XSTORM_ISCSI_CONN_AG_CTX_FLUSH_Q0_MASK 0x3 -#define XSTORM_ISCSI_CONN_AG_CTX_FLUSH_Q0_SHIFT 0 -#define XSTORM_ISCSI_CONN_AG_CTX_FLUSH_Q1_MASK 0x3 -#define XSTORM_ISCSI_CONN_AG_CTX_FLUSH_Q1_SHIFT 2 +#define XSTORM_ISCSI_CONN_AG_CTX_MST_XCM_Q0_FLUSH_CF_MASK 0x3 +#define XSTORM_ISCSI_CONN_AG_CTX_MST_XCM_Q0_FLUSH_CF_SHIFT 0 +#define XSTORM_ISCSI_CONN_AG_CTX_UST_XCM_Q1_FLUSH_CF_MASK 0x3 +#define XSTORM_ISCSI_CONN_AG_CTX_UST_XCM_Q1_FLUSH_CF_SHIFT 2 #define XSTORM_ISCSI_CONN_AG_CTX_SLOW_PATH_MASK 0x3 #define XSTORM_ISCSI_CONN_AG_CTX_SLOW_PATH_SHIFT 4 #define XSTORM_ISCSI_CONN_AG_CTX_CF0EN_MASK 0x1 @@ -8322,10 +8922,10 @@ struct xstorm_iscsi_conn_ag_ctx { #define XSTORM_ISCSI_CONN_AG_CTX_CF18EN_SHIFT 0 #define XSTORM_ISCSI_CONN_AG_CTX_DQ_FLUSH_EN_MASK 0x1 #define XSTORM_ISCSI_CONN_AG_CTX_DQ_FLUSH_EN_SHIFT 1 -#define XSTORM_ISCSI_CONN_AG_CTX_FLUSH_Q0_EN_MASK 0x1 -#define XSTORM_ISCSI_CONN_AG_CTX_FLUSH_Q0_EN_SHIFT 2 -#define XSTORM_ISCSI_CONN_AG_CTX_FLUSH_Q1_EN_MASK 0x1 -#define XSTORM_ISCSI_CONN_AG_CTX_FLUSH_Q1_EN_SHIFT 3 +#define XSTORM_ISCSI_CONN_AG_CTX_MST_XCM_Q0_FLUSH_CF_EN_MASK 0x1 +#define XSTORM_ISCSI_CONN_AG_CTX_MST_XCM_Q0_FLUSH_CF_EN_SHIFT 2 +#define XSTORM_ISCSI_CONN_AG_CTX_UST_XCM_Q1_FLUSH_CF_EN_MASK 0x1 +#define XSTORM_ISCSI_CONN_AG_CTX_UST_XCM_Q1_FLUSH_CF_EN_SHIFT 3 #define XSTORM_ISCSI_CONN_AG_CTX_SLOW_PATH_EN_MASK 0x1 #define XSTORM_ISCSI_CONN_AG_CTX_SLOW_PATH_EN_SHIFT 4 #define XSTORM_ISCSI_CONN_AG_CTX_PROC_ONLY_CLEANUP_EN_MASK 0x1 @@ -8335,8 +8935,8 @@ struct xstorm_iscsi_conn_ag_ctx { #define XSTORM_ISCSI_CONN_AG_CTX_MORE_TO_SEND_DEC_RULE_EN_MASK 0x1 #define XSTORM_ISCSI_CONN_AG_CTX_MORE_TO_SEND_DEC_RULE_EN_SHIFT 7 u8 flags11; -#define XSTORM_ISCSI_CONN_AG_CTX_RULE2EN_MASK 0x1 -#define XSTORM_ISCSI_CONN_AG_CTX_RULE2EN_SHIFT 0 +#define XSTORM_ISCSI_CONN_AG_CTX_TX_BLOCKED_EN_MASK 0x1 +#define XSTORM_ISCSI_CONN_AG_CTX_TX_BLOCKED_EN_SHIFT 0 #define XSTORM_ISCSI_CONN_AG_CTX_RULE3EN_MASK 0x1 #define XSTORM_ISCSI_CONN_AG_CTX_RULE3EN_SHIFT 1 #define XSTORM_ISCSI_CONN_AG_CTX_RESERVED3_MASK 0x1 @@ -8440,7 +9040,7 @@ struct xstorm_iscsi_conn_ag_ctx { __le32 reg10; __le32 reg11; __le32 exp_stat_sn; - __le32 reg13; + __le32 ongoing_fast_rxmit_seq; __le32 reg14; __le32 reg15; __le32 reg16; @@ -8466,10 +9066,10 @@ struct tstorm_iscsi_conn_ag_ctx { #define TSTORM_ISCSI_CONN_AG_CTX_CF0_MASK 0x3 #define TSTORM_ISCSI_CONN_AG_CTX_CF0_SHIFT 6 u8 flags1; -#define TSTORM_ISCSI_CONN_AG_CTX_CF1_MASK 0x3 -#define TSTORM_ISCSI_CONN_AG_CTX_CF1_SHIFT 0 -#define TSTORM_ISCSI_CONN_AG_CTX_CF2_MASK 0x3 -#define TSTORM_ISCSI_CONN_AG_CTX_CF2_SHIFT 2 +#define TSTORM_ISCSI_CONN_AG_CTX_P2T_FLUSH_CF_MASK 0x3 +#define TSTORM_ISCSI_CONN_AG_CTX_P2T_FLUSH_CF_SHIFT 0 +#define TSTORM_ISCSI_CONN_AG_CTX_M2T_FLUSH_CF_MASK 0x3 +#define TSTORM_ISCSI_CONN_AG_CTX_M2T_FLUSH_CF_SHIFT 2 #define TSTORM_ISCSI_CONN_AG_CTX_TIMER_STOP_ALL_MASK 0x3 #define 
TSTORM_ISCSI_CONN_AG_CTX_TIMER_STOP_ALL_SHIFT 4 #define TSTORM_ISCSI_CONN_AG_CTX_CF4_MASK 0x3 @@ -8490,10 +9090,10 @@ struct tstorm_iscsi_conn_ag_ctx { #define TSTORM_ISCSI_CONN_AG_CTX_CF10_SHIFT 2 #define TSTORM_ISCSI_CONN_AG_CTX_CF0EN_MASK 0x1 #define TSTORM_ISCSI_CONN_AG_CTX_CF0EN_SHIFT 4 -#define TSTORM_ISCSI_CONN_AG_CTX_CF1EN_MASK 0x1 -#define TSTORM_ISCSI_CONN_AG_CTX_CF1EN_SHIFT 5 -#define TSTORM_ISCSI_CONN_AG_CTX_CF2EN_MASK 0x1 -#define TSTORM_ISCSI_CONN_AG_CTX_CF2EN_SHIFT 6 +#define TSTORM_ISCSI_CONN_AG_CTX_P2T_FLUSH_CF_EN_MASK 0x1 +#define TSTORM_ISCSI_CONN_AG_CTX_P2T_FLUSH_CF_EN_SHIFT 5 +#define TSTORM_ISCSI_CONN_AG_CTX_M2T_FLUSH_CF_EN_MASK 0x1 +#define TSTORM_ISCSI_CONN_AG_CTX_M2T_FLUSH_CF_EN_SHIFT 6 #define TSTORM_ISCSI_CONN_AG_CTX_TIMER_STOP_ALL_EN_MASK 0x1 #define TSTORM_ISCSI_CONN_AG_CTX_TIMER_STOP_ALL_EN_SHIFT 7 u8 flags4; @@ -8539,7 +9139,7 @@ struct tstorm_iscsi_conn_ag_ctx { __le32 reg6; __le32 reg7; __le32 reg8; - u8 byte2; + u8 cid_offload_cnt; u8 byte3; __le16 word0; }; @@ -9067,6 +9667,10 @@ struct dcb_dscp_map { struct public_global { u32 max_path; u32 max_ports; +#define MODE_1P 1 +#define MODE_2P 2 +#define MODE_3P 3 +#define MODE_4P 4 u32 debug_mb_offset; u32 phymod_dbg_mb_offset; struct couple_mode_teaming cmt; diff --git a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c index d891a6852695..2a50e2b7568f 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c +++ b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c @@ -215,13 +215,6 @@ static void qed_cmdq_lines_voq_rt_init(struct qed_hwfn *p_hwfn, { u32 qm_line_crd; - /* In A0 - Limit the size of pbf queue so that only 511 commands with - * the minimum size of 4 (FCoE minimum size) - */ - bool is_bb_a0 = QED_IS_BB_A0(p_hwfn->cdev); - - if (is_bb_a0) - cmdq_lines = min_t(u32, cmdq_lines, 1022); qm_line_crd = QM_VOQ_LINE_CRD(cmdq_lines); OVERWRITE_RT_REG(p_hwfn, PBF_CMDQ_LINES_RT_OFFSET(voq), (u32)cmdq_lines); @@ -343,13 +336,11 @@ static void qed_tx_pq_map_rt_init( u16 first_pq_group = p_params->start_pq / QM_PF_QUEUE_GROUP_SIZE; u16 last_pq_group = (p_params->start_pq + num_pqs - 1) / QM_PF_QUEUE_GROUP_SIZE; - bool is_bb_a0 = QED_IS_BB_A0(p_hwfn->cdev); u16 i, pq_id, pq_group; /* a bit per Tx PQ indicating if the PQ is associated with a VF */ u32 tx_pq_vf_mask[MAX_QM_TX_QUEUES / QM_PF_QUEUE_GROUP_SIZE] = { 0 }; - u32 tx_pq_vf_mask_width = is_bb_a0 ? 32 : QM_PF_QUEUE_GROUP_SIZE; - u32 num_tx_pq_vf_masks = MAX_QM_TX_QUEUES / tx_pq_vf_mask_width; + u32 num_tx_pq_vf_masks = MAX_QM_TX_QUEUES / QM_PF_QUEUE_GROUP_SIZE; u32 pq_mem_4kb = QM_PQ_MEM_4KB(p_params->num_pf_cids); u32 vport_pq_mem_4kb = QM_PQ_MEM_4KB(p_params->num_vf_cids); u32 mem_addr_4kb = base_mem_addr_4kb; @@ -371,6 +362,10 @@ static void qed_tx_pq_map_rt_init( bool is_vf_pq = (i >= p_params->num_pf_pqs); struct qm_rf_pq_map tx_pq_map; + bool rl_valid = p_params->pq_params[i].rl_valid && + (p_params->pq_params[i].vport_id < + MAX_QM_GLOBAL_RLS); + /* update first Tx PQ of VPORT/TC */ u8 vport_id_in_pf = p_params->pq_params[i].vport_id - p_params->start_vport; @@ -389,14 +384,18 @@ static void qed_tx_pq_map_rt_init( (p_params->pf_id << QM_WFQ_VP_PQ_PF_SHIFT)); } + + if (p_params->pq_params[i].rl_valid && !rl_valid) + DP_NOTICE(p_hwfn, + "Invalid VPORT ID for rate limiter configuration"); /* fill PQ map entry */ memset(&tx_pq_map, 0, sizeof(tx_pq_map)); SET_FIELD(tx_pq_map.reg, QM_RF_PQ_MAP_PQ_VALID, 1); - SET_FIELD(tx_pq_map.reg, QM_RF_PQ_MAP_RL_VALID, - p_params->pq_params[i].rl_valid ? 
1 : 0); + SET_FIELD(tx_pq_map.reg, + QM_RF_PQ_MAP_RL_VALID, rl_valid ? 1 : 0); SET_FIELD(tx_pq_map.reg, QM_RF_PQ_MAP_VP_PQ_ID, first_tx_pq_id); SET_FIELD(tx_pq_map.reg, QM_RF_PQ_MAP_RL_ID, - p_params->pq_params[i].rl_valid ? + rl_valid ? p_params->pq_params[i].vport_id : 0); SET_FIELD(tx_pq_map.reg, QM_RF_PQ_MAP_VOQ, voq); SET_FIELD(tx_pq_map.reg, QM_RF_PQ_MAP_WRR_WEIGHT_GROUP, @@ -413,8 +412,9 @@ static void qed_tx_pq_map_rt_init( /* if PQ is associated with a VF, add indication * to PQ VF mask */ - tx_pq_vf_mask[pq_id / tx_pq_vf_mask_width] |= - (1 << (pq_id % tx_pq_vf_mask_width)); + tx_pq_vf_mask[pq_id / + QM_PF_QUEUE_GROUP_SIZE] |= + BIT((pq_id % QM_PF_QUEUE_GROUP_SIZE)); mem_addr_4kb += vport_pq_mem_4kb; } else { mem_addr_4kb += pq_mem_4kb; @@ -480,8 +480,8 @@ static int qed_pf_wfq_rt_init(struct qed_hwfn *p_hwfn, if (p_params->pf_id < MAX_NUM_PFS_BB) crd_reg_offset = QM_REG_WFQPFCRD_RT_OFFSET; else - crd_reg_offset = QM_REG_WFQPFCRD_MSB_RT_OFFSET + - (p_params->pf_id % MAX_NUM_PFS_BB); + crd_reg_offset = QM_REG_WFQPFCRD_MSB_RT_OFFSET; + crd_reg_offset += p_params->pf_id % MAX_NUM_PFS_BB; inc_val = QM_WFQ_INC_VAL(p_params->pf_wfq); if (!inc_val || inc_val > QM_WFQ_MAX_INC_VAL) { @@ -498,11 +498,11 @@ static int qed_pf_wfq_rt_init(struct qed_hwfn *p_hwfn, QM_WFQ_CRD_REG_SIGN_BIT); } - STORE_RT_REG(p_hwfn, QM_REG_WFQPFWEIGHT_RT_OFFSET + p_params->pf_id, - inc_val); STORE_RT_REG(p_hwfn, QM_REG_WFQPFUPPERBOUND_RT_OFFSET + p_params->pf_id, QM_WFQ_UPPER_BOUND | QM_WFQ_CRD_REG_SIGN_BIT); + STORE_RT_REG(p_hwfn, QM_REG_WFQPFWEIGHT_RT_OFFSET + p_params->pf_id, + inc_val); return 0; } @@ -576,6 +576,12 @@ static int qed_vport_rl_rt_init(struct qed_hwfn *p_hwfn, { u8 i, vport_id; + if (start_vport + num_vports >= MAX_QM_GLOBAL_RLS) { + DP_NOTICE(p_hwfn, + "Invalid VPORT ID for rate limiter configuration"); + return -1; + } + /* go over all PF VPORTs */ for (i = 0, vport_id = start_vport; i < num_vports; i++, vport_id++) { u32 inc_val = QM_RL_INC_VAL(vport_params[i].vport_rl); @@ -785,6 +791,12 @@ int qed_init_vport_rl(struct qed_hwfn *p_hwfn, { u32 inc_val = QM_RL_INC_VAL(vport_rl); + if (vport_id >= MAX_QM_GLOBAL_RLS) { + DP_NOTICE(p_hwfn, + "Invalid VPORT ID for rate limiter configuration"); + return -1; + } + if (inc_val > QM_RL_MAX_INC_VAL) { DP_NOTICE(p_hwfn, "Invalid VPORT rate-limit configuration"); return -1; @@ -940,12 +952,6 @@ void qed_set_geneve_enable(struct qed_hwfn *p_hwfn, eth_geneve_enable ? 1 : 0); qed_wr(p_hwfn, p_ptt, NIG_REG_NGE_IP_ENABLE, ip_geneve_enable ? 1 : 0); - /* comp ver */ - reg_val = (ip_geneve_enable || eth_geneve_enable) ? 
1 : 0; - qed_wr(p_hwfn, p_ptt, NIG_REG_NGE_COMP_VER, reg_val); - qed_wr(p_hwfn, p_ptt, PBF_REG_NGE_COMP_VER, reg_val); - qed_wr(p_hwfn, p_ptt, PRS_REG_NGE_COMP_VER, reg_val); - /* EDPM with geneve tunnel not supported in BB_B0 */ if (QED_IS_BB_B0(p_hwfn->cdev)) return; diff --git a/drivers/net/ethernet/qlogic/qed/qed_init_ops.c b/drivers/net/ethernet/qlogic/qed/qed_init_ops.c index 243b64e0d4dc..4a2e7be5bf72 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_init_ops.c +++ b/drivers/net/ethernet/qlogic/qed/qed_init_ops.c @@ -554,7 +554,7 @@ int qed_init_fw_data(struct qed_dev *cdev, const u8 *data) } /* First Dword contains metadata and should be skipped */ - buf_hdr = (struct bin_buffer_hdr *)(data + sizeof(u32)); + buf_hdr = (struct bin_buffer_hdr *)data; offset = buf_hdr[BIN_BUF_INIT_FW_VER_INFO].offset; fw->fw_ver_info = (struct fw_ver_info *)(data + offset); diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index 9a0b9af10a57..161d90376dae 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -594,7 +594,7 @@ static u8 qed_ll2_convert_rx_parse_to_tx_flags(u16 parse_flags) u8 bd_flags = 0; if (GET_FIELD(parse_flags, PARSING_AND_ERR_FLAGS_TAG8021QEXIST)) - SET_FIELD(bd_flags, CORE_TX_BD_FLAGS_VLAN_INSERTION, 1); + SET_FIELD(bd_flags, CORE_TX_BD_DATA_VLAN_INSERTION, 1); return bd_flags; } @@ -755,8 +755,8 @@ qed_ooo_submit_tx_buffers(struct qed_hwfn *p_hwfn, p_buffer->placement_offset; parse_flags = p_buffer->parse_flags; bd_flags = qed_ll2_convert_rx_parse_to_tx_flags(parse_flags); - SET_FIELD(bd_flags, CORE_TX_BD_FLAGS_FORCE_VLAN_MODE, 1); - SET_FIELD(bd_flags, CORE_TX_BD_FLAGS_L4_PROTOCOL, 1); + SET_FIELD(bd_flags, CORE_TX_BD_DATA_FORCE_VLAN_MODE, 1); + SET_FIELD(bd_flags, CORE_TX_BD_DATA_L4_PROTOCOL, 1); rc = qed_ll2_prepare_tx_packet(p_hwfn, p_ll2_conn->my_id, 1, p_buffer->vlan, bd_flags, @@ -1588,33 +1588,34 @@ static void qed_ll2_prepare_tx_packet_set(struct qed_hwfn *p_hwfn, p_tx->cur_send_frag_num++; } -static void qed_ll2_prepare_tx_packet_set_bd(struct qed_hwfn *p_hwfn, - struct qed_ll2_info *p_ll2, - struct qed_ll2_tx_packet *p_curp, - u8 num_of_bds, - enum core_tx_dest tx_dest, - u16 vlan, - u8 bd_flags, - u16 l4_hdr_offset_w, - enum core_roce_flavor_type type, - dma_addr_t first_frag, - u16 first_frag_len) +static void +qed_ll2_prepare_tx_packet_set_bd(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2, + struct qed_ll2_tx_packet *p_curp, + u8 num_of_bds, + enum core_tx_dest tx_dest, + u16 vlan, + u8 bd_flags, + u16 l4_hdr_offset_w, + enum core_roce_flavor_type roce_flavor, + dma_addr_t first_frag, + u16 first_frag_len) { struct qed_chain *p_tx_chain = &p_ll2->tx_queue.txq_chain; u16 prod_idx = qed_chain_get_prod_idx(p_tx_chain); struct core_tx_bd *start_bd = NULL; - u16 frag_idx; + u16 bd_data = 0, frag_idx; start_bd = (struct core_tx_bd *)qed_chain_produce(p_tx_chain); start_bd->nw_vlan_or_lb_echo = cpu_to_le16(vlan); SET_FIELD(start_bd->bitfield1, CORE_TX_BD_L4_HDR_OFFSET_W, cpu_to_le16(l4_hdr_offset_w)); SET_FIELD(start_bd->bitfield1, CORE_TX_BD_TX_DST, tx_dest); - start_bd->bd_flags.as_bitfield = bd_flags; - start_bd->bd_flags.as_bitfield |= CORE_TX_BD_FLAGS_START_BD_MASK << - CORE_TX_BD_FLAGS_START_BD_SHIFT; - SET_FIELD(start_bd->bitfield0, CORE_TX_BD_NBDS, num_of_bds); - SET_FIELD(start_bd->bitfield0, CORE_TX_BD_ROCE_FLAV, type); + bd_data |= bd_flags; + SET_FIELD(bd_data, CORE_TX_BD_DATA_START_BD, 0x1); + SET_FIELD(bd_data, CORE_TX_BD_DATA_NBDS, num_of_bds); + 
SET_FIELD(bd_data, CORE_TX_BD_DATA_ROCE_FLAV, roce_flavor); + start_bd->bd_data.as_bitfield = cpu_to_le16(bd_data); DMA_REGPAIR_LE(start_bd->addr, first_frag); start_bd->nbytes = cpu_to_le16(first_frag_len); @@ -1639,9 +1640,8 @@ static void qed_ll2_prepare_tx_packet_set_bd(struct qed_hwfn *p_hwfn, struct core_tx_bd **p_bd = &p_curp->bds_set[frag_idx].txq_bd; *p_bd = (struct core_tx_bd *)qed_chain_produce(p_tx_chain); - (*p_bd)->bd_flags.as_bitfield = 0; + (*p_bd)->bd_data.as_bitfield = 0; (*p_bd)->bitfield1 = 0; - (*p_bd)->bitfield0 = 0; p_curp->bds_set[frag_idx].tx_frag = 0; p_curp->bds_set[frag_idx].frag_len = 0; } @@ -2238,11 +2238,11 @@ static int qed_ll2_start_xmit(struct qed_dev *cdev, struct sk_buff *skb) /* Request HW to calculate IP csum */ if (!((vlan_get_protocol(skb) == htons(ETH_P_IPV6)) && ipv6_hdr(skb)->nexthdr == NEXTHDR_IPV6)) - flags |= BIT(CORE_TX_BD_FLAGS_IP_CSUM_SHIFT); + flags |= BIT(CORE_TX_BD_DATA_IP_CSUM_SHIFT); if (skb_vlan_tag_present(skb)) { vlan = skb_vlan_tag_get(skb); - flags |= BIT(CORE_TX_BD_FLAGS_VLAN_INSERTION_SHIFT); + flags |= BIT(CORE_TX_BD_DATA_VLAN_INSERTION_SHIFT); } rc = qed_ll2_prepare_tx_packet(QED_LEADING_HWFN(cdev), diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h index d59d9df60cd2..36ae361884e0 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h +++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h @@ -356,6 +356,10 @@ 0x238804UL #define RDIF_REG_STOP_ON_ERROR \ 0x300040UL +#define RDIF_REG_DEBUG_ERROR_INFO \ + 0x300400UL +#define RDIF_REG_DEBUG_ERROR_INFO_SIZE \ + 64 #define SRC_REG_SOFT_RST \ 0x23874cUL #define TCFC_REG_ACTIVITY_COUNTER \ @@ -370,6 +374,10 @@ 0x1700004UL #define TDIF_REG_STOP_ON_ERROR \ 0x310040UL +#define TDIF_REG_DEBUG_ERROR_INFO \ + 0x310400UL +#define TDIF_REG_DEBUG_ERROR_INFO_SIZE \ + 64 #define UCM_REG_INIT \ 0x1280000UL #define UMAC_REG_IPG_HD_BKP_CNTL_BB_B0 \ @@ -1236,6 +1244,26 @@ 0x1901534UL #define USEM_REG_DBG_FORCE_FRAME \ 0x1901538UL +#define NWS_REG_DBG_SELECT \ + 0x700128UL +#define NWS_REG_DBG_DWORD_ENABLE \ + 0x70012cUL +#define NWS_REG_DBG_SHIFT \ + 0x700130UL +#define NWS_REG_DBG_FORCE_VALID \ + 0x700134UL +#define NWS_REG_DBG_FORCE_FRAME \ + 0x700138UL +#define MS_REG_DBG_SELECT \ + 0x6a0228UL +#define MS_REG_DBG_DWORD_ENABLE \ + 0x6a022cUL +#define MS_REG_DBG_SHIFT \ + 0x6a0230UL +#define MS_REG_DBG_FORCE_VALID \ + 0x6a0234UL +#define MS_REG_DBG_FORCE_FRAME \ + 0x6a0238UL #define PCIE_REG_DBG_COMMON_SELECT \ 0x054398UL #define PCIE_REG_DBG_COMMON_DWORD_ENABLE \ @@ -1448,6 +1476,8 @@ 0x000b48UL #define RSS_REG_RSS_RAM_DATA \ 0x238c20UL +#define RSS_REG_RSS_RAM_DATA_SIZE \ + 4 #define MISC_REG_BLOCK_256B_EN \ 0x008c14UL #define NWS_REG_NWS_CMU \ diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c b/drivers/net/ethernet/qlogic/qed/qed_roce.c index d9ff6b28591c..4bef5c59627c 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_roce.c +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c @@ -66,13 +66,27 @@ #include "qed_roce.h" #include "qed_ll2.h" -void qed_async_roce_event(struct qed_hwfn *p_hwfn, - struct event_ring_entry *p_eqe) +static void qed_roce_free_real_icid(struct qed_hwfn *p_hwfn, u16 icid); + +void qed_roce_async_event(struct qed_hwfn *p_hwfn, + u8 fw_event_code, union rdma_eqe_data *rdma_data) { - struct qed_rdma_info *p_rdma_info = p_hwfn->p_rdma_info; + if (fw_event_code == ROCE_ASYNC_EVENT_DESTROY_QP_DONE) { + u16 icid = + (u16)le32_to_cpu(rdma_data->rdma_destroy_qp_data.cid); + + /* icid release in this async event can occur 
only if the icid + * was offloaded to the FW. In case it wasn't offloaded this is + * handled in qed_roce_sp_destroy_qp. + */ + qed_roce_free_real_icid(p_hwfn, icid); + } else { + struct qed_rdma_events *events = &p_hwfn->p_rdma_info->events; - p_rdma_info->events.affiliated_event(p_rdma_info->events.context, - p_eqe->opcode, &p_eqe->data); + events->affiliated_event(p_hwfn->p_rdma_info->events.context, + fw_event_code, + &rdma_data->async_handle); + } } static int qed_rdma_bmap_alloc(struct qed_hwfn *p_hwfn, @@ -113,6 +127,15 @@ static int qed_rdma_bmap_alloc_id(struct qed_hwfn *p_hwfn, return 0; } +static void qed_bmap_set_id(struct qed_hwfn *p_hwfn, + struct qed_bmap *bmap, u32 id_num) +{ + if (id_num >= bmap->max_count) + return; + + __set_bit(id_num, bmap->bitmap); +} + static void qed_bmap_release_id(struct qed_hwfn *p_hwfn, struct qed_bmap *bmap, u32 id_num) { @@ -129,6 +152,15 @@ static void qed_bmap_release_id(struct qed_hwfn *p_hwfn, } } +static int qed_bmap_test_id(struct qed_hwfn *p_hwfn, + struct qed_bmap *bmap, u32 id_num) +{ + if (id_num >= bmap->max_count) + return -1; + + return test_bit(id_num, bmap->bitmap); +} + static u32 qed_rdma_get_sb_id(void *p_hwfn, u32 rel_sb_id) { /* First sb id for RoCE is after all the l2 sb */ @@ -170,7 +202,8 @@ static int qed_rdma_alloc(struct qed_hwfn *p_hwfn, /* Queue zone lines are shared between RoCE and L2 in such a way that * they can be used by each without obstructing the other. */ - p_rdma_info->queue_zone_base = (u16)FEAT_NUM(p_hwfn, QED_L2_QUEUE); + p_rdma_info->queue_zone_base = (u16)RESC_START(p_hwfn, QED_L2_QUEUE); + p_rdma_info->max_queue_zones = (u16)RESC_NUM(p_hwfn, QED_L2_QUEUE); /* Allocate a struct with device params and fill it */ p_rdma_info->dev = kzalloc(sizeof(*p_rdma_info->dev), GFP_KERNEL); @@ -248,9 +281,18 @@ static int qed_rdma_alloc(struct qed_hwfn *p_hwfn, goto free_tid_map; } + /* Allocate bitmap for cids used for responders/requesters. */ + rc = qed_rdma_bmap_alloc(p_hwfn, &p_rdma_info->real_cid_map, num_cons); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Failed to allocate real cid bitmap, rc = %d\n", rc); + goto free_cid_map; + } DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Allocation successful\n"); return 0; +free_cid_map: + kfree(p_rdma_info->cid_map.bitmap); free_tid_map: kfree(p_rdma_info->tid_map.bitmap); free_toggle_map: @@ -273,7 +315,22 @@ free_rdma_info: static void qed_rdma_resc_free(struct qed_hwfn *p_hwfn) { + struct qed_bmap *rcid_map = &p_hwfn->p_rdma_info->real_cid_map; struct qed_rdma_info *p_rdma_info = p_hwfn->p_rdma_info; + int wait_count = 0; + + /* when destroying a_RoCE QP the control is returned to the user after + * the synchronous part. The asynchronous part may take a little longer. + * We delay for a short while if an async destroy QP is still expected. + * Beyond the added delay we clear the bitmap anyway. 
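+	 * (With the constants below the wait is bounded at roughly
+	 * 20 iterations * 100 ms, i.e. about two seconds, before the
+	 * maps are freed regardless.)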
+ */ + while (bitmap_weight(rcid_map->bitmap, rcid_map->max_count)) { + msleep(100); + if (wait_count++ > 20) { + DP_NOTICE(p_hwfn, "cid bitmap wait timed out\n"); + break; + } + } kfree(p_rdma_info->cid_map.bitmap); kfree(p_rdma_info->tid_map.bitmap); @@ -724,6 +781,14 @@ static void qed_rdma_cnq_prod_update(void *rdma_cxt, u8 qz_offset, u16 prod) u32 addr; p_hwfn = (struct qed_hwfn *)rdma_cxt; + + if (qz_offset > p_hwfn->p_rdma_info->max_queue_zones) { + DP_NOTICE(p_hwfn, + "queue zone offset %d is too large (max is %d)\n", + qz_offset, p_hwfn->p_rdma_info->max_queue_zones); + return; + } + qz_num = p_hwfn->p_rdma_info->queue_zone_base + qz_offset; addr = GTT_BAR0_MAP_REG_USDM_RAM + USTORM_COMMON_QUEUE_CONS_OFFSET(qz_num); @@ -1080,6 +1145,14 @@ static enum roce_flavor qed_roce_mode_to_flavor(enum roce_mode roce_mode) return flavor; } +void qed_roce_free_cid_pair(struct qed_hwfn *p_hwfn, u16 cid) +{ + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + qed_bmap_release_id(p_hwfn, &p_hwfn->p_rdma_info->cid_map, cid); + qed_bmap_release_id(p_hwfn, &p_hwfn->p_rdma_info->cid_map, cid + 1); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); +} + static int qed_roce_alloc_cid(struct qed_hwfn *p_hwfn, u16 *cid) { struct qed_rdma_info *p_rdma_info = p_hwfn->p_rdma_info; @@ -1139,6 +1212,13 @@ err: return rc; } +static void qed_roce_set_real_cid(struct qed_hwfn *p_hwfn, u32 cid) +{ + spin_lock_bh(&p_hwfn->p_rdma_info->lock); + qed_bmap_set_id(p_hwfn, &p_hwfn->p_rdma_info->real_cid_map, cid); + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); +} + static int qed_roce_sp_create_responder(struct qed_hwfn *p_hwfn, struct qed_rdma_qp *qp) { @@ -1147,7 +1227,8 @@ static int qed_roce_sp_create_responder(struct qed_hwfn *p_hwfn, union qed_qm_pq_params qm_params; enum roce_flavor roce_flavor; struct qed_spq_entry *p_ent; - u16 physical_queue0 = 0; + u16 regular_latency_queue; + enum protocol_type proto; int rc; DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); @@ -1229,15 +1310,19 @@ static int qed_roce_sp_create_responder(struct qed_hwfn *p_hwfn, p_ramrod->qp_handle_for_async.lo = cpu_to_le32(qp->qp_handle_async.lo); p_ramrod->qp_handle_for_cqe.hi = cpu_to_le32(qp->qp_handle.hi); p_ramrod->qp_handle_for_cqe.lo = cpu_to_le32(qp->qp_handle.lo); - p_ramrod->stats_counter_id = p_hwfn->rel_pf_id; p_ramrod->cq_cid = cpu_to_le32((p_hwfn->hw_info.opaque_fid << 16) | qp->rq_cq_id); memset(&qm_params, 0, sizeof(qm_params)); qm_params.roce.qpid = qp->icid >> 1; - physical_queue0 = qed_get_qm_pq(p_hwfn, PROTOCOLID_ROCE, &qm_params); + regular_latency_queue = qed_get_qm_pq(p_hwfn, PROTOCOLID_ROCE, + &qm_params); + + p_ramrod->regular_latency_phy_queue = + cpu_to_le16(regular_latency_queue); + p_ramrod->low_latency_phy_queue = + cpu_to_le16(regular_latency_queue); - p_ramrod->physical_queue0 = cpu_to_le16(physical_queue0); p_ramrod->dpi = cpu_to_le16(qp->dpi); qed_rdma_set_fw_mac(p_ramrod->remote_mac_addr, qp->remote_mac_addr); @@ -1253,13 +1338,19 @@ static int qed_roce_sp_create_responder(struct qed_hwfn *p_hwfn, rc = qed_spq_post(p_hwfn, p_ent, NULL); - DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "rc = %d physical_queue0 = 0x%x\n", - rc, physical_queue0); + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "rc = %d regular physical queue = 0x%x\n", rc, + regular_latency_queue); if (rc) goto err; qp->resp_offloaded = true; + qp->cq_prod = 0; + + proto = p_hwfn->p_rdma_info->proto; + qed_roce_set_real_cid(p_hwfn, qp->icid - + qed_cxt_get_proto_cid_start(p_hwfn, proto)); return rc; @@ -1280,7 +1371,8 @@ static int qed_roce_sp_create_requester(struct 
qed_hwfn *p_hwfn, union qed_qm_pq_params qm_params; enum roce_flavor roce_flavor; struct qed_spq_entry *p_ent; - u16 physical_queue0 = 0; + u16 regular_latency_queue; + enum protocol_type proto; int rc; DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); @@ -1351,15 +1443,19 @@ static int qed_roce_sp_create_requester(struct qed_hwfn *p_hwfn, p_ramrod->qp_handle_for_async.lo = cpu_to_le32(qp->qp_handle_async.lo); p_ramrod->qp_handle_for_cqe.hi = cpu_to_le32(qp->qp_handle.hi); p_ramrod->qp_handle_for_cqe.lo = cpu_to_le32(qp->qp_handle.lo); - p_ramrod->stats_counter_id = p_hwfn->rel_pf_id; - p_ramrod->cq_cid = cpu_to_le32((p_hwfn->hw_info.opaque_fid << 16) | - qp->sq_cq_id); + p_ramrod->cq_cid = + cpu_to_le32((p_hwfn->hw_info.opaque_fid << 16) | qp->sq_cq_id); memset(&qm_params, 0, sizeof(qm_params)); qm_params.roce.qpid = qp->icid >> 1; - physical_queue0 = qed_get_qm_pq(p_hwfn, PROTOCOLID_ROCE, &qm_params); + regular_latency_queue = qed_get_qm_pq(p_hwfn, PROTOCOLID_ROCE, + &qm_params); + + p_ramrod->regular_latency_phy_queue = + cpu_to_le16(regular_latency_queue); + p_ramrod->low_latency_phy_queue = + cpu_to_le16(regular_latency_queue); - p_ramrod->physical_queue0 = cpu_to_le16(physical_queue0); p_ramrod->dpi = cpu_to_le16(qp->dpi); qed_rdma_set_fw_mac(p_ramrod->remote_mac_addr, qp->remote_mac_addr); @@ -1378,6 +1474,10 @@ static int qed_roce_sp_create_requester(struct qed_hwfn *p_hwfn, goto err; qp->req_offloaded = true; + proto = p_hwfn->p_rdma_info->proto; + qed_roce_set_real_cid(p_hwfn, + qp->icid + 1 - + qed_cxt_get_proto_cid_start(p_hwfn, proto)); return rc; @@ -1577,7 +1677,8 @@ static int qed_roce_sp_modify_requester(struct qed_hwfn *p_hwfn, static int qed_roce_sp_destroy_qp_responder(struct qed_hwfn *p_hwfn, struct qed_rdma_qp *qp, - u32 *num_invalidated_mw) + u32 *num_invalidated_mw, + u32 *cq_prod) { struct roce_destroy_qp_resp_output_params *p_ramrod_res; struct roce_destroy_qp_resp_ramrod_data *p_ramrod; @@ -1588,8 +1689,22 @@ static int qed_roce_sp_destroy_qp_responder(struct qed_hwfn *p_hwfn, DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "icid = %08x\n", qp->icid); - if (!qp->resp_offloaded) + *num_invalidated_mw = 0; + *cq_prod = qp->cq_prod; + + if (!qp->resp_offloaded) { + /* If a responder was never offload, we need to free the cids + * allocated in create_qp as a FW async event will never arrive + */ + u32 cid; + + cid = qp->icid - + qed_cxt_get_proto_cid_start(p_hwfn, + p_hwfn->p_rdma_info->proto); + qed_roce_free_cid_pair(p_hwfn, (u16)cid); + return 0; + } /* Get SPQ entry */ memset(&init_data, 0, sizeof(init_data)); @@ -1624,6 +1739,8 @@ static int qed_roce_sp_destroy_qp_responder(struct qed_hwfn *p_hwfn, goto err; *num_invalidated_mw = le32_to_cpu(p_ramrod_res->num_invalidated_mw); + *cq_prod = le32_to_cpu(p_ramrod_res->cq_prod); + qp->cq_prod = *cq_prod; /* Free IRQ - only if ramrod succeeded, in case FW is still using it */ dma_free_coherent(&p_hwfn->cdev->pdev->dev, @@ -1827,10 +1944,8 @@ static int qed_roce_query_qp(struct qed_hwfn *p_hwfn, out_params->draining = false; - if (rq_err_state) + if (rq_err_state || sq_err_state) qp->cur_state = QED_ROCE_QP_STATE_ERR; - else if (sq_err_state) - qp->cur_state = QED_ROCE_QP_STATE_SQE; else if (sq_draining) out_params->draining = true; out_params->state = qp->cur_state; @@ -1849,10 +1964,9 @@ err_resp: static int qed_roce_destroy_qp(struct qed_hwfn *p_hwfn, struct qed_rdma_qp *qp) { - struct qed_rdma_info *p_rdma_info = p_hwfn->p_rdma_info; u32 num_invalidated_mw = 0; u32 num_bound_mw = 0; - u32 start_cid; + u32 cq_prod; int 
rc; /* Destroys the specified QP */ @@ -1866,7 +1980,8 @@ static int qed_roce_destroy_qp(struct qed_hwfn *p_hwfn, struct qed_rdma_qp *qp) if (qp->cur_state != QED_ROCE_QP_STATE_RESET) { rc = qed_roce_sp_destroy_qp_responder(p_hwfn, qp, - &num_invalidated_mw); + &num_invalidated_mw, + &cq_prod); if (rc) return rc; @@ -1881,21 +1996,6 @@ static int qed_roce_destroy_qp(struct qed_hwfn *p_hwfn, struct qed_rdma_qp *qp) "number of invalidate memory windows is different from bounded ones\n"); return -EINVAL; } - - spin_lock_bh(&p_rdma_info->lock); - - start_cid = qed_cxt_get_proto_cid_start(p_hwfn, - p_rdma_info->proto); - - /* Release responder's icid */ - qed_bmap_release_id(p_hwfn, &p_rdma_info->cid_map, - qp->icid - start_cid); - - /* Release requester's icid */ - qed_bmap_release_id(p_hwfn, &p_rdma_info->cid_map, - qp->icid + 1 - start_cid); - - spin_unlock_bh(&p_rdma_info->lock); } return 0; @@ -2110,12 +2210,19 @@ static int qed_roce_modify_qp(struct qed_hwfn *p_hwfn, return rc; } else if (qp->cur_state == QED_ROCE_QP_STATE_RESET) { /* Any state -> RESET */ + u32 cq_prod; + + /* Send destroy responder ramrod */ + rc = qed_roce_sp_destroy_qp_responder(p_hwfn, + qp, + &num_invalidated_mw, + &cq_prod); - rc = qed_roce_sp_destroy_qp_responder(p_hwfn, qp, - &num_invalidated_mw); if (rc) return rc; + qp->cq_prod = cq_prod; + rc = qed_roce_sp_destroy_qp_requester(p_hwfn, qp, &num_bound_mw); @@ -2454,6 +2561,31 @@ static int qed_rdma_deregister_tid(void *rdma_cxt, u32 itid) return rc; } +static void qed_roce_free_real_icid(struct qed_hwfn *p_hwfn, u16 icid) +{ + struct qed_rdma_info *p_rdma_info = p_hwfn->p_rdma_info; + u32 start_cid, cid, xcid; + + /* an even icid belongs to a responder while an odd icid belongs to a + * requester. The 'cid' received as an input can be either. We calculate + * the "partner" icid and call it xcid. Only if both are free then the + * "cid" map can be cleared. 
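+	 * The xcid = cid ^ 1 step works because responder and requester
+	 * icids are always allocated as an even/odd pair (qed_roce_free_cid_pair()
+	 * above releases cid and cid + 1), so flipping the lowest bit maps
+	 * each icid to its partner.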
+ */ + start_cid = qed_cxt_get_proto_cid_start(p_hwfn, p_rdma_info->proto); + cid = icid - start_cid; + xcid = cid ^ 1; + + spin_lock_bh(&p_rdma_info->lock); + + qed_bmap_release_id(p_hwfn, &p_rdma_info->real_cid_map, cid); + if (qed_bmap_test_id(p_hwfn, &p_rdma_info->real_cid_map, xcid) == 0) { + qed_bmap_release_id(p_hwfn, &p_rdma_info->cid_map, cid); + qed_bmap_release_id(p_hwfn, &p_rdma_info->cid_map, xcid); + } + + spin_unlock_bh(&p_hwfn->p_rdma_info->lock); +} + static void *qed_rdma_get_rdma_ctx(struct qed_dev *cdev) { return QED_LEADING_HWFN(cdev); @@ -2773,7 +2905,7 @@ static int qed_roce_ll2_tx(struct qed_dev *cdev, : QED_LL2_RROCE; if (pkt->roce_mode == ROCE_V2_IPV4) - flags |= BIT(CORE_TX_BD_FLAGS_IP_CSUM_SHIFT); + flags |= BIT(CORE_TX_BD_DATA_IP_CSUM_SHIFT); /* Tx header */ rc = qed_ll2_prepare_tx_packet(QED_LEADING_HWFN(cdev), roce_ll2->handle, diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.h b/drivers/net/ethernet/qlogic/qed/qed_roce.h index 36cf4b2ab7fa..3ccc08a7c995 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_roce.h +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.h @@ -82,6 +82,7 @@ struct qed_rdma_info { struct qed_bmap qp_map; struct qed_bmap srq_map; struct qed_bmap cid_map; + struct qed_bmap real_cid_map; struct qed_bmap dpi_map; struct qed_bmap toggle_bits; struct qed_rdma_events events; @@ -92,6 +93,7 @@ struct qed_rdma_info { u32 num_qps; u32 num_mrs; u16 queue_zone_base; + u16 max_queue_zones; enum protocol_type proto; }; @@ -153,6 +155,7 @@ struct qed_rdma_qp { dma_addr_t irq_phys_addr; u8 irq_num_pages; bool resp_offloaded; + u32 cq_prod; u8 remote_mac_addr[6]; u8 local_mac_addr[6]; @@ -163,8 +166,8 @@ struct qed_rdma_qp { #if IS_ENABLED(CONFIG_QED_RDMA) void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); -void qed_async_roce_event(struct qed_hwfn *p_hwfn, - struct event_ring_entry *p_eqe); +void qed_roce_async_event(struct qed_hwfn *p_hwfn, + u8 fw_event_code, union rdma_eqe_data *rdma_data); void qed_ll2b_complete_tx_gsi_packet(struct qed_hwfn *p_hwfn, u8 connection_handle, void *cookie, @@ -187,7 +190,9 @@ void qed_ll2b_complete_rx_gsi_packet(struct qed_hwfn *p_hwfn, u16 src_mac_addr_lo, bool b_last_packet); #else static inline void qed_rdma_dpm_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) {} -static inline void qed_async_roce_event(struct qed_hwfn *p_hwfn, struct event_ring_entry *p_eqe) {} +static inline void qed_roce_async_event(struct qed_hwfn *p_hwfn, + u8 fw_event_code, + union rdma_eqe_data *rdma_data) {} static inline void qed_ll2b_complete_tx_gsi_packet(struct qed_hwfn *p_hwfn, u8 connection_handle, void *cookie, diff --git a/drivers/net/ethernet/qlogic/qed/qed_spq.c b/drivers/net/ethernet/qlogic/qed/qed_spq.c index 645328a9f0cf..54fbe3789cf3 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_spq.c +++ b/drivers/net/ethernet/qlogic/qed/qed_spq.c @@ -296,9 +296,12 @@ qed_async_event_completion(struct qed_hwfn *p_hwfn, struct event_ring_entry *p_eqe) { switch (p_eqe->protocol_id) { +#if IS_ENABLED(CONFIG_QED_RDMA) case PROTOCOLID_ROCE: - qed_async_roce_event(p_hwfn, p_eqe); + qed_roce_async_event(p_hwfn, p_eqe->opcode, + &p_eqe->data.rdma_data); return 0; +#endif case PROTOCOLID_COMMON: return qed_sriov_eqe_event(p_hwfn, p_eqe->opcode, @@ -306,14 +309,6 @@ qed_async_event_completion(struct qed_hwfn *p_hwfn, case PROTOCOLID_ISCSI: if (!IS_ENABLED(CONFIG_QED_ISCSI)) return -EINVAL; - if (p_eqe->opcode == ISCSI_EVENT_TYPE_ASYN_DELETE_OOO_ISLES) { - u32 cid = le32_to_cpu(p_eqe->data.iscsi_info.cid); - - 
qed_ooo_release_connection_isles(p_hwfn, - p_hwfn->p_ooo_info, - cid); - return 0; - } if (p_hwfn->p_iscsi_info->event_cb) { struct qed_iscsi_info *p_iscsi = p_hwfn->p_iscsi_info; diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h index f2aaef2cfb86..8d02fb6c19d7 100644 --- a/drivers/net/ethernet/qlogic/qede/qede.h +++ b/drivers/net/ethernet/qlogic/qede/qede.h @@ -50,7 +50,7 @@ #define QEDE_MAJOR_VERSION 8 #define QEDE_MINOR_VERSION 10 #define QEDE_REVISION_VERSION 10 -#define QEDE_ENGINEERING_VERSION 20 +#define QEDE_ENGINEERING_VERSION 21 #define DRV_MODULE_VERSION __stringify(QEDE_MAJOR_VERSION) "." \ __stringify(QEDE_MINOR_VERSION) "." \ __stringify(QEDE_REVISION_VERSION) "." \ diff --git a/drivers/scsi/qedf/Makefile b/drivers/scsi/qedf/Makefile index 64e9f507ce32..414f2a772a5f 100644 --- a/drivers/scsi/qedf/Makefile +++ b/drivers/scsi/qedf/Makefile @@ -1,5 +1,5 @@ obj-$(CONFIG_QEDF) := qedf.o qedf-y = qedf_dbg.o qedf_main.o qedf_io.o qedf_fip.o \ - qedf_attr.o qedf_els.o + qedf_attr.o qedf_els.o drv_scsi_fw_funcs.o drv_fcoe_fw_funcs.o qedf-$(CONFIG_DEBUG_FS) += qedf_debugfs.o diff --git a/drivers/scsi/qedf/drv_fcoe_fw_funcs.c b/drivers/scsi/qedf/drv_fcoe_fw_funcs.c new file mode 100644 index 000000000000..bb812db48da6 --- /dev/null +++ b/drivers/scsi/qedf/drv_fcoe_fw_funcs.c @@ -0,0 +1,190 @@ +/* QLogic FCoE Offload Driver + * Copyright (c) 2016 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ +#include "drv_fcoe_fw_funcs.h" +#include "drv_scsi_fw_funcs.h" + +#define FCOE_RX_ID ((u32)0x0000FFFF) + +static inline void init_common_sqe(struct fcoe_task_params *task_params, + enum fcoe_sqe_request_type request_type) +{ + memset(task_params->sqe, 0, sizeof(*(task_params->sqe))); + SET_FIELD(task_params->sqe->flags, FCOE_WQE_REQ_TYPE, + request_type); + task_params->sqe->task_id = task_params->itid; +} + +int init_initiator_rw_fcoe_task(struct fcoe_task_params *task_params, + struct scsi_sgl_task_params *sgl_task_params, + struct regpair sense_data_buffer_phys_addr, + u32 task_retry_id, + u8 fcp_cmd_payload[32]) +{ + struct fcoe_task_context *ctx = task_params->context; + struct ystorm_fcoe_task_st_ctx *y_st_ctx; + struct tstorm_fcoe_task_st_ctx *t_st_ctx; + struct ustorm_fcoe_task_ag_ctx *u_ag_ctx; + struct mstorm_fcoe_task_st_ctx *m_st_ctx; + u32 io_size, val; + bool slow_sgl; + + memset(ctx, 0, sizeof(*(ctx))); + slow_sgl = scsi_is_slow_sgl(sgl_task_params->num_sges, + sgl_task_params->small_mid_sge); + io_size = (task_params->task_type == FCOE_TASK_TYPE_WRITE_INITIATOR ? + task_params->tx_io_size : task_params->rx_io_size); + + /* Ystorm ctx */ + y_st_ctx = &ctx->ystorm_st_context; + y_st_ctx->data_2_trns_rem = cpu_to_le32(io_size); + y_st_ctx->task_rety_identifier = cpu_to_le32(task_retry_id); + y_st_ctx->task_type = task_params->task_type; + memcpy(&y_st_ctx->tx_info_union.fcp_cmd_payload, + fcp_cmd_payload, sizeof(struct fcoe_fcp_cmd_payload)); + + /* Tstorm ctx */ + t_st_ctx = &ctx->tstorm_st_context; + t_st_ctx->read_only.dev_type = (task_params->is_tape_device == 1 ? 
+ FCOE_TASK_DEV_TYPE_TAPE : + FCOE_TASK_DEV_TYPE_DISK); + t_st_ctx->read_only.cid = cpu_to_le32(task_params->conn_cid); + val = cpu_to_le32(task_params->cq_rss_number); + t_st_ctx->read_only.glbl_q_num = val; + t_st_ctx->read_only.fcp_cmd_trns_size = cpu_to_le32(io_size); + t_st_ctx->read_only.task_type = task_params->task_type; + SET_FIELD(t_st_ctx->read_write.flags, + FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME, 1); + t_st_ctx->read_write.rx_id = cpu_to_le32(FCOE_RX_ID); + + /* Ustorm ctx */ + u_ag_ctx = &ctx->ustorm_ag_context; + u_ag_ctx->global_cq_num = cpu_to_le32(task_params->cq_rss_number); + + /* Mstorm buffer for sense/rsp data placement */ + m_st_ctx = &ctx->mstorm_st_context; + val = cpu_to_le32(sense_data_buffer_phys_addr.hi); + m_st_ctx->rsp_buf_addr.hi = val; + val = cpu_to_le32(sense_data_buffer_phys_addr.lo); + m_st_ctx->rsp_buf_addr.lo = val; + + if (task_params->task_type == FCOE_TASK_TYPE_WRITE_INITIATOR) { + /* Ystorm ctx */ + y_st_ctx->expect_first_xfer = 1; + + /* Set the amount of super SGEs. Can be up to 4. */ + SET_FIELD(y_st_ctx->sgl_mode, + YSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE, + (slow_sgl ? SCSI_TX_SLOW_SGL : SCSI_FAST_SGL)); + init_scsi_sgl_context(&y_st_ctx->sgl_params, + &y_st_ctx->data_desc, + sgl_task_params); + + /* Mstorm ctx */ + SET_FIELD(m_st_ctx->flags, + MSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE, + (slow_sgl ? SCSI_TX_SLOW_SGL : SCSI_FAST_SGL)); + } else { + /* Tstorm ctx */ + SET_FIELD(t_st_ctx->read_write.flags, + FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE, + (slow_sgl ? SCSI_TX_SLOW_SGL : SCSI_FAST_SGL)); + + /* Mstorm ctx */ + m_st_ctx->data_2_trns_rem = cpu_to_le32(io_size); + init_scsi_sgl_context(&m_st_ctx->sgl_params, + &m_st_ctx->data_desc, + sgl_task_params); + } + + init_common_sqe(task_params, SEND_FCOE_CMD); + return 0; +} + +int init_initiator_midpath_unsolicited_fcoe_task( + struct fcoe_task_params *task_params, + struct fcoe_tx_mid_path_params *mid_path_fc_header, + struct scsi_sgl_task_params *tx_sgl_task_params, + struct scsi_sgl_task_params *rx_sgl_task_params, + u8 fw_to_place_fc_header) +{ + struct fcoe_task_context *ctx = task_params->context; + struct ystorm_fcoe_task_st_ctx *y_st_ctx; + struct tstorm_fcoe_task_st_ctx *t_st_ctx; + struct ustorm_fcoe_task_ag_ctx *u_ag_ctx; + struct mstorm_fcoe_task_st_ctx *m_st_ctx; + u32 val; + + memset(ctx, 0, sizeof(*(ctx))); + + /* Init Ystorm */ + y_st_ctx = &ctx->ystorm_st_context; + init_scsi_sgl_context(&y_st_ctx->sgl_params, + &y_st_ctx->data_desc, + tx_sgl_task_params); + SET_FIELD(y_st_ctx->sgl_mode, + YSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE, SCSI_FAST_SGL); + y_st_ctx->data_2_trns_rem = cpu_to_le32(task_params->tx_io_size); + y_st_ctx->task_type = task_params->task_type; + memcpy(&y_st_ctx->tx_info_union.tx_params.mid_path, + mid_path_fc_header, sizeof(struct fcoe_tx_mid_path_params)); + + /* Init Mstorm */ + m_st_ctx = &ctx->mstorm_st_context; + init_scsi_sgl_context(&m_st_ctx->sgl_params, + &m_st_ctx->data_desc, + rx_sgl_task_params); + SET_FIELD(m_st_ctx->flags, + MSTORM_FCOE_TASK_ST_CTX_MP_INCLUDE_FC_HEADER, + fw_to_place_fc_header); + m_st_ctx->data_2_trns_rem = cpu_to_le32(task_params->rx_io_size); + + /* Init Tstorm */ + t_st_ctx = &ctx->tstorm_st_context; + t_st_ctx->read_only.cid = cpu_to_le32(task_params->conn_cid); + val = cpu_to_le32(task_params->cq_rss_number); + t_st_ctx->read_only.glbl_q_num = val; + t_st_ctx->read_only.task_type = task_params->task_type; + SET_FIELD(t_st_ctx->read_write.flags, + FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME, 1); + 
t_st_ctx->read_write.rx_id = cpu_to_le32(FCOE_RX_ID);
+
+	/* Init Ustorm */
+	u_ag_ctx = &ctx->ustorm_ag_context;
+	u_ag_ctx->global_cq_num = cpu_to_le32(task_params->cq_rss_number);
+
+	/* Init SQE */
+	init_common_sqe(task_params, SEND_FCOE_MIDPATH);
+	task_params->sqe->additional_info_union.burst_length =
+	    tx_sgl_task_params->total_buffer_size;
+	SET_FIELD(task_params->sqe->flags,
+		  FCOE_WQE_NUM_SGES, tx_sgl_task_params->num_sges);
+	SET_FIELD(task_params->sqe->flags, FCOE_WQE_SGL_MODE,
+		  SCSI_FAST_SGL);
+
+	return 0;
+}
+
+int init_initiator_abort_fcoe_task(struct fcoe_task_params *task_params)
+{
+	init_common_sqe(task_params, SEND_FCOE_ABTS_REQUEST);
+	return 0;
+}
+
+int init_initiator_cleanup_fcoe_task(struct fcoe_task_params *task_params)
+{
+	init_common_sqe(task_params, FCOE_EXCHANGE_CLEANUP);
+	return 0;
+}
+
+int init_initiator_sequence_recovery_fcoe_task(
+	struct fcoe_task_params *task_params, u32 off)
+{
+	init_common_sqe(task_params, FCOE_SEQUENCE_RECOVERY);
+	task_params->sqe->additional_info_union.seq_rec_updated_offset = off;
+	return 0;
+}
diff --git a/drivers/scsi/qedf/drv_fcoe_fw_funcs.h b/drivers/scsi/qedf/drv_fcoe_fw_funcs.h
new file mode 100644
index 000000000000..617529b058f4
--- /dev/null
+++ b/drivers/scsi/qedf/drv_fcoe_fw_funcs.h
@@ -0,0 +1,93 @@
+/* QLogic FCoE Offload Driver
+ * Copyright (c) 2016 Cavium Inc.
+ *
+ * This software is available under the terms of the GNU General Public License
+ * (GPL) Version 2, available from the file COPYING in the main directory of
+ * this source tree.
+ */
+#ifndef _FCOE_FW_FUNCS_H
+#define _FCOE_FW_FUNCS_H
+#include "drv_scsi_fw_funcs.h"
+#include "qedf_hsi.h"
+#include
+
+struct fcoe_task_params {
+	/* Output parameter [set/filled by the HSI function] */
+	struct fcoe_task_context *context;
+
+	/* Output parameter [set/filled by the HSI function] */
+	struct fcoe_wqe *sqe;
+	enum fcoe_task_type task_type;
+	u32 tx_io_size;	/* in bytes */
+	u32 rx_io_size;	/* in bytes */
+	u32 conn_cid;
+	u16 itid;
+	u8 cq_rss_number;
+
+	/* Whether it's Tape device or not (0=Disk, 1=Tape) */
+	u8 is_tape_device;
+};
+
+/**
+ * @brief init_initiator_rw_fcoe_task - Initializes FCoE task context for
+ * read/write task types and inits the fcoe_sqe
+ *
+ * @param task_params - Pointer to task parameters struct
+ * @param sgl_task_params - Pointer to SGL task params
+ * @param sense_data_buffer_phys_addr - Pointer to sense data buffer
+ * @param task_retry_id - retry identification - Used only for Tape device
+ * @param fcp_cmd_payload - FCP CMD Payload
+ */
+int init_initiator_rw_fcoe_task(struct fcoe_task_params *task_params,
+	struct scsi_sgl_task_params *sgl_task_params,
+	struct regpair sense_data_buffer_phys_addr,
+	u32 task_retry_id,
+	u8 fcp_cmd_payload[32]);
+
+/**
+ * @brief init_initiator_midpath_unsolicited_fcoe_task - Initializes FCoE task
+ * context for midpath/unsolicited task types and inits the fcoe_sqe
+ *
+ * @param task_params - Pointer to task parameters struct
+ * @param mid_path_fc_header - FC header
+ * @param tx_sgl_task_params - Pointer to Tx SGL task params
+ * @param rx_sgl_task_params - Pointer to Rx SGL task params
+ * @param fw_to_place_fc_header - Indication whether the FW should place the
+ * FC header in addition to the data.
+ */
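A condensed usage sketch (not part of the patch) may help here: this is how a
caller drives the midpath helper, mirroring what qedf_init_mp_task() does
later in this series. The local names (hdr, tx_sgl, rx_sgl) are illustrative
only, and the SGL physical addresses are omitted for brevity.

	struct fcoe_tx_mid_path_params hdr = { 0 };
	struct scsi_sgl_task_params tx_sgl = { 0 };
	struct scsi_sgl_task_params rx_sgl = { 0 };

	/* FC header fields the firmware will transmit for the request */
	hdr.r_ctl = fc_hdr->fh_r_ctl;
	hdr.ox_id = fc_hdr->fh_ox_id;
	hdr.rx_id = fc_hdr->fh_rx_id;

	/* One request SGE and one response SGE, as qedf uses */
	tx_sgl.sgl = mp_req->mp_req_bd;
	tx_sgl.num_sges = 1;
	tx_sgl.total_buffer_size = io_req->data_xfer_len;

	rx_sgl.sgl = mp_req->mp_resp_bd;
	rx_sgl.num_sges = 1;
	rx_sgl.total_buffer_size = PAGE_SIZE;

	/* Last argument 0: the FW is not asked to place the FC header */
	init_initiator_midpath_unsolicited_fcoe_task(io_req->task_params,
						     &hdr, &tx_sgl,
						     &rx_sgl, 0);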
+int init_initiator_midpath_unsolicited_fcoe_task(
+	struct fcoe_task_params *task_params,
+	struct fcoe_tx_mid_path_params *mid_path_fc_header,
+	struct scsi_sgl_task_params *tx_sgl_task_params,
+	struct scsi_sgl_task_params *rx_sgl_task_params,
+	u8 fw_to_place_fc_header);
+
+/**
+ * @brief init_initiator_abort_fcoe_task - Initializes FCoE task context for
+ * abort task types and inits the fcoe_sqe
+ *
+ * @param task_params - Pointer to task parameters struct
+ */
+int init_initiator_abort_fcoe_task(struct fcoe_task_params *task_params);
+
+/**
+ * @brief init_initiator_cleanup_fcoe_task - Initializes FCoE task context for
+ * cleanup task types and inits the fcoe_sqe
+ *
+ * @param task_params - Pointer to task parameters struct
+ */
+int init_initiator_cleanup_fcoe_task(struct fcoe_task_params *task_params);
+
+/**
+ * @brief init_initiator_sequence_recovery_fcoe_task - Initializes FCoE task
+ * context for sequence recovery task types and inits the fcoe_sqe
+ *
+ * @param task_params - Pointer to task parameters struct
+ * @param desired_offset - The desired offset the task will be re-sent from
+ */
+int init_initiator_sequence_recovery_fcoe_task(
+	struct fcoe_task_params *task_params,
+	u32 desired_offset);
+#endif
diff --git a/drivers/scsi/qedf/drv_scsi_fw_funcs.c b/drivers/scsi/qedf/drv_scsi_fw_funcs.c
new file mode 100644
index 000000000000..11e0cc082ec0
--- /dev/null
+++ b/drivers/scsi/qedf/drv_scsi_fw_funcs.c
@@ -0,0 +1,44 @@
+/* QLogic FCoE Offload Driver
+ * Copyright (c) 2016 Cavium Inc.
+ *
+ * This software is available under the terms of the GNU General Public License
+ * (GPL) Version 2, available from the file COPYING in the main directory of
+ * this source tree.
+ */
+#include "drv_scsi_fw_funcs.h"
+
+#define SCSI_NUM_SGES_IN_CACHE 0x4
+
+bool scsi_is_slow_sgl(u16 num_sges, bool small_mid_sge)
+{
+	return (num_sges > SCSI_NUM_SGES_SLOW_SGL_THR && small_mid_sge);
+}
+
+void init_scsi_sgl_context(struct scsi_sgl_params *ctx_sgl_params,
+			   struct scsi_cached_sges *ctx_data_desc,
+			   struct scsi_sgl_task_params *sgl_task_params)
+{
+	/* no need to check for sgl_task_params->sgl validity */
+	u8 num_sges_to_init = sgl_task_params->num_sges >
+			      SCSI_NUM_SGES_IN_CACHE ? SCSI_NUM_SGES_IN_CACHE :
+			      sgl_task_params->num_sges;
+	u8 sge_index;
+	u32 val;
+
+	val = cpu_to_le32(sgl_task_params->sgl_phys_addr.lo);
+	ctx_sgl_params->sgl_addr.lo = val;
+	val = cpu_to_le32(sgl_task_params->sgl_phys_addr.hi);
+	ctx_sgl_params->sgl_addr.hi = val;
+	val = cpu_to_le32(sgl_task_params->total_buffer_size);
+	ctx_sgl_params->sgl_total_length = val;
+	ctx_sgl_params->sgl_num_sges = cpu_to_le16(sgl_task_params->num_sges);
+
+	for (sge_index = 0; sge_index < num_sges_to_init; sge_index++) {
+		val = cpu_to_le32(sgl_task_params->sgl[sge_index].sge_addr.lo);
+		ctx_data_desc->sge[sge_index].sge_addr.lo = val;
+		val = cpu_to_le32(sgl_task_params->sgl[sge_index].sge_addr.hi);
+		ctx_data_desc->sge[sge_index].sge_addr.hi = val;
+		val = cpu_to_le32(sgl_task_params->sgl[sge_index].sge_len);
+		ctx_data_desc->sge[sge_index].sge_len = val;
+	}
+}
diff --git a/drivers/scsi/qedf/drv_scsi_fw_funcs.h b/drivers/scsi/qedf/drv_scsi_fw_funcs.h
new file mode 100644
index 000000000000..9cb45410bc45
--- /dev/null
+++ b/drivers/scsi/qedf/drv_scsi_fw_funcs.h
@@ -0,0 +1,85 @@
+/* QLogic FCoE Offload Driver
+ * Copyright (c) 2016 Cavium Inc.
+ *
+ * This software is available under the terms of the GNU General Public License
+ * (GPL) Version 2, available from the file COPYING in the main directory of
+ * this source tree.
+ */ +#ifndef _SCSI_FW_FUNCS_H +#define _SCSI_FW_FUNCS_H +#include +#include +#include + +struct scsi_sgl_task_params { + struct scsi_sge *sgl; + struct regpair sgl_phys_addr; + u32 total_buffer_size; + u16 num_sges; + + /* true if SGL contains a small (< 4KB) SGE in middle(not 1st or last) + * -> relevant for tx only + */ + bool small_mid_sge; +}; + +struct scsi_dif_task_params { + u32 initial_ref_tag; + bool initial_ref_tag_is_valid; + u16 application_tag; + u16 application_tag_mask; + u16 dif_block_size_log; + bool dif_on_network; + bool dif_on_host; + u8 host_guard_type; + u8 protection_type; + u8 ref_tag_mask; + bool crc_seed; + + /* Enable Connection error upon DIF error (segments with DIF errors are + * dropped) + */ + bool tx_dif_conn_err_en; + bool ignore_app_tag; + bool keep_ref_tag_const; + bool validate_guard; + bool validate_app_tag; + bool validate_ref_tag; + bool forward_guard; + bool forward_app_tag; + bool forward_ref_tag; + bool forward_app_tag_with_mask; + bool forward_ref_tag_with_mask; +}; + +struct scsi_initiator_cmd_params { + /* for cdb_size > default CDB size (extended CDB > 16 bytes) -> + * pointer to the CDB buffer SGE + */ + struct scsi_sge extended_cdb_sge; + + /* Physical address of sense data buffer for sense data - 256B buffer */ + struct regpair sense_data_buffer_phys_addr; +}; + +/** + * @brief scsi_is_slow_sgl - checks for slow SGL + * + * @param num_sges - number of sges in SGL + * @param small_mid_sge - True is the SGL contains an SGE which is smaller than + * 4KB and its not the 1st or last SGE in the SGL + */ +bool scsi_is_slow_sgl(u16 num_sges, bool small_mid_sge); + +/** + * @brief init_scsi_sgl_context - initializes SGL task context + * + * @param sgl_params - SGL context parameters to initialize (output parameter) + * @param data_desc - context struct containing SGEs array to set (output + * parameter) + * @param sgl_task_params - SGL parameters (input) + */ +void init_scsi_sgl_context(struct scsi_sgl_params *sgl_params, + struct scsi_cached_sges *ctx_data_desc, + struct scsi_sgl_task_params *sgl_task_params); +#endif diff --git a/drivers/scsi/qedf/qedf.h b/drivers/scsi/qedf/qedf.h index 96346a1b1515..40aeb6bb96a2 100644 --- a/drivers/scsi/qedf/qedf.h +++ b/drivers/scsi/qedf/qedf.h @@ -26,6 +26,7 @@ #include #include "qedf_version.h" #include "qedf_dbg.h" +#include "drv_fcoe_fw_funcs.h" /* Helpers to extract upper and lower 32-bits of pointer */ #define U64_HI(val) ((u32)(((u64)(val)) >> 32)) @@ -59,19 +60,17 @@ #define UPSTREAM_KEEP 1 struct qedf_mp_req { - uint8_t tm_flags; - uint32_t req_len; void *req_buf; dma_addr_t req_buf_dma; - struct fcoe_sge *mp_req_bd; + struct scsi_sge *mp_req_bd; dma_addr_t mp_req_bd_dma; struct fc_frame_header req_fc_hdr; uint32_t resp_len; void *resp_buf; dma_addr_t resp_buf_dma; - struct fcoe_sge *mp_resp_bd; + struct scsi_sge *mp_resp_bd; dma_addr_t mp_resp_bd_dma; struct fc_frame_header resp_fc_hdr; }; @@ -119,6 +118,7 @@ struct qedf_ioreq { #define QEDF_CMD_IN_CLEANUP 0x2 #define QEDF_CMD_SRR_SENT 0x3 u8 io_req_flags; + uint8_t tm_flags; struct qedf_rport *fcport; unsigned long flags; enum qedf_ioreq_event event; @@ -130,6 +130,8 @@ struct qedf_ioreq { struct completion tm_done; struct completion abts_done; struct fcoe_task_context *task; + struct fcoe_task_params *task_params; + struct scsi_sgl_task_params *sgl_task_params; int idx; /* * Need to allocate enough room for both sense data and FCP response data @@ -199,8 +201,8 @@ struct qedf_rport { dma_addr_t sq_pbl_dma; u32 sq_pbl_size; u32 sid; -#define 
QEDF_RPORT_TYPE_DISK 1 -#define QEDF_RPORT_TYPE_TAPE 2 +#define QEDF_RPORT_TYPE_DISK 0 +#define QEDF_RPORT_TYPE_TAPE 1 uint dev_type; /* Disk or tape */ struct list_head peers; }; @@ -391,7 +393,7 @@ struct qedf_ctx { struct io_bdt { struct qedf_ioreq *io_req; - struct fcoe_sge *bd_tbl; + struct scsi_sge *bd_tbl; dma_addr_t bd_tbl_dma; u16 bd_valid; }; @@ -400,7 +402,7 @@ struct qedf_cmd_mgr { struct qedf_ctx *qedf; u16 idx; struct io_bdt **io_bdt_pool; -#define FCOE_PARAMS_NUM_TASKS 4096 +#define FCOE_PARAMS_NUM_TASKS 2048 struct qedf_ioreq cmds[FCOE_PARAMS_NUM_TASKS]; spinlock_t lock; atomic_t free_list_cnt; @@ -465,9 +467,8 @@ extern void qedf_cmd_timer_set(struct qedf_ctx *qedf, struct qedf_ioreq *io_req, unsigned int timer_msec); extern int qedf_init_mp_req(struct qedf_ioreq *io_req); extern void qedf_init_mp_task(struct qedf_ioreq *io_req, - struct fcoe_task_context *task_ctx); -extern void qedf_add_to_sq(struct qedf_rport *fcport, u16 xid, - u32 ptu_invalidate, enum fcoe_task_type req_type, u32 offset); + struct fcoe_task_context *task_ctx, struct fcoe_wqe *wqe); +extern u16 qedf_get_sqe_idx(struct qedf_rport *fcport); extern void qedf_ring_doorbell(struct qedf_rport *fcport); extern void qedf_process_els_compl(struct qedf_ctx *qedf, struct fcoe_cqe *cqe, struct qedf_ioreq *els_req); diff --git a/drivers/scsi/qedf/qedf_els.c b/drivers/scsi/qedf/qedf_els.c index 59f3e5c73a13..c505d41f6dc8 100644 --- a/drivers/scsi/qedf/qedf_els.c +++ b/drivers/scsi/qedf/qedf_els.c @@ -25,6 +25,9 @@ static int qedf_initiate_els(struct qedf_rport *fcport, unsigned int op, uint16_t xid; uint32_t start_time = jiffies / HZ; uint32_t current_time; + struct fcoe_wqe *sqe; + unsigned long flags; + u16 sqe_idx; QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, "Sending ELS\n"); @@ -113,20 +116,25 @@ retry_els: /* Obtain exchange id */ xid = els_req->xid; + spin_lock_irqsave(&fcport->rport_lock, flags); + + sqe_idx = qedf_get_sqe_idx(fcport); + sqe = &fcport->sq[sqe_idx]; + memset(sqe, 0, sizeof(struct fcoe_wqe)); + /* Initialize task context for this IO request */ task = qedf_get_task_mem(&qedf->tasks, xid); - qedf_init_mp_task(els_req, task); + qedf_init_mp_task(els_req, task, sqe); /* Put timer on original I/O request */ if (timer_msec) qedf_cmd_timer_set(qedf, els_req, timer_msec); - qedf_add_to_sq(fcport, xid, 0, FCOE_TASK_TYPE_MIDPATH, 0); - /* Ring doorbell */ QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_ELS, "Ringing doorbell for ELS " "req\n"); qedf_ring_doorbell(fcport); + spin_unlock_irqrestore(&fcport->rport_lock, flags); els_err: return rc; } @@ -604,6 +612,8 @@ static void qedf_initiate_seq_cleanup(struct qedf_ioreq *orig_io_req, struct qedf_rport *fcport; unsigned long flags; struct qedf_els_cb_arg *cb_arg; + struct fcoe_wqe *sqe; + u16 sqe_idx; fcport = orig_io_req->fcport; @@ -631,8 +641,13 @@ static void qedf_initiate_seq_cleanup(struct qedf_ioreq *orig_io_req, spin_lock_irqsave(&fcport->rport_lock, flags); - qedf_add_to_sq(fcport, orig_io_req->xid, 0, - FCOE_TASK_TYPE_SEQUENCE_CLEANUP, offset); + sqe_idx = qedf_get_sqe_idx(fcport); + sqe = &fcport->sq[sqe_idx]; + memset(sqe, 0, sizeof(struct fcoe_wqe)); + orig_io_req->task_params->sqe = sqe; + + init_initiator_sequence_recovery_fcoe_task(orig_io_req->task_params, + offset); qedf_ring_doorbell(fcport); spin_unlock_irqrestore(&fcport->rport_lock, flags); diff --git a/drivers/scsi/qedf/qedf_io.c b/drivers/scsi/qedf/qedf_io.c index ee0dcf9d3aba..af2294635ab2 100644 --- a/drivers/scsi/qedf/qedf_io.c +++ b/drivers/scsi/qedf/qedf_io.c @@ -96,7 +96,7 @@ void 
qedf_cmd_mgr_free(struct qedf_cmd_mgr *cmgr) if (!cmgr->io_bdt_pool) goto free_cmd_pool; - bd_tbl_sz = QEDF_MAX_BDS_PER_CMD * sizeof(struct fcoe_sge); + bd_tbl_sz = QEDF_MAX_BDS_PER_CMD * sizeof(struct scsi_sge); for (i = 0; i < num_ios; i++) { bdt_info = cmgr->io_bdt_pool[i]; if (bdt_info->bd_tbl) { @@ -119,6 +119,8 @@ free_cmd_pool: for (i = 0; i < num_ios; i++) { io_req = &cmgr->cmds[i]; + kfree(io_req->sgl_task_params); + kfree(io_req->task_params); /* Make sure we free per command sense buffer */ if (io_req->sense_buffer) dma_free_coherent(&qedf->pdev->dev, @@ -178,7 +180,7 @@ struct qedf_cmd_mgr *qedf_cmd_mgr_alloc(struct qedf_ctx *qedf) spin_lock_init(&cmgr->lock); /* - * Initialize list of qedf_ioreq. + * Initialize I/O request fields. */ xid = QEDF_MIN_XID; @@ -196,6 +198,29 @@ struct qedf_cmd_mgr *qedf_cmd_mgr_alloc(struct qedf_ctx *qedf) GFP_KERNEL); if (!io_req->sense_buffer) goto mem_err; + + /* Allocate task parameters to pass to f/w init funcions */ + io_req->task_params = kzalloc(sizeof(*io_req->task_params), + GFP_KERNEL); + if (!io_req->task_params) { + QEDF_ERR(&(qedf->dbg_ctx), + "Failed to allocate task_params for xid=0x%x\n", + i); + goto mem_err; + } + + /* + * Allocate scatter/gather list info to pass to f/w init + * functions. + */ + io_req->sgl_task_params = kzalloc( + sizeof(struct scsi_sgl_task_params), GFP_KERNEL); + if (!io_req->sgl_task_params) { + QEDF_ERR(&(qedf->dbg_ctx), + "Failed to allocate sgl_task_params for xid=0x%x\n", + i); + goto mem_err; + } } /* Allocate pool of io_bdts - one for each qedf_ioreq */ @@ -211,8 +236,8 @@ struct qedf_cmd_mgr *qedf_cmd_mgr_alloc(struct qedf_ctx *qedf) cmgr->io_bdt_pool[i] = kmalloc(sizeof(struct io_bdt), GFP_KERNEL); if (!cmgr->io_bdt_pool[i]) { - QEDF_WARN(&(qedf->dbg_ctx), "Failed to alloc " - "io_bdt_pool[%d].\n", i); + QEDF_WARN(&(qedf->dbg_ctx), + "Failed to alloc io_bdt_pool[%d].\n", i); goto mem_err; } } @@ -220,11 +245,11 @@ struct qedf_cmd_mgr *qedf_cmd_mgr_alloc(struct qedf_ctx *qedf) for (i = 0; i < num_ios; i++) { bdt_info = cmgr->io_bdt_pool[i]; bdt_info->bd_tbl = dma_alloc_coherent(&qedf->pdev->dev, - QEDF_MAX_BDS_PER_CMD * sizeof(struct fcoe_sge), + QEDF_MAX_BDS_PER_CMD * sizeof(struct scsi_sge), &bdt_info->bd_tbl_dma, GFP_KERNEL); if (!bdt_info->bd_tbl) { - QEDF_WARN(&(qedf->dbg_ctx), "Failed to alloc " - "bdt_tbl[%d].\n", i); + QEDF_WARN(&(qedf->dbg_ctx), + "Failed to alloc bdt_tbl[%d].\n", i); goto mem_err; } } @@ -318,6 +343,7 @@ struct qedf_ioreq *qedf_alloc_cmd(struct qedf_rport *fcport, u8 cmd_type) } bd_tbl->io_req = io_req; io_req->cmd_type = cmd_type; + io_req->tm_flags = 0; /* Reset sequence offset data */ io_req->rx_buf_off = 0; @@ -336,10 +362,9 @@ static void qedf_free_mp_resc(struct qedf_ioreq *io_req) { struct qedf_mp_req *mp_req = &(io_req->mp_req); struct qedf_ctx *qedf = io_req->fcport->qedf; - uint64_t sz = sizeof(struct fcoe_sge); + uint64_t sz = sizeof(struct scsi_sge); /* clear tm flags */ - mp_req->tm_flags = 0; if (mp_req->mp_req_bd) { dma_free_coherent(&qedf->pdev->dev, sz, mp_req->mp_req_bd, mp_req->mp_req_bd_dma); @@ -387,7 +412,7 @@ void qedf_release_cmd(struct kref *ref) static int qedf_split_bd(struct qedf_ioreq *io_req, u64 addr, int sg_len, int bd_index) { - struct fcoe_sge *bd = io_req->bd_tbl->bd_tbl; + struct scsi_sge *bd = io_req->bd_tbl->bd_tbl; int frag_size, sg_frags; sg_frags = 0; @@ -398,7 +423,7 @@ static int qedf_split_bd(struct qedf_ioreq *io_req, u64 addr, int sg_len, frag_size = sg_len; bd[bd_index + sg_frags].sge_addr.lo = U64_LO(addr); bd[bd_index + 
sg_frags].sge_addr.hi = U64_HI(addr); - bd[bd_index + sg_frags].size = (uint16_t)frag_size; + bd[bd_index + sg_frags].sge_len = (uint16_t)frag_size; addr += (u64)frag_size; sg_frags++; @@ -413,7 +438,7 @@ static int qedf_map_sg(struct qedf_ioreq *io_req) struct Scsi_Host *host = sc->device->host; struct fc_lport *lport = shost_priv(host); struct qedf_ctx *qedf = lport_priv(lport); - struct fcoe_sge *bd = io_req->bd_tbl->bd_tbl; + struct scsi_sge *bd = io_req->bd_tbl->bd_tbl; struct scatterlist *sg; int byte_count = 0; int sg_count = 0; @@ -439,7 +464,7 @@ static int qedf_map_sg(struct qedf_ioreq *io_req) bd[bd_count].sge_addr.lo = (addr & 0xffffffff); bd[bd_count].sge_addr.hi = (addr >> 32); - bd[bd_count].size = (u16)sg_len; + bd[bd_count].sge_len = (u16)sg_len; return ++bd_count; } @@ -480,7 +505,7 @@ static int qedf_map_sg(struct qedf_ioreq *io_req) sg_frags = 1; bd[bd_count].sge_addr.lo = U64_LO(addr); bd[bd_count].sge_addr.hi = U64_HI(addr); - bd[bd_count].size = (uint16_t)sg_len; + bd[bd_count].sge_len = (uint16_t)sg_len; } bd_count += sg_frags; @@ -498,7 +523,7 @@ static int qedf_map_sg(struct qedf_ioreq *io_req) static int qedf_build_bd_list_from_sg(struct qedf_ioreq *io_req) { struct scsi_cmnd *sc = io_req->sc_cmd; - struct fcoe_sge *bd = io_req->bd_tbl->bd_tbl; + struct scsi_sge *bd = io_req->bd_tbl->bd_tbl; int bd_count; if (scsi_sg_count(sc)) { @@ -508,7 +533,7 @@ static int qedf_build_bd_list_from_sg(struct qedf_ioreq *io_req) } else { bd_count = 0; bd[0].sge_addr.lo = bd[0].sge_addr.hi = 0; - bd[0].size = 0; + bd[0].sge_len = 0; } io_req->bd_tbl->bd_valid = bd_count; @@ -529,430 +554,223 @@ static void qedf_build_fcp_cmnd(struct qedf_ioreq *io_req, /* 4 bytes: flag info */ fcp_cmnd->fc_pri_ta = 0; - fcp_cmnd->fc_tm_flags = io_req->mp_req.tm_flags; + fcp_cmnd->fc_tm_flags = io_req->tm_flags; fcp_cmnd->fc_flags = io_req->io_req_flags; fcp_cmnd->fc_cmdref = 0; /* Populate data direction */ - if (sc_cmd->sc_data_direction == DMA_TO_DEVICE) - fcp_cmnd->fc_flags |= FCP_CFL_WRDATA; - else if (sc_cmd->sc_data_direction == DMA_FROM_DEVICE) + if (io_req->cmd_type == QEDF_TASK_MGMT_CMD) { fcp_cmnd->fc_flags |= FCP_CFL_RDDATA; + } else { + if (sc_cmd->sc_data_direction == DMA_TO_DEVICE) + fcp_cmnd->fc_flags |= FCP_CFL_WRDATA; + else if (sc_cmd->sc_data_direction == DMA_FROM_DEVICE) + fcp_cmnd->fc_flags |= FCP_CFL_RDDATA; + } fcp_cmnd->fc_pri_ta = FCP_PTA_SIMPLE; /* 16 bytes: CDB information */ - memcpy(fcp_cmnd->fc_cdb, sc_cmd->cmnd, sc_cmd->cmd_len); + if (io_req->cmd_type != QEDF_TASK_MGMT_CMD) + memcpy(fcp_cmnd->fc_cdb, sc_cmd->cmnd, sc_cmd->cmd_len); /* 4 bytes: FCP data length */ fcp_cmnd->fc_dl = htonl(io_req->data_xfer_len); - } static void qedf_init_task(struct qedf_rport *fcport, struct fc_lport *lport, - struct qedf_ioreq *io_req, u32 *ptu_invalidate, - struct fcoe_task_context *task_ctx) + struct qedf_ioreq *io_req, struct fcoe_task_context *task_ctx, + struct fcoe_wqe *sqe) { enum fcoe_task_type task_type; struct scsi_cmnd *sc_cmd = io_req->sc_cmd; struct io_bdt *bd_tbl = io_req->bd_tbl; - union fcoe_data_desc_ctx *data_desc; - u32 *fcp_cmnd; + u8 fcp_cmnd[32]; u32 tmp_fcp_cmnd[8]; - int cnt, i; - int bd_count; + int bd_count = 0; struct qedf_ctx *qedf = fcport->qedf; uint16_t cq_idx = smp_processor_id() % qedf->num_queues; - u8 tmp_sgl_mode = 0; - u8 mst_sgl_mode = 0; + struct regpair sense_data_buffer_phys_addr; + u32 tx_io_size = 0; + u32 rx_io_size = 0; + int i, cnt; - memset(task_ctx, 0, sizeof(struct fcoe_task_context)); + /* Note init_initiator_rw_fcoe_task memsets the 
task context */ io_req->task = task_ctx; + memset(task_ctx, 0, sizeof(struct fcoe_task_context)); + memset(io_req->task_params, 0, sizeof(struct fcoe_task_params)); + memset(io_req->sgl_task_params, 0, sizeof(struct scsi_sgl_task_params)); - if (sc_cmd->sc_data_direction == DMA_TO_DEVICE) - task_type = FCOE_TASK_TYPE_WRITE_INITIATOR; - else + /* Set task type bassed on DMA directio of command */ + if (io_req->cmd_type == QEDF_TASK_MGMT_CMD) { task_type = FCOE_TASK_TYPE_READ_INITIATOR; - - /* Y Storm context */ - task_ctx->ystorm_st_context.expect_first_xfer = 1; - task_ctx->ystorm_st_context.data_2_trns_rem = io_req->data_xfer_len; - /* Check if this is required */ - task_ctx->ystorm_st_context.ox_id = io_req->xid; - task_ctx->ystorm_st_context.task_rety_identifier = - io_req->task_retry_identifier; - - /* T Storm ag context */ - SET_FIELD(task_ctx->tstorm_ag_context.flags0, - TSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE, PROTOCOLID_FCOE); - task_ctx->tstorm_ag_context.icid = (u16)fcport->fw_cid; - - /* T Storm st context */ - SET_FIELD(task_ctx->tstorm_st_context.read_write.flags, - FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME, - 1); - task_ctx->tstorm_st_context.read_write.rx_id = 0xffff; - - task_ctx->tstorm_st_context.read_only.dev_type = - FCOE_TASK_DEV_TYPE_DISK; - task_ctx->tstorm_st_context.read_only.conf_supported = 0; - task_ctx->tstorm_st_context.read_only.cid = fcport->fw_cid; - - /* Completion queue for response. */ - task_ctx->tstorm_st_context.read_only.glbl_q_num = cq_idx; - task_ctx->tstorm_st_context.read_only.fcp_cmd_trns_size = - io_req->data_xfer_len; - task_ctx->tstorm_st_context.read_write.e_d_tov_exp_timeout_val = - lport->e_d_tov; - - task_ctx->ustorm_ag_context.global_cq_num = cq_idx; - io_req->fp_idx = cq_idx; - - bd_count = bd_tbl->bd_valid; - if (task_type == FCOE_TASK_TYPE_WRITE_INITIATOR) { - /* Setup WRITE task */ - struct fcoe_sge *fcoe_bd_tbl = bd_tbl->bd_tbl; - - task_ctx->ystorm_st_context.task_type = - FCOE_TASK_TYPE_WRITE_INITIATOR; - data_desc = &task_ctx->ystorm_st_context.data_desc; - - if (io_req->use_slowpath) { - SET_FIELD(task_ctx->ystorm_st_context.sgl_mode, - YSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE, - FCOE_SLOW_SGL); - data_desc->slow.base_sgl_addr.lo = - U64_LO(bd_tbl->bd_tbl_dma); - data_desc->slow.base_sgl_addr.hi = - U64_HI(bd_tbl->bd_tbl_dma); - data_desc->slow.remainder_num_sges = bd_count; - data_desc->slow.curr_sge_off = 0; - data_desc->slow.curr_sgl_index = 0; - qedf->slow_sge_ios++; - io_req->sge_type = QEDF_IOREQ_SLOW_SGE; - } else { - SET_FIELD(task_ctx->ystorm_st_context.sgl_mode, - YSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE, - (bd_count <= 4) ? 
(enum fcoe_sgl_mode)bd_count : - FCOE_MUL_FAST_SGES); - - if (bd_count == 1) { - data_desc->single_sge.sge_addr.lo = - fcoe_bd_tbl->sge_addr.lo; - data_desc->single_sge.sge_addr.hi = - fcoe_bd_tbl->sge_addr.hi; - data_desc->single_sge.size = - fcoe_bd_tbl->size; - data_desc->single_sge.is_valid_sge = 0; - qedf->single_sge_ios++; - io_req->sge_type = QEDF_IOREQ_SINGLE_SGE; - } else { - data_desc->fast.sgl_start_addr.lo = - U64_LO(bd_tbl->bd_tbl_dma); - data_desc->fast.sgl_start_addr.hi = - U64_HI(bd_tbl->bd_tbl_dma); - data_desc->fast.sgl_byte_offset = - data_desc->fast.sgl_start_addr.lo & - (QEDF_PAGE_SIZE - 1); - if (data_desc->fast.sgl_byte_offset > 0) - QEDF_ERR(&(qedf->dbg_ctx), - "byte_offset=%u for xid=0x%x.\n", - io_req->xid, - data_desc->fast.sgl_byte_offset); - data_desc->fast.task_reuse_cnt = - io_req->reuse_count; - io_req->reuse_count++; - if (io_req->reuse_count == QEDF_MAX_REUSE) { - *ptu_invalidate = 1; - io_req->reuse_count = 0; - } - qedf->fast_sge_ios++; - io_req->sge_type = QEDF_IOREQ_FAST_SGE; - } - } - - /* T Storm context */ - task_ctx->tstorm_st_context.read_only.task_type = - FCOE_TASK_TYPE_WRITE_INITIATOR; - - /* M Storm context */ - tmp_sgl_mode = GET_FIELD(task_ctx->ystorm_st_context.sgl_mode, - YSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE); - SET_FIELD(task_ctx->mstorm_st_context.non_fp.tx_rx_sgl_mode, - FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_TX_SGL_MODE, - tmp_sgl_mode); - } else { - /* Setup READ task */ - - /* M Storm context */ - struct fcoe_sge *fcoe_bd_tbl = bd_tbl->bd_tbl; - - data_desc = &task_ctx->mstorm_st_context.fp.data_desc; - task_ctx->mstorm_st_context.fp.data_2_trns_rem = - io_req->data_xfer_len; - - if (io_req->use_slowpath) { - SET_FIELD( - task_ctx->mstorm_st_context.non_fp.tx_rx_sgl_mode, - FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RX_SGL_MODE, - FCOE_SLOW_SGL); - data_desc->slow.base_sgl_addr.lo = - U64_LO(bd_tbl->bd_tbl_dma); - data_desc->slow.base_sgl_addr.hi = - U64_HI(bd_tbl->bd_tbl_dma); - data_desc->slow.remainder_num_sges = - bd_count; - data_desc->slow.curr_sge_off = 0; - data_desc->slow.curr_sgl_index = 0; - qedf->slow_sge_ios++; - io_req->sge_type = QEDF_IOREQ_SLOW_SGE; + if (sc_cmd->sc_data_direction == DMA_TO_DEVICE) { + task_type = FCOE_TASK_TYPE_WRITE_INITIATOR; + tx_io_size = io_req->data_xfer_len; } else { - SET_FIELD( - task_ctx->mstorm_st_context.non_fp.tx_rx_sgl_mode, - FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RX_SGL_MODE, - (bd_count <= 4) ? 
(enum fcoe_sgl_mode)bd_count : - FCOE_MUL_FAST_SGES); - - if (bd_count == 1) { - data_desc->single_sge.sge_addr.lo = - fcoe_bd_tbl->sge_addr.lo; - data_desc->single_sge.sge_addr.hi = - fcoe_bd_tbl->sge_addr.hi; - data_desc->single_sge.size = - fcoe_bd_tbl->size; - data_desc->single_sge.is_valid_sge = 0; - qedf->single_sge_ios++; - io_req->sge_type = QEDF_IOREQ_SINGLE_SGE; - } else { - data_desc->fast.sgl_start_addr.lo = - U64_LO(bd_tbl->bd_tbl_dma); - data_desc->fast.sgl_start_addr.hi = - U64_HI(bd_tbl->bd_tbl_dma); - data_desc->fast.sgl_byte_offset = 0; - data_desc->fast.task_reuse_cnt = - io_req->reuse_count; - io_req->reuse_count++; - if (io_req->reuse_count == QEDF_MAX_REUSE) { - *ptu_invalidate = 1; - io_req->reuse_count = 0; - } - qedf->fast_sge_ios++; - io_req->sge_type = QEDF_IOREQ_FAST_SGE; - } + task_type = FCOE_TASK_TYPE_READ_INITIATOR; + rx_io_size = io_req->data_xfer_len; } - - /* Y Storm context */ - task_ctx->ystorm_st_context.expect_first_xfer = 0; - task_ctx->ystorm_st_context.task_type = - FCOE_TASK_TYPE_READ_INITIATOR; - - /* T Storm context */ - task_ctx->tstorm_st_context.read_only.task_type = - FCOE_TASK_TYPE_READ_INITIATOR; - mst_sgl_mode = GET_FIELD( - task_ctx->mstorm_st_context.non_fp.tx_rx_sgl_mode, - FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RX_SGL_MODE); - SET_FIELD(task_ctx->tstorm_st_context.read_write.flags, - FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE, - mst_sgl_mode); } + /* Setup the fields for fcoe_task_params */ + io_req->task_params->context = task_ctx; + io_req->task_params->sqe = sqe; + io_req->task_params->task_type = task_type; + io_req->task_params->tx_io_size = tx_io_size; + io_req->task_params->rx_io_size = rx_io_size; + io_req->task_params->conn_cid = fcport->fw_cid; + io_req->task_params->itid = io_req->xid; + io_req->task_params->cq_rss_number = cq_idx; + io_req->task_params->is_tape_device = fcport->dev_type; + + /* Fill in information for scatter/gather list */ + if (io_req->cmd_type != QEDF_TASK_MGMT_CMD) { + bd_count = bd_tbl->bd_valid; + io_req->sgl_task_params->sgl = bd_tbl->bd_tbl; + io_req->sgl_task_params->sgl_phys_addr.lo = + U64_LO(bd_tbl->bd_tbl_dma); + io_req->sgl_task_params->sgl_phys_addr.hi = + U64_HI(bd_tbl->bd_tbl_dma); + io_req->sgl_task_params->num_sges = bd_count; + io_req->sgl_task_params->total_buffer_size = + scsi_bufflen(io_req->sc_cmd); + io_req->sgl_task_params->small_mid_sge = + io_req->use_slowpath; + } + + /* Fill in physical address of sense buffer */ + sense_data_buffer_phys_addr.lo = U64_LO(io_req->sense_buffer_dma); + sense_data_buffer_phys_addr.hi = U64_HI(io_req->sense_buffer_dma); + /* fill FCP_CMND IU */ - fcp_cmnd = (u32 *)task_ctx->ystorm_st_context.tx_info_union.fcp_cmd_payload.opaque; - qedf_build_fcp_cmnd(io_req, (struct fcp_cmnd *)&tmp_fcp_cmnd); + qedf_build_fcp_cmnd(io_req, (struct fcp_cmnd *)tmp_fcp_cmnd); /* Swap fcp_cmnd since FC is big endian */ cnt = sizeof(struct fcp_cmnd) / sizeof(u32); - for (i = 0; i < cnt; i++) { - *fcp_cmnd = cpu_to_be32(tmp_fcp_cmnd[i]); - fcp_cmnd++; + tmp_fcp_cmnd[i] = cpu_to_be32(tmp_fcp_cmnd[i]); + } + memcpy(fcp_cmnd, tmp_fcp_cmnd, sizeof(struct fcp_cmnd)); + + init_initiator_rw_fcoe_task(io_req->task_params, + io_req->sgl_task_params, + sense_data_buffer_phys_addr, + io_req->task_retry_identifier, fcp_cmnd); + + /* Increment SGL type counters */ + if (bd_count == 1) { + qedf->single_sge_ios++; + io_req->sge_type = QEDF_IOREQ_SINGLE_SGE; + } else if (io_req->use_slowpath) { + qedf->slow_sge_ios++; + io_req->sge_type = QEDF_IOREQ_SLOW_SGE; + } else { + 
qedf->fast_sge_ios++; + io_req->sge_type = QEDF_IOREQ_FAST_SGE; } - - /* M Storm context - Sense buffer */ - task_ctx->mstorm_st_context.non_fp.rsp_buf_addr.lo = - U64_LO(io_req->sense_buffer_dma); - task_ctx->mstorm_st_context.non_fp.rsp_buf_addr.hi = - U64_HI(io_req->sense_buffer_dma); } void qedf_init_mp_task(struct qedf_ioreq *io_req, - struct fcoe_task_context *task_ctx) + struct fcoe_task_context *task_ctx, struct fcoe_wqe *sqe) { struct qedf_mp_req *mp_req = &(io_req->mp_req); struct qedf_rport *fcport = io_req->fcport; struct qedf_ctx *qedf = io_req->fcport->qedf; struct fc_frame_header *fc_hdr; - enum fcoe_task_type task_type = 0; - union fcoe_data_desc_ctx *data_desc; + struct fcoe_tx_mid_path_params task_fc_hdr; + struct scsi_sgl_task_params tx_sgl_task_params; + struct scsi_sgl_task_params rx_sgl_task_params; - QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, "Initializing MP task " - "for cmd_type = %d\n", io_req->cmd_type); + QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, + "Initializing MP task for cmd_type=%d\n", + io_req->cmd_type); qedf->control_requests++; - /* Obtain task_type */ - if ((io_req->cmd_type == QEDF_TASK_MGMT_CMD) || - (io_req->cmd_type == QEDF_ELS)) { - task_type = FCOE_TASK_TYPE_MIDPATH; - } else if (io_req->cmd_type == QEDF_ABTS) { - task_type = FCOE_TASK_TYPE_ABTS; - } - + memset(&tx_sgl_task_params, 0, sizeof(struct scsi_sgl_task_params)); + memset(&rx_sgl_task_params, 0, sizeof(struct scsi_sgl_task_params)); memset(task_ctx, 0, sizeof(struct fcoe_task_context)); + memset(&task_fc_hdr, 0, sizeof(struct fcoe_tx_mid_path_params)); /* Setup the task from io_req for easy reference */ io_req->task = task_ctx; - QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, "task type = %d\n", - task_type); - - /* YSTORM only */ - { - /* Initialize YSTORM task context */ - struct fcoe_tx_mid_path_params *task_fc_hdr = - &task_ctx->ystorm_st_context.tx_info_union.tx_params.mid_path; - memset(task_fc_hdr, 0, sizeof(struct fcoe_tx_mid_path_params)); - task_ctx->ystorm_st_context.task_rety_identifier = - io_req->task_retry_identifier; - - /* Init SGL parameters */ - if ((task_type == FCOE_TASK_TYPE_MIDPATH) || - (task_type == FCOE_TASK_TYPE_UNSOLICITED)) { - data_desc = &task_ctx->ystorm_st_context.data_desc; - data_desc->slow.base_sgl_addr.lo = - U64_LO(mp_req->mp_req_bd_dma); - data_desc->slow.base_sgl_addr.hi = - U64_HI(mp_req->mp_req_bd_dma); - data_desc->slow.remainder_num_sges = 1; - data_desc->slow.curr_sge_off = 0; - data_desc->slow.curr_sgl_index = 0; - } - - fc_hdr = &(mp_req->req_fc_hdr); - if (task_type == FCOE_TASK_TYPE_MIDPATH) { - fc_hdr->fh_ox_id = io_req->xid; - fc_hdr->fh_rx_id = htons(0xffff); - } else if (task_type == FCOE_TASK_TYPE_UNSOLICITED) { - fc_hdr->fh_rx_id = io_req->xid; - } + /* Setup the fields for fcoe_task_params */ + io_req->task_params->context = task_ctx; + io_req->task_params->sqe = sqe; + io_req->task_params->task_type = FCOE_TASK_TYPE_MIDPATH; + io_req->task_params->tx_io_size = io_req->data_xfer_len; + /* rx_io_size tells the f/w how large a response buffer we have */ + io_req->task_params->rx_io_size = PAGE_SIZE; + io_req->task_params->conn_cid = fcport->fw_cid; + io_req->task_params->itid = io_req->xid; + /* Return middle path commands on CQ 0 */ + io_req->task_params->cq_rss_number = 0; + io_req->task_params->is_tape_device = fcport->dev_type; + + fc_hdr = &(mp_req->req_fc_hdr); + /* Set OX_ID and RX_ID based on driver task id */ + fc_hdr->fh_ox_id = io_req->xid; + fc_hdr->fh_rx_id = htons(0xffff); + + /* Set up FC header information */ + 
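+	/* The assignments below copy the received fc_frame_header field for
+	 * field; only OX_ID and RX_ID were chosen by the driver just above.
+	 */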
task_fc_hdr.parameter = fc_hdr->fh_parm_offset; + task_fc_hdr.r_ctl = fc_hdr->fh_r_ctl; + task_fc_hdr.type = fc_hdr->fh_type; + task_fc_hdr.cs_ctl = fc_hdr->fh_cs_ctl; + task_fc_hdr.df_ctl = fc_hdr->fh_df_ctl; + task_fc_hdr.rx_id = fc_hdr->fh_rx_id; + task_fc_hdr.ox_id = fc_hdr->fh_ox_id; + + /* Set up s/g list parameters for request buffer */ + tx_sgl_task_params.sgl = mp_req->mp_req_bd; + tx_sgl_task_params.sgl_phys_addr.lo = U64_LO(mp_req->mp_req_bd_dma); + tx_sgl_task_params.sgl_phys_addr.hi = U64_HI(mp_req->mp_req_bd_dma); + tx_sgl_task_params.num_sges = 1; + /* Set PAGE_SIZE for now since sg element is that size ??? */ + tx_sgl_task_params.total_buffer_size = io_req->data_xfer_len; + tx_sgl_task_params.small_mid_sge = 0; + + /* Set up s/g list parameters for request buffer */ + rx_sgl_task_params.sgl = mp_req->mp_resp_bd; + rx_sgl_task_params.sgl_phys_addr.lo = U64_LO(mp_req->mp_resp_bd_dma); + rx_sgl_task_params.sgl_phys_addr.hi = U64_HI(mp_req->mp_resp_bd_dma); + rx_sgl_task_params.num_sges = 1; + /* Set PAGE_SIZE for now since sg element is that size ??? */ + rx_sgl_task_params.total_buffer_size = PAGE_SIZE; + rx_sgl_task_params.small_mid_sge = 0; - /* Fill FC Header into middle path buffer */ - task_fc_hdr->parameter = fc_hdr->fh_parm_offset; - task_fc_hdr->r_ctl = fc_hdr->fh_r_ctl; - task_fc_hdr->type = fc_hdr->fh_type; - task_fc_hdr->cs_ctl = fc_hdr->fh_cs_ctl; - task_fc_hdr->df_ctl = fc_hdr->fh_df_ctl; - task_fc_hdr->rx_id = fc_hdr->fh_rx_id; - task_fc_hdr->ox_id = fc_hdr->fh_ox_id; - - task_ctx->ystorm_st_context.data_2_trns_rem = - io_req->data_xfer_len; - task_ctx->ystorm_st_context.task_type = task_type; - } - - /* TSTORM ONLY */ - { - task_ctx->tstorm_ag_context.icid = (u16)fcport->fw_cid; - task_ctx->tstorm_st_context.read_only.cid = fcport->fw_cid; - /* Always send middle-path repsonses on CQ #0 */ - task_ctx->tstorm_st_context.read_only.glbl_q_num = 0; - io_req->fp_idx = 0; - SET_FIELD(task_ctx->tstorm_ag_context.flags0, - TSTORM_FCOE_TASK_AG_CTX_CONNECTION_TYPE, - PROTOCOLID_FCOE); - task_ctx->tstorm_st_context.read_only.task_type = task_type; - SET_FIELD(task_ctx->tstorm_st_context.read_write.flags, - FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME, - 1); - task_ctx->tstorm_st_context.read_write.rx_id = 0xffff; - } - - /* MSTORM only */ - { - if (task_type == FCOE_TASK_TYPE_MIDPATH) { - /* Initialize task context */ - data_desc = &task_ctx->mstorm_st_context.fp.data_desc; - - /* Set cache sges address and length */ - data_desc->slow.base_sgl_addr.lo = - U64_LO(mp_req->mp_resp_bd_dma); - data_desc->slow.base_sgl_addr.hi = - U64_HI(mp_req->mp_resp_bd_dma); - data_desc->slow.remainder_num_sges = 1; - data_desc->slow.curr_sge_off = 0; - data_desc->slow.curr_sgl_index = 0; - /* - * Also need to fil in non-fastpath response address - * for middle path commands. - */ - task_ctx->mstorm_st_context.non_fp.rsp_buf_addr.lo = - U64_LO(mp_req->mp_resp_bd_dma); - task_ctx->mstorm_st_context.non_fp.rsp_buf_addr.hi = - U64_HI(mp_req->mp_resp_bd_dma); - } - } - - /* USTORM ONLY */ - { - task_ctx->ustorm_ag_context.global_cq_num = 0; - } + /* + * Last arg is 0 as previous code did not set that we wanted the + * fc header information. + */ + init_initiator_midpath_unsolicited_fcoe_task(io_req->task_params, + &task_fc_hdr, + &tx_sgl_task_params, + &rx_sgl_task_params, 0); - /* I/O stats. 
- /* I/O stats. Middle path commands always use slow SGEs */
- qedf->slow_sge_ios++;
- io_req->sge_type = QEDF_IOREQ_SLOW_SGE;
+ /* Midpath requests always consume 1 SGE */
+ qedf->single_sge_ios++;
 }

-void qedf_add_to_sq(struct qedf_rport *fcport, u16 xid, u32 ptu_invalidate,
- enum fcoe_task_type req_type, u32 offset)
+/* Assumes fcport->rport_lock is held */
+u16 qedf_get_sqe_idx(struct qedf_rport *fcport)
 {
- struct fcoe_wqe *sqe;
 uint16_t total_sqe = (fcport->sq_mem_size)/(sizeof(struct fcoe_wqe));
+ u16 rval;

- sqe = &fcport->sq[fcport->sq_prod_idx];
+ rval = fcport->sq_prod_idx;

+ /* Adjust ring index */
 fcport->sq_prod_idx++;
 fcport->fw_sq_prod_idx++;
 if (fcport->sq_prod_idx == total_sqe)
 fcport->sq_prod_idx = 0;

- switch (req_type) {
- case FCOE_TASK_TYPE_WRITE_INITIATOR:
- case FCOE_TASK_TYPE_READ_INITIATOR:
- SET_FIELD(sqe->flags, FCOE_WQE_REQ_TYPE, SEND_FCOE_CMD);
- if (ptu_invalidate)
- SET_FIELD(sqe->flags, FCOE_WQE_INVALIDATE_PTU, 1);
- break;
- case FCOE_TASK_TYPE_MIDPATH:
- SET_FIELD(sqe->flags, FCOE_WQE_REQ_TYPE, SEND_FCOE_MIDPATH);
- break;
- case FCOE_TASK_TYPE_ABTS:
- SET_FIELD(sqe->flags, FCOE_WQE_REQ_TYPE,
-     SEND_FCOE_ABTS_REQUEST);
- break;
- case FCOE_TASK_TYPE_EXCHANGE_CLEANUP:
- SET_FIELD(sqe->flags, FCOE_WQE_REQ_TYPE,
-      FCOE_EXCHANGE_CLEANUP);
- break;
- case FCOE_TASK_TYPE_SEQUENCE_CLEANUP:
- SET_FIELD(sqe->flags, FCOE_WQE_REQ_TYPE,
-     FCOE_SEQUENCE_RECOVERY);
- /* NOTE: offset param only used for sequence recovery */
- sqe->additional_info_union.seq_rec_updated_offset = offset;
- break;
- case FCOE_TASK_TYPE_UNSOLICITED:
- break;
- default:
- break;
- }
-
- sqe->task_id = xid;
-
- /* Make sure SQ data is coherent */
- wmb();
-
+ return rval;
 }

 void qedf_ring_doorbell(struct qedf_rport *fcport)
@@ -1029,7 +847,8 @@ int qedf_post_io_req(struct qedf_rport *fcport, struct qedf_ioreq *io_req)
 struct fcoe_task_context *task_ctx;
 u16 xid;
 enum fcoe_task_type req_type = 0;
- u32 ptu_invalidate = 0;
+ struct fcoe_wqe *sqe;
+ u16 sqe_idx;

 /* Initialize rest of io_req fields */
 io_req->data_xfer_len = scsi_bufflen(sc_cmd);
@@ -1061,6 +880,16 @@ int qedf_post_io_req(struct qedf_rport *fcport, struct qedf_ioreq *io_req)
 return -EAGAIN;
 }

+ if (!test_bit(QEDF_RPORT_SESSION_READY, &fcport->flags)) {
+ QEDF_ERR(&(qedf->dbg_ctx), "Session not offloaded yet.\n");
+ kref_put(&io_req->refcount, qedf_release_cmd);
+ }
+
+ /* Obtain free SQE */
+ sqe_idx = qedf_get_sqe_idx(fcport);
+ sqe = &fcport->sq[sqe_idx];
+ memset(sqe, 0, sizeof(struct fcoe_wqe));
+
 /* Get the task context */
 task_ctx = qedf_get_task_mem(&qedf->tasks, xid);
 if (!task_ctx) {
@@ -1070,15 +899,7 @@ int qedf_post_io_req(struct qedf_rport *fcport, struct qedf_ioreq *io_req)
 return -EINVAL;
 }

- qedf_init_task(fcport, lport, io_req, &ptu_invalidate, task_ctx);
-
- if (!test_bit(QEDF_RPORT_SESSION_READY, &fcport->flags)) {
- QEDF_ERR(&(qedf->dbg_ctx), "Session not offloaded yet.\n");
- kref_put(&io_req->refcount, qedf_release_cmd);
- }
-
- /* Obtain free SQ entry */
- qedf_add_to_sq(fcport, xid, ptu_invalidate, req_type, 0);
+ qedf_init_task(fcport, lport, io_req, task_ctx, sqe);

 /* Ring doorbell */
 qedf_ring_doorbell(fcport);
@@ -1661,6 +1482,8 @@ int qedf_initiate_abts(struct qedf_ioreq *io_req, bool return_scsi_cmd_on_abts)
 u32 r_a_tov = 0;
 int rc = 0;
 unsigned long flags;
+ struct fcoe_wqe *sqe;
+ u16 sqe_idx;

 r_a_tov = rdata->r_a_tov;
 lport = qedf->lport;
@@ -1712,10 +1535,12 @@ int qedf_initiate_abts(struct qedf_ioreq *io_req, bool return_scsi_cmd_on_abts)

 spin_lock_irqsave(&fcport->rport_lock, flags);

- /* Add ABTS to
send queue */ - qedf_add_to_sq(fcport, xid, 0, FCOE_TASK_TYPE_ABTS, 0); + sqe_idx = qedf_get_sqe_idx(fcport); + sqe = &fcport->sq[sqe_idx]; + memset(sqe, 0, sizeof(struct fcoe_wqe)); + io_req->task_params->sqe = sqe; - /* Ring doorbell */ + init_initiator_abort_fcoe_task(io_req->task_params); qedf_ring_doorbell(fcport); spin_unlock_irqrestore(&fcport->rport_lock, flags); @@ -1784,8 +1609,8 @@ void qedf_process_abts_compl(struct qedf_ctx *qedf, struct fcoe_cqe *cqe, int qedf_init_mp_req(struct qedf_ioreq *io_req) { struct qedf_mp_req *mp_req; - struct fcoe_sge *mp_req_bd; - struct fcoe_sge *mp_resp_bd; + struct scsi_sge *mp_req_bd; + struct scsi_sge *mp_resp_bd; struct qedf_ctx *qedf = io_req->fcport->qedf; dma_addr_t addr; uint64_t sz; @@ -1819,7 +1644,7 @@ int qedf_init_mp_req(struct qedf_ioreq *io_req) } /* Allocate and map mp_req_bd and mp_resp_bd */ - sz = sizeof(struct fcoe_sge); + sz = sizeof(struct scsi_sge); mp_req->mp_req_bd = dma_alloc_coherent(&qedf->pdev->dev, sz, &mp_req->mp_req_bd_dma, GFP_KERNEL); if (!mp_req->mp_req_bd) { @@ -1841,7 +1666,7 @@ int qedf_init_mp_req(struct qedf_ioreq *io_req) mp_req_bd = mp_req->mp_req_bd; mp_req_bd->sge_addr.lo = U64_LO(addr); mp_req_bd->sge_addr.hi = U64_HI(addr); - mp_req_bd->size = QEDF_PAGE_SIZE; + mp_req_bd->sge_len = QEDF_PAGE_SIZE; /* * MP buffer is either a task mgmt command or an ELS. @@ -1852,7 +1677,7 @@ int qedf_init_mp_req(struct qedf_ioreq *io_req) addr = mp_req->resp_buf_dma; mp_resp_bd->sge_addr.lo = U64_LO(addr); mp_resp_bd->sge_addr.hi = U64_HI(addr); - mp_resp_bd->size = QEDF_PAGE_SIZE; + mp_resp_bd->sge_len = QEDF_PAGE_SIZE; return 0; } @@ -1895,6 +1720,8 @@ int qedf_initiate_cleanup(struct qedf_ioreq *io_req, int tmo = 0; int rc = SUCCESS; unsigned long flags; + struct fcoe_wqe *sqe; + u16 sqe_idx; fcport = io_req->fcport; if (!fcport) { @@ -1940,12 +1767,16 @@ int qedf_initiate_cleanup(struct qedf_ioreq *io_req, init_completion(&io_req->tm_done); - /* Obtain free SQ entry */ spin_lock_irqsave(&fcport->rport_lock, flags); - qedf_add_to_sq(fcport, xid, 0, FCOE_TASK_TYPE_EXCHANGE_CLEANUP, 0); - /* Ring doorbell */ + sqe_idx = qedf_get_sqe_idx(fcport); + sqe = &fcport->sq[sqe_idx]; + memset(sqe, 0, sizeof(struct fcoe_wqe)); + io_req->task_params->sqe = sqe; + + init_initiator_cleanup_fcoe_task(io_req->task_params); qedf_ring_doorbell(fcport); + spin_unlock_irqrestore(&fcport->rport_lock, flags); tmo = wait_for_completion_timeout(&io_req->tm_done, @@ -1991,16 +1822,15 @@ static int qedf_execute_tmf(struct qedf_rport *fcport, struct scsi_cmnd *sc_cmd, uint8_t tm_flags) { struct qedf_ioreq *io_req; - struct qedf_mp_req *tm_req; struct fcoe_task_context *task; - struct fc_frame_header *fc_hdr; - struct fcp_cmnd *fcp_cmnd; struct qedf_ctx *qedf = fcport->qedf; + struct fc_lport *lport = qedf->lport; int rc = 0; uint16_t xid; - uint32_t sid, did; int tmo = 0; unsigned long flags; + struct fcoe_wqe *sqe; + u16 sqe_idx; if (!sc_cmd) { QEDF_ERR(&(qedf->dbg_ctx), "invalid arg\n"); @@ -2031,36 +1861,14 @@ static int qedf_execute_tmf(struct qedf_rport *fcport, struct scsi_cmnd *sc_cmd, /* Set the return CPU to be the same as the request one */ io_req->cpu = smp_processor_id(); - tm_req = (struct qedf_mp_req *)&(io_req->mp_req); - - rc = qedf_init_mp_req(io_req); - if (rc == FAILED) { - QEDF_ERR(&(qedf->dbg_ctx), "Task mgmt MP request init " - "failed\n"); - kref_put(&io_req->refcount, qedf_release_cmd); - goto reset_tmf_err; - } - /* Set TM flags */ - io_req->io_req_flags = 0; - tm_req->tm_flags = tm_flags; + io_req->io_req_flags = 
QEDF_READ; + io_req->data_xfer_len = 0; + io_req->tm_flags = tm_flags; /* Default is to return a SCSI command when an error occurs */ io_req->return_scsi_cmd_on_abts = true; - /* Fill FCP_CMND */ - qedf_build_fcp_cmnd(io_req, (struct fcp_cmnd *)tm_req->req_buf); - fcp_cmnd = (struct fcp_cmnd *)tm_req->req_buf; - memset(fcp_cmnd->fc_cdb, 0, FCP_CMND_LEN); - fcp_cmnd->fc_dl = 0; - - /* Fill FC header */ - fc_hdr = &(tm_req->req_fc_hdr); - sid = fcport->sid; - did = fcport->rdata->ids.port_id; - __fc_fill_fc_hdr(fc_hdr, FC_RCTL_DD_UNSOL_CMD, sid, did, - FC_TYPE_FCP, FC_FC_FIRST_SEQ | FC_FC_END_SEQ | - FC_FC_SEQ_INIT, 0); /* Obtain exchange id */ xid = io_req->xid; @@ -2069,16 +1877,18 @@ static int qedf_execute_tmf(struct qedf_rport *fcport, struct scsi_cmnd *sc_cmd, /* Initialize task context for this IO request */ task = qedf_get_task_mem(&qedf->tasks, xid); - qedf_init_mp_task(io_req, task); init_completion(&io_req->tm_done); - /* Obtain free SQ entry */ spin_lock_irqsave(&fcport->rport_lock, flags); - qedf_add_to_sq(fcport, xid, 0, FCOE_TASK_TYPE_MIDPATH, 0); - /* Ring doorbell */ + sqe_idx = qedf_get_sqe_idx(fcport); + sqe = &fcport->sq[sqe_idx]; + memset(sqe, 0, sizeof(struct fcoe_wqe)); + + qedf_init_task(fcport, lport, io_req, task, sqe); qedf_ring_doorbell(fcport); + spin_unlock_irqrestore(&fcport->rport_lock, flags); tmo = wait_for_completion_timeout(&io_req->tm_done, @@ -2162,14 +1972,6 @@ void qedf_process_tmf_compl(struct qedf_ctx *qedf, struct fcoe_cqe *cqe, struct qedf_ioreq *io_req) { struct fcoe_cqe_rsp_info *fcp_rsp; - struct fcoe_cqe_midpath_info *mp_info; - - - /* Get TMF response length from CQE */ - mp_info = &cqe->cqe_info.midpath_info; - io_req->mp_req.resp_len = mp_info->data_placement_size; - QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_SCSI_TM, - "Response len is %d.\n", io_req->mp_req.resp_len); fcp_rsp = &cqe->cqe_info.rsp_info; qedf_parse_fcp_rsp(io_req, fcp_rsp); diff --git a/drivers/scsi/qedi/Makefile b/drivers/scsi/qedi/Makefile index 2b3e16b24299..90a6925577cc 100644 --- a/drivers/scsi/qedi/Makefile +++ b/drivers/scsi/qedi/Makefile @@ -1,5 +1,5 @@ obj-$(CONFIG_QEDI) := qedi.o qedi-y := qedi_main.o qedi_iscsi.o qedi_fw.o qedi_sysfs.o \ - qedi_dbg.o + qedi_dbg.o qedi_fw_api.o qedi-$(CONFIG_DEBUG_FS) += qedi_debugfs.o diff --git a/drivers/scsi/qedi/qedi_fw.c b/drivers/scsi/qedi/qedi_fw.c index c9f0ef4e11b3..eca40b0513a3 100644 --- a/drivers/scsi/qedi/qedi_fw.c +++ b/drivers/scsi/qedi/qedi_fw.c @@ -14,6 +14,8 @@ #include "qedi.h" #include "qedi_iscsi.h" #include "qedi_gbl.h" +#include "qedi_fw_iscsi.h" +#include "qedi_fw_scsi.h" static int qedi_send_iscsi_tmf(struct qedi_conn *qedi_conn, struct iscsi_task *mtask); @@ -53,8 +55,8 @@ static void qedi_process_logout_resp(struct qedi_ctx *qedi, resp_hdr->exp_cmdsn = cpu_to_be32(cqe_logout_response->exp_cmd_sn); resp_hdr->max_cmdsn = cpu_to_be32(cqe_logout_response->max_cmd_sn); - resp_hdr->t2wait = cpu_to_be32(cqe_logout_response->time2wait); - resp_hdr->t2retain = cpu_to_be32(cqe_logout_response->time2retain); + resp_hdr->t2wait = cpu_to_be32(cqe_logout_response->time_2_wait); + resp_hdr->t2retain = cpu_to_be32(cqe_logout_response->time_2_retain); QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_TID, "Freeing tid=0x%x for cid=0x%x\n", @@ -975,81 +977,6 @@ exit_fp_process: return; } -static void qedi_add_to_sq(struct qedi_conn *qedi_conn, struct iscsi_task *task, - u16 tid, uint16_t ptu_invalidate, int is_cleanup) -{ - struct iscsi_wqe *wqe; - struct iscsi_wqe_field *cont_field; - struct qedi_endpoint *ep; - struct scsi_cmnd *sc = 
task->sc;
- struct iscsi_login_req *login_hdr;
- struct qedi_cmd *cmd = task->dd_data;
-
- login_hdr = (struct iscsi_login_req *)task->hdr;
- ep = qedi_conn->ep;
- wqe = &ep->sq[ep->sq_prod_idx];
-
- memset(wqe, 0, sizeof(*wqe));
-
- ep->sq_prod_idx++;
- ep->fw_sq_prod_idx++;
- if (ep->sq_prod_idx == QEDI_SQ_SIZE)
- ep->sq_prod_idx = 0;
-
- if (is_cleanup) {
- SET_FIELD(wqe->flags, ISCSI_WQE_WQE_TYPE,
-   ISCSI_WQE_TYPE_TASK_CLEANUP);
- wqe->task_id = tid;
- return;
- }
-
- if (ptu_invalidate) {
- SET_FIELD(wqe->flags, ISCSI_WQE_PTU_INVALIDATE,
-   ISCSI_WQE_SET_PTU_INVALIDATE);
- }
-
- cont_field = &wqe->cont_prevtid_union.cont_field;
-
- switch (task->hdr->opcode & ISCSI_OPCODE_MASK) {
- case ISCSI_OP_LOGIN:
- case ISCSI_OP_TEXT:
- SET_FIELD(wqe->flags, ISCSI_WQE_WQE_TYPE,
-   ISCSI_WQE_TYPE_MIDDLE_PATH);
- SET_FIELD(wqe->flags, ISCSI_WQE_NUM_FAST_SGES,
-   1);
- cont_field->contlen_cdbsize_field = ntoh24(login_hdr->dlength);
- break;
- case ISCSI_OP_LOGOUT:
- case ISCSI_OP_NOOP_OUT:
- case ISCSI_OP_SCSI_TMFUNC:
- SET_FIELD(wqe->flags, ISCSI_WQE_WQE_TYPE,
-   ISCSI_WQE_TYPE_NORMAL);
- break;
- default:
- if (!sc)
- break;
-
- SET_FIELD(wqe->flags, ISCSI_WQE_WQE_TYPE,
-   ISCSI_WQE_TYPE_NORMAL);
- cont_field->contlen_cdbsize_field =
- (sc->sc_data_direction == DMA_TO_DEVICE) ?
- scsi_bufflen(sc) : 0;
- if (cmd->use_slowpath)
- SET_FIELD(wqe->flags, ISCSI_WQE_NUM_FAST_SGES, 0);
- else
- SET_FIELD(wqe->flags, ISCSI_WQE_NUM_FAST_SGES,
-   (sc->sc_data_direction ==
-    DMA_TO_DEVICE) ?
-   min((u16)QEDI_FAST_SGE_COUNT,
-       (u16)cmd->io_tbl.sge_valid) : 0);
- break;
- }
-
- wqe->task_id = tid;
- /* Make sure SQ data is coherent */
- wmb();
-}
-
 static void qedi_ring_doorbell(struct qedi_conn *qedi_conn)
 {
 struct iscsi_db_data dbell = { 0 };
@@ -1076,96 +1003,116 @@ static void qedi_ring_doorbell(struct qedi_conn *qedi_conn)
 qedi_conn->iscsi_conn_id);
 }

+static u16 qedi_get_wqe_idx(struct qedi_conn *qedi_conn)
+{
+ struct qedi_endpoint *ep;
+ u16 rval;
+
+ ep = qedi_conn->ep;
+ rval = ep->sq_prod_idx;
+
+ /* Increment SQ index */
+ ep->sq_prod_idx++;
+ ep->fw_sq_prod_idx++;
+ if (ep->sq_prod_idx == QEDI_SQ_SIZE)
+ ep->sq_prod_idx = 0;
+
+ return rval;
+}
+
 int qedi_send_iscsi_login(struct qedi_conn *qedi_conn,
 struct iscsi_task *task)
 {
- struct qedi_ctx *qedi = qedi_conn->qedi;
+ struct iscsi_login_req_hdr login_req_pdu_header;
+ struct scsi_sgl_task_params tx_sgl_task_params;
+ struct scsi_sgl_task_params rx_sgl_task_params;
+ struct iscsi_task_params task_params;
 struct iscsi_task_context *fw_task_ctx;
+ struct qedi_ctx *qedi = qedi_conn->qedi;
 struct iscsi_login_req *login_hdr;
- struct iscsi_login_req_hdr *fw_login_req = NULL;
- struct iscsi_cached_sge_ctx *cached_sge = NULL;
- struct iscsi_sge *single_sge = NULL;
- struct iscsi_sge *req_sge = NULL;
- struct iscsi_sge *resp_sge = NULL;
+ struct scsi_sge *req_sge = NULL;
+ struct scsi_sge *resp_sge = NULL;
 struct qedi_cmd *qedi_cmd;
- s16 ptu_invalidate = 0;
+ struct qedi_endpoint *ep;
 s16 tid = 0;
+ u16 sq_idx = 0;
+ int rval = 0;

- req_sge = (struct iscsi_sge *)qedi_conn->gen_pdu.req_bd_tbl;
- resp_sge = (struct iscsi_sge *)qedi_conn->gen_pdu.resp_bd_tbl;
+ req_sge = (struct scsi_sge *)qedi_conn->gen_pdu.req_bd_tbl;
+ resp_sge = (struct scsi_sge *)qedi_conn->gen_pdu.resp_bd_tbl;
 qedi_cmd = (struct qedi_cmd *)task->dd_data;
+ ep = qedi_conn->ep;
 login_hdr = (struct iscsi_login_req *)task->hdr;

 tid = qedi_get_task_idx(qedi);
 if (tid == -1)
 return -ENOMEM;

- fw_task_ctx = qedi_get_task_mem(&qedi->tasks, tid);
+ fw_task_ctx =
+ (struct iscsi_task_context 
*)qedi_get_task_mem(&qedi->tasks, tid); memset(fw_task_ctx, 0, sizeof(struct iscsi_task_context)); qedi_cmd->task_id = tid; - /* Ystorm context */ - fw_login_req = &fw_task_ctx->ystorm_st_context.pdu_hdr.login_req; - fw_login_req->opcode = login_hdr->opcode; - fw_login_req->version_min = login_hdr->min_version; - fw_login_req->version_max = login_hdr->max_version; - fw_login_req->flags_attr = login_hdr->flags; - fw_login_req->isid_tabc = *((u16 *)login_hdr->isid + 2); - fw_login_req->isid_d = *((u32 *)login_hdr->isid); - fw_login_req->tsih = login_hdr->tsih; - qedi_update_itt_map(qedi, tid, task->itt, qedi_cmd); - fw_login_req->itt = qedi_set_itt(tid, get_itt(task->itt)); - fw_login_req->cid = qedi_conn->iscsi_conn_id; - fw_login_req->cmd_sn = be32_to_cpu(login_hdr->cmdsn); - fw_login_req->exp_stat_sn = be32_to_cpu(login_hdr->exp_statsn); - fw_login_req->exp_stat_sn = 0; - - if (qedi->tid_reuse_count[tid] == QEDI_MAX_TASK_NUM) { - ptu_invalidate = 1; - qedi->tid_reuse_count[tid] = 0; - } + memset(&task_params, 0, sizeof(task_params)); + memset(&login_req_pdu_header, 0, sizeof(login_req_pdu_header)); + memset(&tx_sgl_task_params, 0, sizeof(tx_sgl_task_params)); + memset(&rx_sgl_task_params, 0, sizeof(rx_sgl_task_params)); + /* Update header info */ + login_req_pdu_header.opcode = login_hdr->opcode; + login_req_pdu_header.version_min = login_hdr->min_version; + login_req_pdu_header.version_max = login_hdr->max_version; + login_req_pdu_header.flags_attr = login_hdr->flags; + login_req_pdu_header.isid_tabc = swab32p((u32 *)login_hdr->isid); + login_req_pdu_header.isid_d = swab16p((u16 *)&login_hdr->isid[4]); + + login_req_pdu_header.tsih = login_hdr->tsih; + login_req_pdu_header.hdr_second_dword = ntoh24(login_hdr->dlength); - fw_task_ctx->ystorm_st_context.state.reuse_count = - qedi->tid_reuse_count[tid]; - fw_task_ctx->mstorm_st_context.reuse_count = - qedi->tid_reuse_count[tid]++; - cached_sge = - &fw_task_ctx->ystorm_st_context.state.sgl_ctx_union.cached_sge; - cached_sge->sge.sge_len = req_sge->sge_len; - cached_sge->sge.sge_addr.lo = (u32)(qedi_conn->gen_pdu.req_dma_addr); - cached_sge->sge.sge_addr.hi = - (u32)((u64)qedi_conn->gen_pdu.req_dma_addr >> 32); - - /* Mstorm context */ - single_sge = &fw_task_ctx->mstorm_st_context.sgl_union.single_sge; - fw_task_ctx->mstorm_st_context.task_type = 0x2; - fw_task_ctx->mstorm_ag_context.task_cid = (u16)qedi_conn->iscsi_conn_id; - single_sge->sge_addr.lo = resp_sge->sge_addr.lo; - single_sge->sge_addr.hi = resp_sge->sge_addr.hi; - single_sge->sge_len = resp_sge->sge_len; - - SET_FIELD(fw_task_ctx->mstorm_st_context.flags.mflags, - ISCSI_MFLAGS_SINGLE_SGE, 1); - SET_FIELD(fw_task_ctx->mstorm_st_context.flags.mflags, - ISCSI_MFLAGS_SLOW_IO, 0); - fw_task_ctx->mstorm_st_context.sgl_size = 1; - fw_task_ctx->mstorm_st_context.rem_task_size = resp_sge->sge_len; - - /* Ustorm context */ - fw_task_ctx->ustorm_st_context.rem_rcv_len = resp_sge->sge_len; - fw_task_ctx->ustorm_st_context.exp_data_transfer_len = - ntoh24(login_hdr->dlength); - fw_task_ctx->ustorm_st_context.exp_data_sn = 0; - fw_task_ctx->ustorm_st_context.cq_rss_number = 0; - fw_task_ctx->ustorm_st_context.task_type = 0x2; - fw_task_ctx->ustorm_ag_context.icid = (u16)qedi_conn->iscsi_conn_id; - fw_task_ctx->ustorm_ag_context.exp_data_acked = - ntoh24(login_hdr->dlength); - SET_FIELD(fw_task_ctx->ustorm_ag_context.flags1, - USTORM_ISCSI_TASK_AG_CTX_R2T2RECV, 1); - SET_FIELD(fw_task_ctx->ustorm_st_context.flags, - USTORM_ISCSI_TASK_ST_CTX_LOCAL_COMP, 0); + qedi_update_itt_map(qedi, tid, 
task->itt, qedi_cmd); + login_req_pdu_header.itt = qedi_set_itt(tid, get_itt(task->itt)); + login_req_pdu_header.cid = qedi_conn->iscsi_conn_id; + login_req_pdu_header.cmd_sn = be32_to_cpu(login_hdr->cmdsn); + login_req_pdu_header.exp_stat_sn = be32_to_cpu(login_hdr->exp_statsn); + login_req_pdu_header.exp_stat_sn = 0; + + /* Fill tx AHS and rx buffer */ + tx_sgl_task_params.sgl = + (struct scsi_sge *)qedi_conn->gen_pdu.req_bd_tbl; + tx_sgl_task_params.sgl_phys_addr.lo = + (u32)(qedi_conn->gen_pdu.req_dma_addr); + tx_sgl_task_params.sgl_phys_addr.hi = + (u32)((u64)qedi_conn->gen_pdu.req_dma_addr >> 32); + tx_sgl_task_params.total_buffer_size = ntoh24(login_hdr->dlength); + tx_sgl_task_params.num_sges = 1; + + rx_sgl_task_params.sgl = + (struct scsi_sge *)qedi_conn->gen_pdu.resp_bd_tbl; + rx_sgl_task_params.sgl_phys_addr.lo = + (u32)(qedi_conn->gen_pdu.resp_dma_addr); + rx_sgl_task_params.sgl_phys_addr.hi = + (u32)((u64)qedi_conn->gen_pdu.resp_dma_addr >> 32); + rx_sgl_task_params.total_buffer_size = resp_sge->sge_len; + rx_sgl_task_params.num_sges = 1; + + /* Fill fw input params */ + task_params.context = fw_task_ctx; + task_params.conn_icid = (u16)qedi_conn->iscsi_conn_id; + task_params.itid = tid; + task_params.cq_rss_number = 0; + task_params.tx_io_size = ntoh24(login_hdr->dlength); + task_params.rx_io_size = resp_sge->sge_len; + + sq_idx = qedi_get_wqe_idx(qedi_conn); + task_params.sqe = &ep->sq[sq_idx]; + + memset(task_params.sqe, 0, sizeof(struct iscsi_wqe)); + rval = init_initiator_login_request_task(&task_params, + &login_req_pdu_header, + &tx_sgl_task_params, + &rx_sgl_task_params); + if (rval) + return -1; spin_lock(&qedi_conn->list_lock); list_add_tail(&qedi_cmd->io_cmd, &qedi_conn->active_cmd_list); @@ -1173,7 +1120,6 @@ int qedi_send_iscsi_login(struct qedi_conn *qedi_conn, qedi_conn->active_cmd_count++; spin_unlock(&qedi_conn->list_lock); - qedi_add_to_sq(qedi_conn, task, tid, ptu_invalidate, false); qedi_ring_doorbell(qedi_conn); return 0; } @@ -1181,65 +1127,64 @@ int qedi_send_iscsi_login(struct qedi_conn *qedi_conn, int qedi_send_iscsi_logout(struct qedi_conn *qedi_conn, struct iscsi_task *task) { - struct qedi_ctx *qedi = qedi_conn->qedi; - struct iscsi_logout_req_hdr *fw_logout_req = NULL; - struct iscsi_task_context *fw_task_ctx = NULL; + struct iscsi_logout_req_hdr logout_pdu_header; + struct scsi_sgl_task_params tx_sgl_task_params; + struct scsi_sgl_task_params rx_sgl_task_params; + struct iscsi_task_params task_params; + struct iscsi_task_context *fw_task_ctx; struct iscsi_logout *logout_hdr = NULL; - struct qedi_cmd *qedi_cmd = NULL; - s16 tid = 0; - s16 ptu_invalidate = 0; + struct qedi_ctx *qedi = qedi_conn->qedi; + struct qedi_cmd *qedi_cmd; + struct qedi_endpoint *ep; + s16 tid = 0; + u16 sq_idx = 0; + int rval = 0; qedi_cmd = (struct qedi_cmd *)task->dd_data; logout_hdr = (struct iscsi_logout *)task->hdr; + ep = qedi_conn->ep; tid = qedi_get_task_idx(qedi); if (tid == -1) return -ENOMEM; - fw_task_ctx = qedi_get_task_mem(&qedi->tasks, tid); - + fw_task_ctx = + (struct iscsi_task_context *)qedi_get_task_mem(&qedi->tasks, tid); memset(fw_task_ctx, 0, sizeof(struct iscsi_task_context)); + qedi_cmd->task_id = tid; - /* Ystorm context */ - fw_logout_req = &fw_task_ctx->ystorm_st_context.pdu_hdr.logout_req; - fw_logout_req->opcode = ISCSI_OPCODE_LOGOUT_REQUEST; - fw_logout_req->reason_code = 0x80 | logout_hdr->flags; - qedi_update_itt_map(qedi, tid, task->itt, qedi_cmd); - fw_logout_req->itt = qedi_set_itt(tid, get_itt(task->itt)); - fw_logout_req->exp_stat_sn = 
be32_to_cpu(logout_hdr->exp_statsn); - fw_logout_req->cmd_sn = be32_to_cpu(logout_hdr->cmdsn); + memset(&task_params, 0, sizeof(task_params)); + memset(&logout_pdu_header, 0, sizeof(logout_pdu_header)); + memset(&tx_sgl_task_params, 0, sizeof(tx_sgl_task_params)); + memset(&rx_sgl_task_params, 0, sizeof(rx_sgl_task_params)); - if (qedi->tid_reuse_count[tid] == QEDI_MAX_TASK_NUM) { - ptu_invalidate = 1; - qedi->tid_reuse_count[tid] = 0; - } - fw_task_ctx->ystorm_st_context.state.reuse_count = - qedi->tid_reuse_count[tid]; - fw_task_ctx->mstorm_st_context.reuse_count = - qedi->tid_reuse_count[tid]++; - fw_logout_req->cid = qedi_conn->iscsi_conn_id; - fw_task_ctx->ystorm_st_context.state.buffer_offset[0] = 0; - - /* Mstorm context */ - fw_task_ctx->mstorm_st_context.task_type = ISCSI_TASK_TYPE_MIDPATH; - fw_task_ctx->mstorm_ag_context.task_cid = (u16)qedi_conn->iscsi_conn_id; - - /* Ustorm context */ - fw_task_ctx->ustorm_st_context.rem_rcv_len = 0; - fw_task_ctx->ustorm_st_context.exp_data_transfer_len = 0; - fw_task_ctx->ustorm_st_context.exp_data_sn = 0; - fw_task_ctx->ustorm_st_context.task_type = ISCSI_TASK_TYPE_MIDPATH; - fw_task_ctx->ustorm_st_context.cq_rss_number = 0; - - SET_FIELD(fw_task_ctx->ustorm_st_context.flags, - USTORM_ISCSI_TASK_ST_CTX_LOCAL_COMP, 0); - SET_FIELD(fw_task_ctx->ustorm_st_context.reg1.reg1_map, - ISCSI_REG1_NUM_FAST_SGES, 0); - - fw_task_ctx->ustorm_ag_context.icid = (u16)qedi_conn->iscsi_conn_id; - SET_FIELD(fw_task_ctx->ustorm_ag_context.flags1, - USTORM_ISCSI_TASK_AG_CTX_R2T2RECV, 1); + /* Update header info */ + logout_pdu_header.opcode = logout_hdr->opcode; + logout_pdu_header.reason_code = 0x80 | logout_hdr->flags; + qedi_update_itt_map(qedi, tid, task->itt, qedi_cmd); + logout_pdu_header.itt = qedi_set_itt(tid, get_itt(task->itt)); + logout_pdu_header.exp_stat_sn = be32_to_cpu(logout_hdr->exp_statsn); + logout_pdu_header.cmd_sn = be32_to_cpu(logout_hdr->cmdsn); + logout_pdu_header.cid = qedi_conn->iscsi_conn_id; + + /* Fill fw input params */ + task_params.context = fw_task_ctx; + task_params.conn_icid = (u16)qedi_conn->iscsi_conn_id; + task_params.itid = tid; + task_params.cq_rss_number = 0; + task_params.tx_io_size = 0; + task_params.rx_io_size = 0; + + sq_idx = qedi_get_wqe_idx(qedi_conn); + task_params.sqe = &ep->sq[sq_idx]; + memset(task_params.sqe, 0, sizeof(struct iscsi_wqe)); + + rval = init_initiator_logout_request_task(&task_params, + &logout_pdu_header, + NULL, NULL); + if (rval) + return -1; spin_lock(&qedi_conn->list_lock); list_add_tail(&qedi_cmd->io_cmd, &qedi_conn->active_cmd_list); @@ -1247,9 +1192,7 @@ int qedi_send_iscsi_logout(struct qedi_conn *qedi_conn, qedi_conn->active_cmd_count++; spin_unlock(&qedi_conn->list_lock); - qedi_add_to_sq(qedi_conn, task, tid, ptu_invalidate, false); qedi_ring_doorbell(qedi_conn); - return 0; } @@ -1533,47 +1476,46 @@ ldel_exit: static int qedi_send_iscsi_tmf(struct qedi_conn *qedi_conn, struct iscsi_task *mtask) { - struct iscsi_conn *conn = qedi_conn->cls_conn->dd_data; + struct iscsi_tmf_request_hdr tmf_pdu_header; + struct iscsi_task_params task_params; struct qedi_ctx *qedi = qedi_conn->qedi; struct iscsi_task_context *fw_task_ctx; - struct iscsi_tmf_request_hdr *fw_tmf_request; - struct iscsi_sge *single_sge; - struct qedi_cmd *qedi_cmd; - struct qedi_cmd *cmd; + struct iscsi_conn *conn = qedi_conn->cls_conn->dd_data; struct iscsi_task *ctask; struct iscsi_tm *tmf_hdr; - struct iscsi_sge *req_sge; - struct iscsi_sge *resp_sge; - u32 lun[2]; - s16 tid = 0, ptu_invalidate = 0; + struct qedi_cmd 
*qedi_cmd; + struct qedi_cmd *cmd; + struct qedi_endpoint *ep; + u32 scsi_lun[2]; + s16 tid = 0; + u16 sq_idx = 0; + int rval = 0; - req_sge = (struct iscsi_sge *)qedi_conn->gen_pdu.req_bd_tbl; - resp_sge = (struct iscsi_sge *)qedi_conn->gen_pdu.resp_bd_tbl; - qedi_cmd = (struct qedi_cmd *)mtask->dd_data; tmf_hdr = (struct iscsi_tm *)mtask->hdr; + qedi_cmd = (struct qedi_cmd *)mtask->dd_data; + ep = qedi_conn->ep; - tid = qedi_cmd->task_id; - qedi_update_itt_map(qedi, tid, mtask->itt, qedi_cmd); + tid = qedi_get_task_idx(qedi); + if (tid == -1) + return -ENOMEM; - fw_task_ctx = qedi_get_task_mem(&qedi->tasks, tid); + fw_task_ctx = + (struct iscsi_task_context *)qedi_get_task_mem(&qedi->tasks, tid); memset(fw_task_ctx, 0, sizeof(struct iscsi_task_context)); - fw_tmf_request = &fw_task_ctx->ystorm_st_context.pdu_hdr.tmf_request; - fw_tmf_request->itt = qedi_set_itt(tid, get_itt(mtask->itt)); - fw_tmf_request->cmd_sn = be32_to_cpu(tmf_hdr->cmdsn); + qedi_cmd->task_id = tid; - memcpy(lun, &tmf_hdr->lun, sizeof(struct scsi_lun)); - fw_tmf_request->lun.lo = be32_to_cpu(lun[0]); - fw_tmf_request->lun.hi = be32_to_cpu(lun[1]); + memset(&task_params, 0, sizeof(task_params)); + memset(&tmf_pdu_header, 0, sizeof(tmf_pdu_header)); - if (qedi->tid_reuse_count[tid] == QEDI_MAX_TASK_NUM) { - ptu_invalidate = 1; - qedi->tid_reuse_count[tid] = 0; - } - fw_task_ctx->ystorm_st_context.state.reuse_count = - qedi->tid_reuse_count[tid]; - fw_task_ctx->mstorm_st_context.reuse_count = - qedi->tid_reuse_count[tid]++; + /* Update header info */ + qedi_update_itt_map(qedi, tid, mtask->itt, qedi_cmd); + tmf_pdu_header.itt = qedi_set_itt(tid, get_itt(mtask->itt)); + tmf_pdu_header.cmd_sn = be32_to_cpu(tmf_hdr->cmdsn); + + memcpy(scsi_lun, &tmf_hdr->lun, sizeof(struct scsi_lun)); + tmf_pdu_header.lun.lo = be32_to_cpu(scsi_lun[0]); + tmf_pdu_header.lun.hi = be32_to_cpu(scsi_lun[1]); if ((tmf_hdr->flags & ISCSI_FLAG_TM_FUNC_MASK) == ISCSI_TM_FUNC_ABORT_TASK) { @@ -1584,53 +1526,34 @@ static int qedi_send_iscsi_tmf(struct qedi_conn *qedi_conn, return 0; } cmd = (struct qedi_cmd *)ctask->dd_data; - fw_tmf_request->rtt = + tmf_pdu_header.rtt = qedi_set_itt(cmd->task_id, get_itt(tmf_hdr->rtt)); } else { - fw_tmf_request->rtt = ISCSI_RESERVED_TAG; + tmf_pdu_header.rtt = ISCSI_RESERVED_TAG; } - fw_tmf_request->opcode = tmf_hdr->opcode; - fw_tmf_request->function = tmf_hdr->flags; - fw_tmf_request->hdr_second_dword = ntoh24(tmf_hdr->dlength); - fw_tmf_request->ref_cmd_sn = be32_to_cpu(tmf_hdr->refcmdsn); - - single_sge = &fw_task_ctx->mstorm_st_context.sgl_union.single_sge; - fw_task_ctx->mstorm_st_context.task_type = ISCSI_TASK_TYPE_MIDPATH; - fw_task_ctx->mstorm_ag_context.task_cid = (u16)qedi_conn->iscsi_conn_id; - single_sge->sge_addr.lo = resp_sge->sge_addr.lo; - single_sge->sge_addr.hi = resp_sge->sge_addr.hi; - single_sge->sge_len = resp_sge->sge_len; - - SET_FIELD(fw_task_ctx->mstorm_st_context.flags.mflags, - ISCSI_MFLAGS_SINGLE_SGE, 1); - SET_FIELD(fw_task_ctx->mstorm_st_context.flags.mflags, - ISCSI_MFLAGS_SLOW_IO, 0); - fw_task_ctx->mstorm_st_context.sgl_size = 1; - fw_task_ctx->mstorm_st_context.rem_task_size = resp_sge->sge_len; - - /* Ustorm context */ - fw_task_ctx->ustorm_st_context.rem_rcv_len = 0; - fw_task_ctx->ustorm_st_context.exp_data_transfer_len = 0; - fw_task_ctx->ustorm_st_context.exp_data_sn = 0; - fw_task_ctx->ustorm_st_context.task_type = ISCSI_TASK_TYPE_MIDPATH; - fw_task_ctx->ustorm_st_context.cq_rss_number = 0; - - SET_FIELD(fw_task_ctx->ustorm_st_context.flags, - 
USTORM_ISCSI_TASK_ST_CTX_LOCAL_COMP, 0); - SET_FIELD(fw_task_ctx->ustorm_st_context.reg1.reg1_map, - ISCSI_REG1_NUM_FAST_SGES, 0); - - fw_task_ctx->ustorm_ag_context.icid = (u16)qedi_conn->iscsi_conn_id; - SET_FIELD(fw_task_ctx->ustorm_ag_context.flags1, - USTORM_ISCSI_TASK_AG_CTX_R2T2RECV, 1); - fw_task_ctx->ustorm_st_context.lun.lo = be32_to_cpu(lun[0]); - fw_task_ctx->ustorm_st_context.lun.hi = be32_to_cpu(lun[1]); + tmf_pdu_header.opcode = tmf_hdr->opcode; + tmf_pdu_header.function = tmf_hdr->flags; + tmf_pdu_header.hdr_second_dword = ntoh24(tmf_hdr->dlength); + tmf_pdu_header.ref_cmd_sn = be32_to_cpu(tmf_hdr->refcmdsn); - QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_SCSI_TM, - "Add TMF to SQ, tmf tid=0x%x, itt=0x%x, cid=0x%x\n", - tid, mtask->itt, qedi_conn->iscsi_conn_id); + /* Fill fw input params */ + task_params.context = fw_task_ctx; + task_params.conn_icid = (u16)qedi_conn->iscsi_conn_id; + task_params.itid = tid; + task_params.cq_rss_number = 0; + task_params.tx_io_size = 0; + task_params.rx_io_size = 0; + + sq_idx = qedi_get_wqe_idx(qedi_conn); + task_params.sqe = &ep->sq[sq_idx]; + + memset(task_params.sqe, 0, sizeof(struct iscsi_wqe)); + rval = init_initiator_tmf_request_task(&task_params, + &tmf_pdu_header); + if (rval) + return -1; spin_lock(&qedi_conn->list_lock); list_add_tail(&qedi_cmd->io_cmd, &qedi_conn->active_cmd_list); @@ -1638,7 +1561,6 @@ static int qedi_send_iscsi_tmf(struct qedi_conn *qedi_conn, qedi_conn->active_cmd_count++; spin_unlock(&qedi_conn->list_lock); - qedi_add_to_sq(qedi_conn, mtask, tid, ptu_invalidate, false); qedi_ring_doorbell(qedi_conn); return 0; } @@ -1689,101 +1611,98 @@ int qedi_iscsi_abort_work(struct qedi_conn *qedi_conn, int qedi_send_iscsi_text(struct qedi_conn *qedi_conn, struct iscsi_task *task) { - struct qedi_ctx *qedi = qedi_conn->qedi; + struct iscsi_text_request_hdr text_request_pdu_header; + struct scsi_sgl_task_params tx_sgl_task_params; + struct scsi_sgl_task_params rx_sgl_task_params; + struct iscsi_task_params task_params; struct iscsi_task_context *fw_task_ctx; - struct iscsi_text_request_hdr *fw_text_request; - struct iscsi_cached_sge_ctx *cached_sge; - struct iscsi_sge *single_sge; - struct qedi_cmd *qedi_cmd; - /* For 6.5 hdr iscsi_hdr */ + struct qedi_ctx *qedi = qedi_conn->qedi; struct iscsi_text *text_hdr; - struct iscsi_sge *req_sge; - struct iscsi_sge *resp_sge; - s16 ptu_invalidate = 0; + struct scsi_sge *req_sge = NULL; + struct scsi_sge *resp_sge = NULL; + struct qedi_cmd *qedi_cmd; + struct qedi_endpoint *ep; s16 tid = 0; + u16 sq_idx = 0; + int rval = 0; - req_sge = (struct iscsi_sge *)qedi_conn->gen_pdu.req_bd_tbl; - resp_sge = (struct iscsi_sge *)qedi_conn->gen_pdu.resp_bd_tbl; + req_sge = (struct scsi_sge *)qedi_conn->gen_pdu.req_bd_tbl; + resp_sge = (struct scsi_sge *)qedi_conn->gen_pdu.resp_bd_tbl; qedi_cmd = (struct qedi_cmd *)task->dd_data; text_hdr = (struct iscsi_text *)task->hdr; + ep = qedi_conn->ep; tid = qedi_get_task_idx(qedi); if (tid == -1) return -ENOMEM; - fw_task_ctx = qedi_get_task_mem(&qedi->tasks, tid); + fw_task_ctx = + (struct iscsi_task_context *)qedi_get_task_mem(&qedi->tasks, tid); memset(fw_task_ctx, 0, sizeof(struct iscsi_task_context)); qedi_cmd->task_id = tid; - /* Ystorm context */ - fw_text_request = - &fw_task_ctx->ystorm_st_context.pdu_hdr.text_request; - fw_text_request->opcode = text_hdr->opcode; - fw_text_request->flags_attr = text_hdr->flags; + memset(&task_params, 0, sizeof(task_params)); + memset(&text_request_pdu_header, 0, sizeof(text_request_pdu_header)); + 
memset(&tx_sgl_task_params, 0, sizeof(tx_sgl_task_params)); + memset(&rx_sgl_task_params, 0, sizeof(rx_sgl_task_params)); + + /* Update header info */ + text_request_pdu_header.opcode = text_hdr->opcode; + text_request_pdu_header.flags_attr = text_hdr->flags; qedi_update_itt_map(qedi, tid, task->itt, qedi_cmd); - fw_text_request->itt = qedi_set_itt(tid, get_itt(task->itt)); - fw_text_request->ttt = text_hdr->ttt; - fw_text_request->cmd_sn = be32_to_cpu(text_hdr->cmdsn); - fw_text_request->exp_stat_sn = be32_to_cpu(text_hdr->exp_statsn); - fw_text_request->hdr_second_dword = ntoh24(text_hdr->dlength); - - if (qedi->tid_reuse_count[tid] == QEDI_MAX_TASK_NUM) { - ptu_invalidate = 1; - qedi->tid_reuse_count[tid] = 0; - } - fw_task_ctx->ystorm_st_context.state.reuse_count = - qedi->tid_reuse_count[tid]; - fw_task_ctx->mstorm_st_context.reuse_count = - qedi->tid_reuse_count[tid]++; - - cached_sge = - &fw_task_ctx->ystorm_st_context.state.sgl_ctx_union.cached_sge; - cached_sge->sge.sge_len = req_sge->sge_len; - cached_sge->sge.sge_addr.lo = (u32)(qedi_conn->gen_pdu.req_dma_addr); - cached_sge->sge.sge_addr.hi = + text_request_pdu_header.itt = qedi_set_itt(tid, get_itt(task->itt)); + text_request_pdu_header.ttt = text_hdr->ttt; + text_request_pdu_header.cmd_sn = be32_to_cpu(text_hdr->cmdsn); + text_request_pdu_header.exp_stat_sn = be32_to_cpu(text_hdr->exp_statsn); + text_request_pdu_header.hdr_second_dword = ntoh24(text_hdr->dlength); + + /* Fill tx AHS and rx buffer */ + tx_sgl_task_params.sgl = + (struct scsi_sge *)qedi_conn->gen_pdu.req_bd_tbl; + tx_sgl_task_params.sgl_phys_addr.lo = + (u32)(qedi_conn->gen_pdu.req_dma_addr); + tx_sgl_task_params.sgl_phys_addr.hi = (u32)((u64)qedi_conn->gen_pdu.req_dma_addr >> 32); + tx_sgl_task_params.total_buffer_size = req_sge->sge_len; + tx_sgl_task_params.num_sges = 1; + + rx_sgl_task_params.sgl = + (struct scsi_sge *)qedi_conn->gen_pdu.resp_bd_tbl; + rx_sgl_task_params.sgl_phys_addr.lo = + (u32)(qedi_conn->gen_pdu.resp_dma_addr); + rx_sgl_task_params.sgl_phys_addr.hi = + (u32)((u64)qedi_conn->gen_pdu.resp_dma_addr >> 32); + rx_sgl_task_params.total_buffer_size = resp_sge->sge_len; + rx_sgl_task_params.num_sges = 1; + + /* Fill fw input params */ + task_params.context = fw_task_ctx; + task_params.conn_icid = (u16)qedi_conn->iscsi_conn_id; + task_params.itid = tid; + task_params.cq_rss_number = 0; + task_params.tx_io_size = ntoh24(text_hdr->dlength); + task_params.rx_io_size = resp_sge->sge_len; + + sq_idx = qedi_get_wqe_idx(qedi_conn); + task_params.sqe = &ep->sq[sq_idx]; + + memset(task_params.sqe, 0, sizeof(struct iscsi_wqe)); + rval = init_initiator_text_request_task(&task_params, + &text_request_pdu_header, + &tx_sgl_task_params, + &rx_sgl_task_params); + if (rval) + return -1; - /* Mstorm context */ - single_sge = &fw_task_ctx->mstorm_st_context.sgl_union.single_sge; - fw_task_ctx->mstorm_st_context.task_type = 0x2; - fw_task_ctx->mstorm_ag_context.task_cid = (u16)qedi_conn->iscsi_conn_id; - single_sge->sge_addr.lo = resp_sge->sge_addr.lo; - single_sge->sge_addr.hi = resp_sge->sge_addr.hi; - single_sge->sge_len = resp_sge->sge_len; - - SET_FIELD(fw_task_ctx->mstorm_st_context.flags.mflags, - ISCSI_MFLAGS_SINGLE_SGE, 1); - SET_FIELD(fw_task_ctx->mstorm_st_context.flags.mflags, - ISCSI_MFLAGS_SLOW_IO, 0); - fw_task_ctx->mstorm_st_context.sgl_size = 1; - fw_task_ctx->mstorm_st_context.rem_task_size = resp_sge->sge_len; - - /* Ustorm context */ - fw_task_ctx->ustorm_ag_context.exp_data_acked = - ntoh24(text_hdr->dlength); - 
fw_task_ctx->ustorm_st_context.rem_rcv_len = resp_sge->sge_len; - fw_task_ctx->ustorm_st_context.exp_data_transfer_len = - ntoh24(text_hdr->dlength); - fw_task_ctx->ustorm_st_context.exp_data_sn = - be32_to_cpu(text_hdr->exp_statsn); - fw_task_ctx->ustorm_st_context.cq_rss_number = 0; - fw_task_ctx->ustorm_st_context.task_type = 0x2; - fw_task_ctx->ustorm_ag_context.icid = (u16)qedi_conn->iscsi_conn_id; - SET_FIELD(fw_task_ctx->ustorm_ag_context.flags1, - USTORM_ISCSI_TASK_AG_CTX_R2T2RECV, 1); - - /* Add command in active command list */ spin_lock(&qedi_conn->list_lock); list_add_tail(&qedi_cmd->io_cmd, &qedi_conn->active_cmd_list); qedi_cmd->io_cmd_in_list = true; qedi_conn->active_cmd_count++; spin_unlock(&qedi_conn->list_lock); - qedi_add_to_sq(qedi_conn, task, tid, ptu_invalidate, false); qedi_ring_doorbell(qedi_conn); - return 0; } @@ -1791,58 +1710,62 @@ int qedi_send_iscsi_nopout(struct qedi_conn *qedi_conn, struct iscsi_task *task, char *datap, int data_len, int unsol) { + struct iscsi_nop_out_hdr nop_out_pdu_header; + struct scsi_sgl_task_params tx_sgl_task_params; + struct scsi_sgl_task_params rx_sgl_task_params; + struct iscsi_task_params task_params; struct qedi_ctx *qedi = qedi_conn->qedi; struct iscsi_task_context *fw_task_ctx; - struct iscsi_nop_out_hdr *fw_nop_out; - struct qedi_cmd *qedi_cmd; - /* For 6.5 hdr iscsi_hdr */ struct iscsi_nopout *nopout_hdr; - struct iscsi_cached_sge_ctx *cached_sge; - struct iscsi_sge *single_sge; - struct iscsi_sge *req_sge; - struct iscsi_sge *resp_sge; - u32 lun[2]; - s16 ptu_invalidate = 0; + struct scsi_sge *req_sge = NULL; + struct scsi_sge *resp_sge = NULL; + struct qedi_cmd *qedi_cmd; + struct qedi_endpoint *ep; + u32 scsi_lun[2]; s16 tid = 0; + u16 sq_idx = 0; + int rval = 0; - req_sge = (struct iscsi_sge *)qedi_conn->gen_pdu.req_bd_tbl; - resp_sge = (struct iscsi_sge *)qedi_conn->gen_pdu.resp_bd_tbl; + req_sge = (struct scsi_sge *)qedi_conn->gen_pdu.req_bd_tbl; + resp_sge = (struct scsi_sge *)qedi_conn->gen_pdu.resp_bd_tbl; qedi_cmd = (struct qedi_cmd *)task->dd_data; nopout_hdr = (struct iscsi_nopout *)task->hdr; + ep = qedi_conn->ep; tid = qedi_get_task_idx(qedi); - if (tid == -1) { - QEDI_WARN(&qedi->dbg_ctx, "Invalid tid\n"); + if (tid == -1) return -ENOMEM; - } - - fw_task_ctx = qedi_get_task_mem(&qedi->tasks, tid); + fw_task_ctx = + (struct iscsi_task_context *)qedi_get_task_mem(&qedi->tasks, tid); memset(fw_task_ctx, 0, sizeof(struct iscsi_task_context)); + qedi_cmd->task_id = tid; - /* Ystorm context */ - fw_nop_out = &fw_task_ctx->ystorm_st_context.pdu_hdr.nop_out; - SET_FIELD(fw_nop_out->flags_attr, ISCSI_NOP_OUT_HDR_CONST1, 1); - SET_FIELD(fw_nop_out->flags_attr, ISCSI_NOP_OUT_HDR_RSRV, 0); + memset(&task_params, 0, sizeof(task_params)); + memset(&nop_out_pdu_header, 0, sizeof(nop_out_pdu_header)); + memset(&tx_sgl_task_params, 0, sizeof(tx_sgl_task_params)); + memset(&rx_sgl_task_params, 0, sizeof(rx_sgl_task_params)); + + /* Update header info */ + nop_out_pdu_header.opcode = nopout_hdr->opcode; + SET_FIELD(nop_out_pdu_header.flags_attr, ISCSI_NOP_OUT_HDR_CONST1, 1); + SET_FIELD(nop_out_pdu_header.flags_attr, ISCSI_NOP_OUT_HDR_RSRV, 0); - memcpy(lun, &nopout_hdr->lun, sizeof(struct scsi_lun)); - fw_nop_out->lun.lo = be32_to_cpu(lun[0]); - fw_nop_out->lun.hi = be32_to_cpu(lun[1]); + memcpy(scsi_lun, &nopout_hdr->lun, sizeof(struct scsi_lun)); + nop_out_pdu_header.lun.lo = be32_to_cpu(scsi_lun[0]); + nop_out_pdu_header.lun.hi = be32_to_cpu(scsi_lun[1]); + nop_out_pdu_header.cmd_sn = be32_to_cpu(nopout_hdr->cmdsn); + 
nop_out_pdu_header.exp_stat_sn = be32_to_cpu(nopout_hdr->exp_statsn); qedi_update_itt_map(qedi, tid, task->itt, qedi_cmd); if (nopout_hdr->ttt != ISCSI_TTT_ALL_ONES) { - fw_nop_out->itt = be32_to_cpu(nopout_hdr->itt); - fw_nop_out->ttt = be32_to_cpu(nopout_hdr->ttt); - fw_task_ctx->ystorm_st_context.state.buffer_offset[0] = 0; - fw_task_ctx->ystorm_st_context.state.local_comp = 1; - SET_FIELD(fw_task_ctx->ustorm_st_context.flags, - USTORM_ISCSI_TASK_ST_CTX_LOCAL_COMP, 1); + nop_out_pdu_header.itt = be32_to_cpu(nopout_hdr->itt); + nop_out_pdu_header.ttt = be32_to_cpu(nopout_hdr->ttt); } else { - fw_nop_out->itt = qedi_set_itt(tid, get_itt(task->itt)); - fw_nop_out->ttt = ISCSI_TTT_ALL_ONES; - fw_task_ctx->ystorm_st_context.state.buffer_offset[0] = 0; + nop_out_pdu_header.itt = qedi_set_itt(tid, get_itt(task->itt)); + nop_out_pdu_header.ttt = ISCSI_TTT_ALL_ONES; spin_lock(&qedi_conn->list_lock); list_add_tail(&qedi_cmd->io_cmd, &qedi_conn->active_cmd_list); @@ -1851,53 +1774,46 @@ int qedi_send_iscsi_nopout(struct qedi_conn *qedi_conn, spin_unlock(&qedi_conn->list_lock); } - fw_nop_out->opcode = ISCSI_OPCODE_NOP_OUT; - fw_nop_out->cmd_sn = be32_to_cpu(nopout_hdr->cmdsn); - fw_nop_out->exp_stat_sn = be32_to_cpu(nopout_hdr->exp_statsn); - - cached_sge = - &fw_task_ctx->ystorm_st_context.state.sgl_ctx_union.cached_sge; - cached_sge->sge.sge_len = req_sge->sge_len; - cached_sge->sge.sge_addr.lo = (u32)(qedi_conn->gen_pdu.req_dma_addr); - cached_sge->sge.sge_addr.hi = - (u32)((u64)qedi_conn->gen_pdu.req_dma_addr >> 32); - - /* Mstorm context */ - fw_task_ctx->mstorm_st_context.task_type = ISCSI_TASK_TYPE_MIDPATH; - fw_task_ctx->mstorm_ag_context.task_cid = (u16)qedi_conn->iscsi_conn_id; - - single_sge = &fw_task_ctx->mstorm_st_context.sgl_union.single_sge; - single_sge->sge_addr.lo = resp_sge->sge_addr.lo; - single_sge->sge_addr.hi = resp_sge->sge_addr.hi; - single_sge->sge_len = resp_sge->sge_len; - fw_task_ctx->mstorm_st_context.rem_task_size = resp_sge->sge_len; - - if (qedi->tid_reuse_count[tid] == QEDI_MAX_TASK_NUM) { - ptu_invalidate = 1; - qedi->tid_reuse_count[tid] = 0; - } - fw_task_ctx->ystorm_st_context.state.reuse_count = - qedi->tid_reuse_count[tid]; - fw_task_ctx->mstorm_st_context.reuse_count = - qedi->tid_reuse_count[tid]++; - /* Ustorm context */ - fw_task_ctx->ustorm_st_context.rem_rcv_len = resp_sge->sge_len; - fw_task_ctx->ustorm_st_context.exp_data_transfer_len = data_len; - fw_task_ctx->ustorm_st_context.exp_data_sn = 0; - fw_task_ctx->ustorm_st_context.task_type = ISCSI_TASK_TYPE_MIDPATH; - fw_task_ctx->ustorm_st_context.cq_rss_number = 0; - - SET_FIELD(fw_task_ctx->ustorm_st_context.reg1.reg1_map, - ISCSI_REG1_NUM_FAST_SGES, 0); - - fw_task_ctx->ustorm_ag_context.icid = (u16)qedi_conn->iscsi_conn_id; - SET_FIELD(fw_task_ctx->ustorm_ag_context.flags1, - USTORM_ISCSI_TASK_AG_CTX_R2T2RECV, 1); - - fw_task_ctx->ustorm_st_context.lun.lo = be32_to_cpu(lun[0]); - fw_task_ctx->ustorm_st_context.lun.hi = be32_to_cpu(lun[1]); - - qedi_add_to_sq(qedi_conn, task, tid, ptu_invalidate, false); + /* Fill tx AHS and rx buffer */ + if (data_len) { + tx_sgl_task_params.sgl = + (struct scsi_sge *)qedi_conn->gen_pdu.req_bd_tbl; + tx_sgl_task_params.sgl_phys_addr.lo = + (u32)(qedi_conn->gen_pdu.req_dma_addr); + tx_sgl_task_params.sgl_phys_addr.hi = + (u32)((u64)qedi_conn->gen_pdu.req_dma_addr >> 32); + tx_sgl_task_params.total_buffer_size = data_len; + tx_sgl_task_params.num_sges = 1; + + rx_sgl_task_params.sgl = + (struct scsi_sge *)qedi_conn->gen_pdu.resp_bd_tbl; + 
rx_sgl_task_params.sgl_phys_addr.lo = + (u32)(qedi_conn->gen_pdu.resp_dma_addr); + rx_sgl_task_params.sgl_phys_addr.hi = + (u32)((u64)qedi_conn->gen_pdu.resp_dma_addr >> 32); + rx_sgl_task_params.total_buffer_size = resp_sge->sge_len; + rx_sgl_task_params.num_sges = 1; + } + + /* Fill fw input params */ + task_params.context = fw_task_ctx; + task_params.conn_icid = (u16)qedi_conn->iscsi_conn_id; + task_params.itid = tid; + task_params.cq_rss_number = 0; + task_params.tx_io_size = data_len; + task_params.rx_io_size = resp_sge->sge_len; + + sq_idx = qedi_get_wqe_idx(qedi_conn); + task_params.sqe = &ep->sq[sq_idx]; + + memset(task_params.sqe, 0, sizeof(struct iscsi_wqe)); + rval = init_initiator_nop_out_task(&task_params, + &nop_out_pdu_header, + &tx_sgl_task_params, + &rx_sgl_task_params); + if (rval) + return -1; + qedi_ring_doorbell(qedi_conn); return 0; } @@ -1905,7 +1821,7 @@ int qedi_send_iscsi_nopout(struct qedi_conn *qedi_conn, static int qedi_split_bd(struct qedi_cmd *cmd, u64 addr, int sg_len, int bd_index) { - struct iscsi_sge *bd = cmd->io_tbl.sge_tbl; + struct scsi_sge *bd = cmd->io_tbl.sge_tbl; int frag_size, sg_frags; sg_frags = 0; @@ -1938,7 +1854,7 @@ static int qedi_split_bd(struct qedi_cmd *cmd, u64 addr, int sg_len, static int qedi_map_scsi_sg(struct qedi_ctx *qedi, struct qedi_cmd *cmd) { struct scsi_cmnd *sc = cmd->scsi_cmd; - struct iscsi_sge *bd = cmd->io_tbl.sge_tbl; + struct scsi_sge *bd = cmd->io_tbl.sge_tbl; struct scatterlist *sg; int byte_count = 0; int bd_count = 0; @@ -2040,7 +1956,7 @@ static void qedi_iscsi_map_sg_list(struct qedi_cmd *cmd) if (bd_count == 0) return; } else { - struct iscsi_sge *bd = cmd->io_tbl.sge_tbl; + struct scsi_sge *bd = cmd->io_tbl.sge_tbl; bd[0].sge_addr.lo = 0; bd[0].sge_addr.hi = 0; @@ -2136,244 +2052,182 @@ int qedi_iscsi_send_ioreq(struct iscsi_task *task) struct qedi_conn *qedi_conn = conn->dd_data; struct qedi_cmd *cmd = task->dd_data; struct scsi_cmnd *sc = task->sc; + struct iscsi_cmd_hdr cmd_pdu_header; + struct scsi_sgl_task_params tx_sgl_task_params; + struct scsi_sgl_task_params rx_sgl_task_params; + struct scsi_sgl_task_params *prx_sgl = NULL; + struct scsi_sgl_task_params *ptx_sgl = NULL; + struct iscsi_task_params task_params; + struct iscsi_conn_params conn_params; + struct scsi_initiator_cmd_params cmd_params; struct iscsi_task_context *fw_task_ctx; - struct iscsi_cached_sge_ctx *cached_sge; - struct iscsi_phys_sgl_ctx *phys_sgl; - struct iscsi_virt_sgl_ctx *virt_sgl; - struct ystorm_iscsi_task_st_ctx *yst_cxt; - struct mstorm_iscsi_task_st_ctx *mst_cxt; - struct iscsi_sgl *sgl_struct; - struct iscsi_sge *single_sge; + struct iscsi_cls_conn *cls_conn; struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr; - struct iscsi_sge *bd = cmd->io_tbl.sge_tbl; - enum iscsi_task_type task_type; - struct iscsi_cmd_hdr *fw_cmd; - u32 lun[2]; - u32 exp_data; - u16 cq_idx = smp_processor_id() % qedi->num_queues; - s16 ptu_invalidate = 0; + enum iscsi_task_type task_type = MAX_ISCSI_TASK_TYPE; + struct qedi_endpoint *ep; + u32 scsi_lun[2]; s16 tid = 0; - u8 num_fast_sgs; + u16 sq_idx = 0; + u16 cq_idx; + int rval = 0; - tid = qedi_get_task_idx(qedi); - if (tid == -1) - return -ENOMEM; + ep = qedi_conn->ep; + cls_conn = qedi_conn->cls_conn; + conn = cls_conn->dd_data; qedi_iscsi_map_sg_list(cmd); + int_to_scsilun(sc->device->lun, (struct scsi_lun *)scsi_lun); - int_to_scsilun(sc->device->lun, (struct scsi_lun *)lun); - fw_task_ctx = qedi_get_task_mem(&qedi->tasks, tid); + tid = qedi_get_task_idx(qedi); + if (tid == -1) + 
return -ENOMEM; + fw_task_ctx = + (struct iscsi_task_context *)qedi_get_task_mem(&qedi->tasks, tid); memset(fw_task_ctx, 0, sizeof(struct iscsi_task_context)); - cmd->task_id = tid; - /* Ystorm context */ - fw_cmd = &fw_task_ctx->ystorm_st_context.pdu_hdr.cmd; - SET_FIELD(fw_cmd->flags_attr, ISCSI_CMD_HDR_ATTR, ISCSI_ATTR_SIMPLE); + cmd->task_id = tid; + memset(&task_params, 0, sizeof(task_params)); + memset(&cmd_pdu_header, 0, sizeof(cmd_pdu_header)); + memset(&tx_sgl_task_params, 0, sizeof(tx_sgl_task_params)); + memset(&rx_sgl_task_params, 0, sizeof(rx_sgl_task_params)); + memset(&conn_params, 0, sizeof(conn_params)); + memset(&cmd_params, 0, sizeof(cmd_params)); + + cq_idx = smp_processor_id() % qedi->num_queues; + /* Update header info */ + SET_FIELD(cmd_pdu_header.flags_attr, ISCSI_CMD_HDR_ATTR, + ISCSI_ATTR_SIMPLE); if (sc->sc_data_direction == DMA_TO_DEVICE) { - if (conn->session->initial_r2t_en) { - exp_data = min((conn->session->imm_data_en * - conn->max_xmit_dlength), - conn->session->first_burst); - exp_data = min(exp_data, scsi_bufflen(sc)); - fw_task_ctx->ustorm_ag_context.exp_data_acked = - cpu_to_le32(exp_data); - } else { - fw_task_ctx->ustorm_ag_context.exp_data_acked = - min(conn->session->first_burst, scsi_bufflen(sc)); - } - - SET_FIELD(fw_cmd->flags_attr, ISCSI_CMD_HDR_WRITE, 1); + SET_FIELD(cmd_pdu_header.flags_attr, + ISCSI_CMD_HDR_WRITE, 1); task_type = ISCSI_TASK_TYPE_INITIATOR_WRITE; } else { - if (scsi_bufflen(sc)) - SET_FIELD(fw_cmd->flags_attr, ISCSI_CMD_HDR_READ, 1); + SET_FIELD(cmd_pdu_header.flags_attr, + ISCSI_CMD_HDR_READ, 1); task_type = ISCSI_TASK_TYPE_INITIATOR_READ; } - fw_cmd->lun.lo = be32_to_cpu(lun[0]); - fw_cmd->lun.hi = be32_to_cpu(lun[1]); + cmd_pdu_header.lun.lo = be32_to_cpu(scsi_lun[0]); + cmd_pdu_header.lun.hi = be32_to_cpu(scsi_lun[1]); qedi_update_itt_map(qedi, tid, task->itt, cmd); - fw_cmd->itt = qedi_set_itt(tid, get_itt(task->itt)); - fw_cmd->expected_transfer_length = scsi_bufflen(sc); - fw_cmd->cmd_sn = be32_to_cpu(hdr->cmdsn); - fw_cmd->opcode = hdr->opcode; - qedi_cpy_scsi_cdb(sc, (u32 *)fw_cmd->cdb); - - /* Mstorm context */ - fw_task_ctx->mstorm_st_context.sense_db.lo = (u32)cmd->sense_buffer_dma; - fw_task_ctx->mstorm_st_context.sense_db.hi = - (u32)((u64)cmd->sense_buffer_dma >> 32); - fw_task_ctx->mstorm_ag_context.task_cid = qedi_conn->iscsi_conn_id; - fw_task_ctx->mstorm_st_context.task_type = task_type; - - if (qedi->tid_reuse_count[tid] == QEDI_MAX_TASK_NUM) { - ptu_invalidate = 1; - qedi->tid_reuse_count[tid] = 0; - } - fw_task_ctx->ystorm_st_context.state.reuse_count = - qedi->tid_reuse_count[tid]; - fw_task_ctx->mstorm_st_context.reuse_count = - qedi->tid_reuse_count[tid]++; - - /* Ustorm context */ - fw_task_ctx->ustorm_st_context.rem_rcv_len = scsi_bufflen(sc); - fw_task_ctx->ustorm_st_context.exp_data_transfer_len = scsi_bufflen(sc); - fw_task_ctx->ustorm_st_context.exp_data_sn = - be32_to_cpu(hdr->exp_statsn); - fw_task_ctx->ustorm_st_context.task_type = task_type; - fw_task_ctx->ustorm_st_context.cq_rss_number = cq_idx; - fw_task_ctx->ustorm_ag_context.icid = (u16)qedi_conn->iscsi_conn_id; - - SET_FIELD(fw_task_ctx->ustorm_ag_context.flags1, - USTORM_ISCSI_TASK_AG_CTX_R2T2RECV, 1); - SET_FIELD(fw_task_ctx->ustorm_st_context.flags, - USTORM_ISCSI_TASK_ST_CTX_LOCAL_COMP, 0); - - num_fast_sgs = (cmd->io_tbl.sge_valid ? 
- min((u16)QEDI_FAST_SGE_COUNT, - (u16)cmd->io_tbl.sge_valid) : 0); - SET_FIELD(fw_task_ctx->ustorm_st_context.reg1.reg1_map, - ISCSI_REG1_NUM_FAST_SGES, num_fast_sgs); - - fw_task_ctx->ustorm_st_context.lun.lo = be32_to_cpu(lun[0]); - fw_task_ctx->ustorm_st_context.lun.hi = be32_to_cpu(lun[1]); - - QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_IO, "Total sge count [%d]\n", - cmd->io_tbl.sge_valid); - - yst_cxt = &fw_task_ctx->ystorm_st_context; - mst_cxt = &fw_task_ctx->mstorm_st_context; - /* Tx path */ + cmd_pdu_header.itt = qedi_set_itt(tid, get_itt(task->itt)); + cmd_pdu_header.expected_transfer_length = cpu_to_be32(hdr->data_length); + cmd_pdu_header.hdr_second_dword = ntoh24(hdr->dlength); + cmd_pdu_header.cmd_sn = be32_to_cpu(hdr->cmdsn); + cmd_pdu_header.opcode = hdr->opcode; + qedi_cpy_scsi_cdb(sc, (u32 *)cmd_pdu_header.cdb); + + /* Fill tx AHS and rx buffer */ if (task_type == ISCSI_TASK_TYPE_INITIATOR_WRITE) { - /* not considering superIO or FastIO */ - if (cmd->io_tbl.sge_valid == 1) { - cached_sge = &yst_cxt->state.sgl_ctx_union.cached_sge; - cached_sge->sge.sge_addr.lo = bd[0].sge_addr.lo; - cached_sge->sge.sge_addr.hi = bd[0].sge_addr.hi; - cached_sge->sge.sge_len = bd[0].sge_len; - qedi->cached_sgls++; - } else if ((cmd->io_tbl.sge_valid != 1) && cmd->use_slowpath) { - SET_FIELD(fw_task_ctx->mstorm_st_context.flags.mflags, - ISCSI_MFLAGS_SLOW_IO, 1); - SET_FIELD(fw_task_ctx->ustorm_st_context.reg1.reg1_map, - ISCSI_REG1_NUM_FAST_SGES, 0); - phys_sgl = &yst_cxt->state.sgl_ctx_union.phys_sgl; - phys_sgl->sgl_base.lo = (u32)(cmd->io_tbl.sge_tbl_dma); - phys_sgl->sgl_base.hi = - (u32)((u64)cmd->io_tbl.sge_tbl_dma >> 32); - phys_sgl->sgl_size = cmd->io_tbl.sge_valid; - qedi->slow_sgls++; - } else if ((cmd->io_tbl.sge_valid != 1) && !cmd->use_slowpath) { - SET_FIELD(fw_task_ctx->mstorm_st_context.flags.mflags, - ISCSI_MFLAGS_SLOW_IO, 0); - SET_FIELD(fw_task_ctx->ustorm_st_context.reg1.reg1_map, - ISCSI_REG1_NUM_FAST_SGES, - min((u16)QEDI_FAST_SGE_COUNT, - (u16)cmd->io_tbl.sge_valid)); - virt_sgl = &yst_cxt->state.sgl_ctx_union.virt_sgl; - virt_sgl->sgl_base.lo = (u32)(cmd->io_tbl.sge_tbl_dma); - virt_sgl->sgl_base.hi = + tx_sgl_task_params.sgl = cmd->io_tbl.sge_tbl; + tx_sgl_task_params.sgl_phys_addr.lo = + (u32)(cmd->io_tbl.sge_tbl_dma); + tx_sgl_task_params.sgl_phys_addr.hi = (u32)((u64)cmd->io_tbl.sge_tbl_dma >> 32); - virt_sgl->sgl_initial_offset = - (u32)bd[0].sge_addr.lo & (QEDI_PAGE_SIZE - 1); - qedi->fast_sgls++; - } - fw_task_ctx->mstorm_st_context.sgl_size = cmd->io_tbl.sge_valid; - fw_task_ctx->mstorm_st_context.rem_task_size = scsi_bufflen(sc); - } else { - /* Rx path */ - if (cmd->io_tbl.sge_valid == 1) { - SET_FIELD(fw_task_ctx->mstorm_st_context.flags.mflags, - ISCSI_MFLAGS_SLOW_IO, 0); - SET_FIELD(fw_task_ctx->mstorm_st_context.flags.mflags, - ISCSI_MFLAGS_SINGLE_SGE, 1); - single_sge = &mst_cxt->sgl_union.single_sge; - single_sge->sge_addr.lo = bd[0].sge_addr.lo; - single_sge->sge_addr.hi = bd[0].sge_addr.hi; - single_sge->sge_len = bd[0].sge_len; - qedi->cached_sgls++; - } else if ((cmd->io_tbl.sge_valid != 1) && cmd->use_slowpath) { - sgl_struct = &mst_cxt->sgl_union.sgl_struct; - sgl_struct->sgl_addr.lo = - (u32)(cmd->io_tbl.sge_tbl_dma); - sgl_struct->sgl_addr.hi = - (u32)((u64)cmd->io_tbl.sge_tbl_dma >> 32); - SET_FIELD(fw_task_ctx->mstorm_st_context.flags.mflags, - ISCSI_MFLAGS_SLOW_IO, 1); - SET_FIELD(fw_task_ctx->ustorm_st_context.reg1.reg1_map, - ISCSI_REG1_NUM_FAST_SGES, 0); - sgl_struct->updated_sge_size = 0; - sgl_struct->updated_sge_offset = 0; - 
qedi->slow_sgls++;
- } else if ((cmd->io_tbl.sge_valid != 1) && !cmd->use_slowpath) {
- sgl_struct = &mst_cxt->sgl_union.sgl_struct;
- sgl_struct->sgl_addr.lo =
- (u32)(cmd->io_tbl.sge_tbl_dma);
- sgl_struct->sgl_addr.hi =
- (u32)((u64)cmd->io_tbl.sge_tbl_dma >> 32);
- sgl_struct->byte_offset =
- (u32)bd[0].sge_addr.lo & (QEDI_PAGE_SIZE - 1);
- SET_FIELD(fw_task_ctx->mstorm_st_context.flags.mflags,
-   ISCSI_MFLAGS_SLOW_IO, 0);
- SET_FIELD(fw_task_ctx->ustorm_st_context.reg1.reg1_map,
-   ISCSI_REG1_NUM_FAST_SGES, 0);
- sgl_struct->updated_sge_size = 0;
- sgl_struct->updated_sge_offset = 0;
- qedi->fast_sgls++;
- }
- fw_task_ctx->mstorm_st_context.sgl_size = cmd->io_tbl.sge_valid;
- fw_task_ctx->mstorm_st_context.rem_task_size = scsi_bufflen(sc);
- }
-
- if (cmd->io_tbl.sge_valid == 1)
- /* Singel-SGL */
- qedi->use_cached_sge = true;
- else {
+ tx_sgl_task_params.total_buffer_size = scsi_bufflen(sc);
+ tx_sgl_task_params.num_sges = cmd->io_tbl.sge_valid;
 if (cmd->use_slowpath)
- qedi->use_slow_sge = true;
- else
- qedi->use_fast_sge = true;
- }
+ tx_sgl_task_params.small_mid_sge = true;
+ } else if (task_type == ISCSI_TASK_TYPE_INITIATOR_READ) {
+ rx_sgl_task_params.sgl = cmd->io_tbl.sge_tbl;
+ rx_sgl_task_params.sgl_phys_addr.lo =
+ (u32)(cmd->io_tbl.sge_tbl_dma);
+ rx_sgl_task_params.sgl_phys_addr.hi =
+ (u32)((u64)cmd->io_tbl.sge_tbl_dma >> 32);
+ rx_sgl_task_params.total_buffer_size = scsi_bufflen(sc);
+ rx_sgl_task_params.num_sges = cmd->io_tbl.sge_valid;
+ }
+
+ /* Add conn param */
+ conn_params.first_burst_length = conn->session->first_burst;
+ conn_params.max_send_pdu_length = conn->max_xmit_dlength;
+ conn_params.max_burst_length = conn->session->max_burst;
+ if (conn->session->initial_r2t_en)
+ conn_params.initial_r2t = true;
+ if (conn->session->imm_data_en)
+ conn_params.immediate_data = true;
+
+ /* Add cmd params */
+ cmd_params.sense_data_buffer_phys_addr.lo = (u32)cmd->sense_buffer_dma;
+ cmd_params.sense_data_buffer_phys_addr.hi =
+ (u32)((u64)cmd->sense_buffer_dma >> 32);
+ /* Fill fw input params */
+ task_params.context = fw_task_ctx;
+ task_params.conn_icid = (u16)qedi_conn->iscsi_conn_id;
+ task_params.itid = tid;
+ task_params.cq_rss_number = cq_idx;
+ if (task_type == ISCSI_TASK_TYPE_INITIATOR_WRITE)
+ task_params.tx_io_size = scsi_bufflen(sc);
+ else if (task_type == ISCSI_TASK_TYPE_INITIATOR_READ)
+ task_params.rx_io_size = scsi_bufflen(sc);
+
+ sq_idx = qedi_get_wqe_idx(qedi_conn);
+ task_params.sqe = &ep->sq[sq_idx];
+
 QEDI_INFO(&qedi->dbg_ctx, QEDI_LOG_IO,
- "%s: %s-SGL: num_sges=0x%x first-sge-lo=0x%x first-sge-hi=0x%x",
+ "%s: %s-SGL: sg_len=0x%x num_sges=0x%x first-sge-lo=0x%x first-sge-hi=0x%x\n",
 (task_type == ISCSI_TASK_TYPE_INITIATOR_WRITE) ?
 "Write " : "Read ", (cmd->io_tbl.sge_valid == 1) ?
 "Single" : (cmd->use_slowpath ? "SLOW" : "FAST"),
- (u16)cmd->io_tbl.sge_valid, (u32)(cmd->io_tbl.sge_tbl_dma),
+ (u16)cmd->io_tbl.sge_valid, scsi_bufflen(sc),
+ (u32)(cmd->io_tbl.sge_tbl_dma),
 (u32)((u64)cmd->io_tbl.sge_tbl_dma >> 32));

- /* Add command in active command list */
+ memset(task_params.sqe, 0, sizeof(struct iscsi_wqe));
+
+ if (task_params.tx_io_size != 0)
+ ptx_sgl = &tx_sgl_task_params;
+ if (task_params.rx_io_size != 0)
+ prx_sgl = &rx_sgl_task_params;
+
+ rval = init_initiator_rw_iscsi_task(&task_params, &conn_params,
+     &cmd_params, &cmd_pdu_header,
+     ptx_sgl, prx_sgl,
+     NULL);
+ if (rval)
+ return -1;
+
 spin_lock(&qedi_conn->list_lock);
 list_add_tail(&cmd->io_cmd, &qedi_conn->active_cmd_list);
 cmd->io_cmd_in_list = true;
 qedi_conn->active_cmd_count++;
 spin_unlock(&qedi_conn->list_lock);

- qedi_add_to_sq(qedi_conn, task, tid, ptu_invalidate, false);
 qedi_ring_doorbell(qedi_conn);

- if (qedi_io_tracing)
- qedi_trace_io(qedi, task, tid, QEDI_IO_TRACE_REQ);
-
 return 0;
 }

 int qedi_iscsi_cleanup_task(struct iscsi_task *task, bool mark_cmd_node_deleted)
 {
+ struct iscsi_task_params task_params;
+ struct qedi_endpoint *ep;
 struct iscsi_conn *conn = task->conn;
 struct qedi_conn *qedi_conn = conn->dd_data;
 struct qedi_cmd *cmd = task->dd_data;
- s16 ptu_invalidate = 0;
+ u16 sq_idx = 0;
+ int rval = 0;

 QEDI_INFO(&qedi_conn->qedi->dbg_ctx, QEDI_LOG_SCSI_TM,
 "issue cleanup tid=0x%x itt=0x%x task_state=%d cmd_state=0%x cid=0x%x\n",
 cmd->task_id, get_itt(task->itt), task->state, cmd->state,
 qedi_conn->iscsi_conn_id);

- qedi_add_to_sq(qedi_conn, task, cmd->task_id, ptu_invalidate, true);
- qedi_ring_doorbell(qedi_conn);
+ memset(&task_params, 0, sizeof(task_params));
+ ep = qedi_conn->ep;
+
+ sq_idx = qedi_get_wqe_idx(qedi_conn);
+
+ task_params.sqe = &ep->sq[sq_idx];
+ memset(task_params.sqe, 0, sizeof(struct iscsi_wqe));
+ task_params.itid = cmd->task_id;
+ rval = init_cleanup_task(&task_params);
+ if (rval)
+ return rval;
+
+ qedi_ring_doorbell(qedi_conn);
 return 0;
 }
diff --git a/drivers/scsi/qedi/qedi_fw_api.c b/drivers/scsi/qedi/qedi_fw_api.c
new file mode 100644
index 000000000000..fd354d4e03eb
--- /dev/null
+++ b/drivers/scsi/qedi/qedi_fw_api.c
@@ -0,0 +1,781 @@
+/* QLogic iSCSI Offload Driver
+ * Copyright (c) 2016 Cavium Inc.
+ *
+ * This software is available under the terms of the GNU General Public License
+ * (GPL) Version 2, available from the file COPYING in the main directory of
+ * this source tree.
+ */
+
+#include <linux/types.h>
+#include <asm/byteorder.h>
+#include "qedi_hsi.h"
+#include <linux/qed/qed_if.h>
+
+#include "qedi_fw_iscsi.h"
+#include "qedi_fw_scsi.h"
+
+#define SCSI_NUM_SGES_IN_CACHE 0x4
+
+static bool scsi_is_slow_sgl(u16 num_sges, bool small_mid_sge)
+{
+ return (num_sges > SCSI_NUM_SGES_SLOW_SGL_THR && small_mid_sge);
+}
+
+static
+void init_scsi_sgl_context(struct scsi_sgl_params *ctx_sgl_params,
+    struct scsi_cached_sges *ctx_data_desc,
+    struct scsi_sgl_task_params *sgl_task_params)
+{
+ u8 sge_index;
+ u8 num_sges;
+ u32 val;
+
+ num_sges = (sgl_task_params->num_sges > SCSI_NUM_SGES_IN_CACHE) ?
+     SCSI_NUM_SGES_IN_CACHE : sgl_task_params->num_sges;
+ SCSI_NUM_SGES_IN_CACHE : sgl_task_params->num_sges; + + /* sgl params */ + val = cpu_to_le32(sgl_task_params->sgl_phys_addr.lo); + ctx_sgl_params->sgl_addr.lo = val; + val = cpu_to_le32(sgl_task_params->sgl_phys_addr.hi); + ctx_sgl_params->sgl_addr.hi = val; + val = cpu_to_le32(sgl_task_params->total_buffer_size); + ctx_sgl_params->sgl_total_length = val; + ctx_sgl_params->sgl_num_sges = cpu_to_le16(sgl_task_params->num_sges); + + for (sge_index = 0; sge_index < num_sges; sge_index++) { + val = cpu_to_le32(sgl_task_params->sgl[sge_index].sge_addr.lo); + ctx_data_desc->sge[sge_index].sge_addr.lo = val; + val = cpu_to_le32(sgl_task_params->sgl[sge_index].sge_addr.hi); + ctx_data_desc->sge[sge_index].sge_addr.hi = val; + val = cpu_to_le32(sgl_task_params->sgl[sge_index].sge_len); + ctx_data_desc->sge[sge_index].sge_len = val; + } +} + +static u32 calc_rw_task_size(struct iscsi_task_params *task_params, + enum iscsi_task_type task_type, + struct scsi_sgl_task_params *sgl_task_params, + struct scsi_dif_task_params *dif_task_params) +{ + u32 io_size; + + if (task_type == ISCSI_TASK_TYPE_INITIATOR_WRITE || + task_type == ISCSI_TASK_TYPE_TARGET_READ) + io_size = task_params->tx_io_size; + else + io_size = task_params->rx_io_size; + + if (!io_size) + return 0; + + if (!dif_task_params) + return io_size; + + return !dif_task_params->dif_on_network ? + io_size : sgl_task_params->total_buffer_size; +} + +static void +init_dif_context_flags(struct iscsi_dif_flags *ctx_dif_flags, + struct scsi_dif_task_params *dif_task_params) +{ + if (!dif_task_params) + return; + + SET_FIELD(ctx_dif_flags->flags, ISCSI_DIF_FLAGS_PROT_INTERVAL_SIZE_LOG, + dif_task_params->dif_block_size_log); + SET_FIELD(ctx_dif_flags->flags, ISCSI_DIF_FLAGS_DIF_TO_PEER, + dif_task_params->dif_on_network ? 1 : 0); + SET_FIELD(ctx_dif_flags->flags, ISCSI_DIF_FLAGS_HOST_INTERFACE, + dif_task_params->dif_on_host ? 
1 : 0); +} + +static void init_sqe(struct iscsi_task_params *task_params, + struct scsi_sgl_task_params *sgl_task_params, + struct scsi_dif_task_params *dif_task_params, + struct iscsi_common_hdr *pdu_header, + struct scsi_initiator_cmd_params *cmd_params, + enum iscsi_task_type task_type, + bool is_cleanup) +{ + if (!task_params->sqe) + return; + + memset(task_params->sqe, 0, sizeof(*task_params->sqe)); + task_params->sqe->task_id = cpu_to_le16(task_params->itid); + if (is_cleanup) { + SET_FIELD(task_params->sqe->flags, ISCSI_WQE_WQE_TYPE, + ISCSI_WQE_TYPE_TASK_CLEANUP); + return; + } + + switch (task_type) { + case ISCSI_TASK_TYPE_INITIATOR_WRITE: + { + u32 buf_size = 0; + u32 num_sges = 0; + + init_dif_context_flags(&task_params->sqe->prot_flags, + dif_task_params); + + SET_FIELD(task_params->sqe->flags, ISCSI_WQE_WQE_TYPE, + ISCSI_WQE_TYPE_NORMAL); + + if (task_params->tx_io_size) { + buf_size = calc_rw_task_size(task_params, task_type, + sgl_task_params, + dif_task_params); + + if (scsi_is_slow_sgl(sgl_task_params->num_sges, + sgl_task_params->small_mid_sge)) + num_sges = ISCSI_WQE_NUM_SGES_SLOWIO; + else + num_sges = min(sgl_task_params->num_sges, + (u16)SCSI_NUM_SGES_SLOW_SGL_THR); + } + + SET_FIELD(task_params->sqe->flags, ISCSI_WQE_NUM_SGES, num_sges); + SET_FIELD(task_params->sqe->contlen_cdbsize, ISCSI_WQE_CONT_LEN, + buf_size); + + if (GET_FIELD(pdu_header->hdr_second_dword, + ISCSI_CMD_HDR_TOTAL_AHS_LEN)) + SET_FIELD(task_params->sqe->contlen_cdbsize, ISCSI_WQE_CDB_SIZE, + cmd_params->extended_cdb_sge.sge_len); + } + break; + case ISCSI_TASK_TYPE_INITIATOR_READ: + SET_FIELD(task_params->sqe->flags, ISCSI_WQE_WQE_TYPE, + ISCSI_WQE_TYPE_NORMAL); + + if (GET_FIELD(pdu_header->hdr_second_dword, + ISCSI_CMD_HDR_TOTAL_AHS_LEN)) + SET_FIELD(task_params->sqe->contlen_cdbsize, + ISCSI_WQE_CDB_SIZE, + cmd_params->extended_cdb_sge.sge_len); + break; + case ISCSI_TASK_TYPE_LOGIN_RESPONSE: + case ISCSI_TASK_TYPE_MIDPATH: + { + bool advance_statsn = true; + + if (task_type == ISCSI_TASK_TYPE_LOGIN_RESPONSE) + SET_FIELD(task_params->sqe->flags, ISCSI_WQE_WQE_TYPE, + ISCSI_WQE_TYPE_LOGIN); + else + SET_FIELD(task_params->sqe->flags, ISCSI_WQE_WQE_TYPE, + ISCSI_WQE_TYPE_MIDDLE_PATH); + + if (task_type == ISCSI_TASK_TYPE_MIDPATH) { + u8 opcode = GET_FIELD(pdu_header->hdr_first_byte, + ISCSI_COMMON_HDR_OPCODE); + + if (opcode != ISCSI_OPCODE_TEXT_RESPONSE && + (opcode != ISCSI_OPCODE_NOP_IN || + pdu_header->itt == ISCSI_TTT_ALL_ONES)) + advance_statsn = false; + } + + SET_FIELD(task_params->sqe->flags, ISCSI_WQE_RESPONSE, + advance_statsn ? 
1 : 0); + + if (task_params->tx_io_size) { + SET_FIELD(task_params->sqe->contlen_cdbsize, + ISCSI_WQE_CONT_LEN, task_params->tx_io_size); + + if (scsi_is_slow_sgl(sgl_task_params->num_sges, + sgl_task_params->small_mid_sge)) + SET_FIELD(task_params->sqe->flags, ISCSI_WQE_NUM_SGES, + ISCSI_WQE_NUM_SGES_SLOWIO); + else + SET_FIELD(task_params->sqe->flags, ISCSI_WQE_NUM_SGES, + min(sgl_task_params->num_sges, + (u16)SCSI_NUM_SGES_SLOW_SGL_THR)); + } + } + break; + default: + break; + } +} + +static void init_default_iscsi_task(struct iscsi_task_params *task_params, + struct data_hdr *pdu_header, + enum iscsi_task_type task_type) +{ + struct iscsi_task_context *context; + u16 index; + u32 val; + + context = task_params->context; + memset(context, 0, sizeof(*context)); + + for (index = 0; index < + ARRAY_SIZE(context->ystorm_st_context.pdu_hdr.data.data); + index++) { + val = cpu_to_le32(pdu_header->data[index]); + context->ystorm_st_context.pdu_hdr.data.data[index] = val; + } + + context->mstorm_st_context.task_type = task_type; + context->mstorm_ag_context.task_cid = + cpu_to_le16(task_params->conn_icid); + + SET_FIELD(context->ustorm_ag_context.flags1, + USTORM_ISCSI_TASK_AG_CTX_R2T2RECV, 1); + + context->ustorm_st_context.task_type = task_type; + context->ustorm_st_context.cq_rss_number = task_params->cq_rss_number; + context->ustorm_ag_context.icid = cpu_to_le16(task_params->conn_icid); +} + +static +void init_initiator_rw_cdb_ystorm_context(struct ystorm_iscsi_task_st_ctx *ystc, + struct scsi_initiator_cmd_params *cmd) +{ + union iscsi_task_hdr *ctx_pdu_hdr = &ystc->pdu_hdr; + u32 val; + + if (!cmd->extended_cdb_sge.sge_len) + return; + + SET_FIELD(ctx_pdu_hdr->ext_cdb_cmd.hdr_second_dword, + ISCSI_EXT_CDB_CMD_HDR_CDB_SIZE, + cmd->extended_cdb_sge.sge_len); + val = cpu_to_le32(cmd->extended_cdb_sge.sge_addr.lo); + ctx_pdu_hdr->ext_cdb_cmd.cdb_sge.sge_addr.lo = val; + val = cpu_to_le32(cmd->extended_cdb_sge.sge_addr.hi); + ctx_pdu_hdr->ext_cdb_cmd.cdb_sge.sge_addr.hi = val; + val = cpu_to_le32(cmd->extended_cdb_sge.sge_len); + ctx_pdu_hdr->ext_cdb_cmd.cdb_sge.sge_len = val; +} + +static +void init_ustorm_task_contexts(struct ustorm_iscsi_task_st_ctx *ustorm_st_cxt, + struct ustorm_iscsi_task_ag_ctx *ustorm_ag_cxt, + u32 remaining_recv_len, + u32 expected_data_transfer_len, + u8 num_sges, bool tx_dif_conn_err_en) +{ + u32 val; + + ustorm_st_cxt->rem_rcv_len = cpu_to_le32(remaining_recv_len); + ustorm_ag_cxt->exp_data_acked = cpu_to_le32(expected_data_transfer_len); + val = cpu_to_le32(expected_data_transfer_len); + ustorm_st_cxt->exp_data_transfer_len = val; + SET_FIELD(ustorm_st_cxt->reg1.reg1_map, ISCSI_REG1_NUM_SGES, num_sges); + SET_FIELD(ustorm_ag_cxt->flags2, + USTORM_ISCSI_TASK_AG_CTX_DIF_ERROR_CF_EN, + tx_dif_conn_err_en ? 
1 : 0); +} + +static +void set_rw_exp_data_acked_and_cont_len(struct iscsi_task_context *context, + struct iscsi_conn_params *conn_params, + enum iscsi_task_type task_type, + u32 task_size, + u32 exp_data_transfer_len, + u8 total_ahs_length) +{ + u32 max_unsolicited_data = 0, val; + + if (total_ahs_length && + (task_type == ISCSI_TASK_TYPE_INITIATOR_WRITE || + task_type == ISCSI_TASK_TYPE_INITIATOR_READ)) + SET_FIELD(context->ustorm_st_context.flags2, + USTORM_ISCSI_TASK_ST_CTX_AHS_EXIST, 1); + + switch (task_type) { + case ISCSI_TASK_TYPE_INITIATOR_WRITE: + if (!conn_params->initial_r2t) + max_unsolicited_data = conn_params->first_burst_length; + else if (conn_params->immediate_data) + max_unsolicited_data = + min(conn_params->first_burst_length, + conn_params->max_send_pdu_length); + + context->ustorm_ag_context.exp_data_acked = + cpu_to_le32(total_ahs_length == 0 ? + min(exp_data_transfer_len, + max_unsolicited_data) : + ((u32)(total_ahs_length + + ISCSI_AHS_CNTL_SIZE))); + break; + case ISCSI_TASK_TYPE_TARGET_READ: + val = cpu_to_le32(exp_data_transfer_len); + context->ustorm_ag_context.exp_data_acked = val; + break; + case ISCSI_TASK_TYPE_INITIATOR_READ: + context->ustorm_ag_context.exp_data_acked = + cpu_to_le32((total_ahs_length == 0 ? 0 : + total_ahs_length + + ISCSI_AHS_CNTL_SIZE)); + break; + case ISCSI_TASK_TYPE_TARGET_WRITE: + val = cpu_to_le32(task_size); + context->ustorm_ag_context.exp_cont_len = val; + break; + default: + break; + } +} + +static +void init_rtdif_task_context(struct rdif_task_context *rdif_context, + struct tdif_task_context *tdif_context, + struct scsi_dif_task_params *dif_task_params, + enum iscsi_task_type task_type) +{ + u32 val; + + if (!dif_task_params->dif_on_network || !dif_task_params->dif_on_host) + return; + + if (task_type == ISCSI_TASK_TYPE_TARGET_WRITE || + task_type == ISCSI_TASK_TYPE_INITIATOR_READ) { + rdif_context->app_tag_value = + cpu_to_le16(dif_task_params->application_tag); + rdif_context->partial_crc_value = cpu_to_le16(0xffff); + val = cpu_to_le32(dif_task_params->initial_ref_tag); + rdif_context->initial_ref_tag = val; + rdif_context->app_tag_mask = + cpu_to_le16(dif_task_params->application_tag_mask); + SET_FIELD(rdif_context->flags0, RDIF_TASK_CONTEXT_CRC_SEED, + dif_task_params->crc_seed ? 1 : 0); + SET_FIELD(rdif_context->flags0, RDIF_TASK_CONTEXT_HOSTGUARDTYPE, + dif_task_params->host_guard_type); + SET_FIELD(rdif_context->flags0, + RDIF_TASK_CONTEXT_PROTECTIONTYPE, + dif_task_params->protection_type); + SET_FIELD(rdif_context->flags0, + RDIF_TASK_CONTEXT_INITIALREFTAGVALID, 1); + SET_FIELD(rdif_context->flags0, + RDIF_TASK_CONTEXT_KEEPREFTAGCONST, + dif_task_params->keep_ref_tag_const ? 1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_VALIDATEAPPTAG, + (dif_task_params->validate_app_tag && + dif_task_params->dif_on_network) ? 1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_VALIDATEGUARD, + (dif_task_params->validate_guard && + dif_task_params->dif_on_network) ? 1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_VALIDATEREFTAG, + (dif_task_params->validate_ref_tag && + dif_task_params->dif_on_network) ? 1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_HOSTINTERFACE, + dif_task_params->dif_on_host ? 1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_NETWORKINTERFACE, + dif_task_params->dif_on_network ? 1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_FORWARDGUARD, + dif_task_params->forward_guard ? 
1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_FORWARDAPPTAG, + dif_task_params->forward_app_tag ? 1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_FORWARDREFTAG, + dif_task_params->forward_ref_tag ? 1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_FORWARDAPPTAGWITHMASK, + dif_task_params->forward_app_tag_with_mask ? 1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_FORWARDREFTAGWITHMASK, + dif_task_params->forward_ref_tag_with_mask ? 1 : 0); + SET_FIELD(rdif_context->flags1, + RDIF_TASK_CONTEXT_INTERVALSIZE, + dif_task_params->dif_block_size_log - 9); + SET_FIELD(rdif_context->state, + RDIF_TASK_CONTEXT_REFTAGMASK, + dif_task_params->ref_tag_mask); + SET_FIELD(rdif_context->state, RDIF_TASK_CONTEXT_IGNOREAPPTAG, + dif_task_params->ignore_app_tag); + } + + if (task_type == ISCSI_TASK_TYPE_TARGET_READ || + task_type == ISCSI_TASK_TYPE_INITIATOR_WRITE) { + tdif_context->app_tag_value = + cpu_to_le16(dif_task_params->application_tag); + tdif_context->partial_crc_valueB = + cpu_to_le16(dif_task_params->crc_seed ? 0xffff : 0x0000); + tdif_context->partial_crc_value_a = + cpu_to_le16(dif_task_params->crc_seed ? 0xffff : 0x0000); + SET_FIELD(tdif_context->flags0, TDIF_TASK_CONTEXT_CRC_SEED, + dif_task_params->crc_seed ? 1 : 0); + + SET_FIELD(tdif_context->flags0, + TDIF_TASK_CONTEXT_SETERRORWITHEOP, + dif_task_params->tx_dif_conn_err_en ? 1 : 0); + SET_FIELD(tdif_context->flags1, TDIF_TASK_CONTEXT_FORWARDGUARD, + dif_task_params->forward_guard ? 1 : 0); + SET_FIELD(tdif_context->flags1, TDIF_TASK_CONTEXT_FORWARDAPPTAG, + dif_task_params->forward_app_tag ? 1 : 0); + SET_FIELD(tdif_context->flags1, TDIF_TASK_CONTEXT_FORWARDREFTAG, + dif_task_params->forward_ref_tag ? 1 : 0); + SET_FIELD(tdif_context->flags1, TDIF_TASK_CONTEXT_INTERVALSIZE, + dif_task_params->dif_block_size_log - 9); + SET_FIELD(tdif_context->flags1, TDIF_TASK_CONTEXT_HOSTINTERFACE, + dif_task_params->dif_on_host ? 1 : 0); + SET_FIELD(tdif_context->flags1, + TDIF_TASK_CONTEXT_NETWORKINTERFACE, + dif_task_params->dif_on_network ? 1 : 0); + val = cpu_to_le32(dif_task_params->initial_ref_tag); + tdif_context->initial_ref_tag = val; + tdif_context->app_tag_mask = + cpu_to_le16(dif_task_params->application_tag_mask); + SET_FIELD(tdif_context->flags0, + TDIF_TASK_CONTEXT_HOSTGUARDTYPE, + dif_task_params->host_guard_type); + SET_FIELD(tdif_context->flags0, + TDIF_TASK_CONTEXT_PROTECTIONTYPE, + dif_task_params->protection_type); + SET_FIELD(tdif_context->flags0, + TDIF_TASK_CONTEXT_INITIALREFTAGVALID, + dif_task_params->initial_ref_tag_is_valid ? 1 : 0); + SET_FIELD(tdif_context->flags0, + TDIF_TASK_CONTEXT_KEEPREFTAGCONST, + dif_task_params->keep_ref_tag_const ? 1 : 0); + SET_FIELD(tdif_context->flags1, TDIF_TASK_CONTEXT_VALIDATEGUARD, + (dif_task_params->validate_guard && + dif_task_params->dif_on_host) ? 1 : 0); + SET_FIELD(tdif_context->flags1, + TDIF_TASK_CONTEXT_VALIDATEAPPTAG, + (dif_task_params->validate_app_tag && + dif_task_params->dif_on_host) ? 1 : 0); + SET_FIELD(tdif_context->flags1, + TDIF_TASK_CONTEXT_VALIDATEREFTAG, + (dif_task_params->validate_ref_tag && + dif_task_params->dif_on_host) ? 1 : 0); + SET_FIELD(tdif_context->flags1, + TDIF_TASK_CONTEXT_FORWARDAPPTAGWITHMASK, + dif_task_params->forward_app_tag_with_mask ? 1 : 0); + SET_FIELD(tdif_context->flags1, + TDIF_TASK_CONTEXT_FORWARDREFTAGWITHMASK, + dif_task_params->forward_ref_tag_with_mask ? 
1 : 0); + SET_FIELD(tdif_context->flags1, + TDIF_TASK_CONTEXT_REFTAGMASK, + dif_task_params->ref_tag_mask); + SET_FIELD(tdif_context->flags0, + TDIF_TASK_CONTEXT_IGNOREAPPTAG, + dif_task_params->ignore_app_tag ? 1 : 0); + } +} + +static void set_local_completion_context(struct iscsi_task_context *context) +{ + SET_FIELD(context->ystorm_st_context.state.flags, + YSTORM_ISCSI_TASK_STATE_LOCAL_COMP, 1); + SET_FIELD(context->ustorm_st_context.flags, + USTORM_ISCSI_TASK_ST_CTX_LOCAL_COMP, 1); +} + +static int init_rw_iscsi_task(struct iscsi_task_params *task_params, + enum iscsi_task_type task_type, + struct iscsi_conn_params *conn_params, + struct iscsi_common_hdr *pdu_header, + struct scsi_sgl_task_params *sgl_task_params, + struct scsi_initiator_cmd_params *cmd_params, + struct scsi_dif_task_params *dif_task_params) +{ + u32 exp_data_transfer_len = conn_params->max_burst_length; + struct iscsi_task_context *cxt; + bool slow_io = false; + u32 task_size, val; + u8 num_sges = 0; + + task_size = calc_rw_task_size(task_params, task_type, sgl_task_params, + dif_task_params); + + init_default_iscsi_task(task_params, (struct data_hdr *)pdu_header, + task_type); + + cxt = task_params->context; + + val = cpu_to_le32(task_size); + cxt->ystorm_st_context.pdu_hdr.cmd.expected_transfer_length = val; + init_initiator_rw_cdb_ystorm_context(&cxt->ystorm_st_context, + cmd_params); + val = cpu_to_le32(cmd_params->sense_data_buffer_phys_addr.lo); + cxt->mstorm_st_context.sense_db.lo = val; + + val = cpu_to_le32(cmd_params->sense_data_buffer_phys_addr.hi); + cxt->mstorm_st_context.sense_db.hi = val; + + if (task_params->tx_io_size) { + init_dif_context_flags(&cxt->ystorm_st_context.state.dif_flags, + dif_task_params); + init_scsi_sgl_context(&cxt->ystorm_st_context.state.sgl_params, + &cxt->ystorm_st_context.state.data_desc, + sgl_task_params); + + slow_io = scsi_is_slow_sgl(sgl_task_params->num_sges, + sgl_task_params->small_mid_sge); + + num_sges = !slow_io ? min_t(u16, sgl_task_params->num_sges, + (u16)SCSI_NUM_SGES_SLOW_SGL_THR) : + ISCSI_WQE_NUM_SGES_SLOWIO; + + if (slow_io) { + SET_FIELD(cxt->ystorm_st_context.state.flags, + YSTORM_ISCSI_TASK_STATE_SLOW_IO, 1); + } + } else if (task_params->rx_io_size) { + init_dif_context_flags(&cxt->mstorm_st_context.dif_flags, + dif_task_params); + init_scsi_sgl_context(&cxt->mstorm_st_context.sgl_params, + &cxt->mstorm_st_context.data_desc, + sgl_task_params); + num_sges = !scsi_is_slow_sgl(sgl_task_params->num_sges, + sgl_task_params->small_mid_sge) ? + min_t(u16, sgl_task_params->num_sges, + (u16)SCSI_NUM_SGES_SLOW_SGL_THR) : + ISCSI_WQE_NUM_SGES_SLOWIO; + cxt->mstorm_st_context.rem_task_size = cpu_to_le32(task_size); + } + + if (exp_data_transfer_len > task_size || + task_type != ISCSI_TASK_TYPE_TARGET_WRITE) + exp_data_transfer_len = task_size; + + init_ustorm_task_contexts(&task_params->context->ustorm_st_context, + &task_params->context->ustorm_ag_context, + task_size, exp_data_transfer_len, num_sges, + dif_task_params ? 
+ dif_task_params->tx_dif_conn_err_en : false); + + set_rw_exp_data_acked_and_cont_len(task_params->context, conn_params, + task_type, task_size, + exp_data_transfer_len, + GET_FIELD(pdu_header->hdr_second_dword, + ISCSI_CMD_HDR_TOTAL_AHS_LEN)); + + if (dif_task_params) + init_rtdif_task_context(&task_params->context->rdif_context, + &task_params->context->tdif_context, + dif_task_params, task_type); + + init_sqe(task_params, sgl_task_params, dif_task_params, pdu_header, + cmd_params, task_type, false); + + return 0; +} + +int init_initiator_rw_iscsi_task(struct iscsi_task_params *task_params, + struct iscsi_conn_params *conn_params, + struct scsi_initiator_cmd_params *cmd_params, + struct iscsi_cmd_hdr *cmd_header, + struct scsi_sgl_task_params *tx_sgl_params, + struct scsi_sgl_task_params *rx_sgl_params, + struct scsi_dif_task_params *dif_task_params) +{ + if (GET_FIELD(cmd_header->flags_attr, ISCSI_CMD_HDR_WRITE)) + return init_rw_iscsi_task(task_params, + ISCSI_TASK_TYPE_INITIATOR_WRITE, + conn_params, + (struct iscsi_common_hdr *)cmd_header, + tx_sgl_params, cmd_params, + dif_task_params); + else if (GET_FIELD(cmd_header->flags_attr, ISCSI_CMD_HDR_READ)) + return init_rw_iscsi_task(task_params, + ISCSI_TASK_TYPE_INITIATOR_READ, + conn_params, + (struct iscsi_common_hdr *)cmd_header, + rx_sgl_params, cmd_params, + dif_task_params); + else + return -1; +} + +int init_initiator_login_request_task(struct iscsi_task_params *task_params, + struct iscsi_login_req_hdr *login_header, + struct scsi_sgl_task_params *tx_params, + struct scsi_sgl_task_params *rx_params) +{ + struct iscsi_task_context *cxt; + + cxt = task_params->context; + + init_default_iscsi_task(task_params, + (struct data_hdr *)login_header, + ISCSI_TASK_TYPE_MIDPATH); + + init_ustorm_task_contexts(&cxt->ustorm_st_context, + &cxt->ustorm_ag_context, + task_params->rx_io_size ? + rx_params->total_buffer_size : 0, + task_params->tx_io_size ? + tx_params->total_buffer_size : 0, 0, + 0); + + if (task_params->tx_io_size) + init_scsi_sgl_context(&cxt->ystorm_st_context.state.sgl_params, + &cxt->ystorm_st_context.state.data_desc, + tx_params); + + if (task_params->rx_io_size) + init_scsi_sgl_context(&cxt->mstorm_st_context.sgl_params, + &cxt->mstorm_st_context.data_desc, + rx_params); + + cxt->mstorm_st_context.rem_task_size = + cpu_to_le32(task_params->rx_io_size ? + rx_params->total_buffer_size : 0); + + init_sqe(task_params, tx_params, NULL, + (struct iscsi_common_hdr *)login_header, NULL, + ISCSI_TASK_TYPE_MIDPATH, false); + + return 0; +} + +int init_initiator_nop_out_task(struct iscsi_task_params *task_params, + struct iscsi_nop_out_hdr *nop_out_pdu_header, + struct scsi_sgl_task_params *tx_sgl_task_params, + struct scsi_sgl_task_params *rx_sgl_task_params) +{ + struct iscsi_task_context *cxt; + + cxt = task_params->context; + + init_default_iscsi_task(task_params, + (struct data_hdr *)nop_out_pdu_header, + ISCSI_TASK_TYPE_MIDPATH); + + if (nop_out_pdu_header->itt == ISCSI_ITT_ALL_ONES) + set_local_completion_context(task_params->context); + + if (task_params->tx_io_size) + init_scsi_sgl_context(&cxt->ystorm_st_context.state.sgl_params, + &cxt->ystorm_st_context.state.data_desc, + tx_sgl_task_params); + + if (task_params->rx_io_size) + init_scsi_sgl_context(&cxt->mstorm_st_context.sgl_params, + &cxt->mstorm_st_context.data_desc, + rx_sgl_task_params); + + init_ustorm_task_contexts(&cxt->ustorm_st_context, + &cxt->ustorm_ag_context, + task_params->rx_io_size ? 
+ rx_sgl_task_params->total_buffer_size : 0, + task_params->tx_io_size ? + tx_sgl_task_params->total_buffer_size : 0, + 0, 0); + + cxt->mstorm_st_context.rem_task_size = + cpu_to_le32(task_params->rx_io_size ? + rx_sgl_task_params->total_buffer_size : + 0); + + init_sqe(task_params, tx_sgl_task_params, NULL, + (struct iscsi_common_hdr *)nop_out_pdu_header, NULL, + ISCSI_TASK_TYPE_MIDPATH, false); + + return 0; +} + +int init_initiator_logout_request_task(struct iscsi_task_params *task_params, + struct iscsi_logout_req_hdr *logout_hdr, + struct scsi_sgl_task_params *tx_params, + struct scsi_sgl_task_params *rx_params) +{ + struct iscsi_task_context *cxt; + + cxt = task_params->context; + + init_default_iscsi_task(task_params, + (struct data_hdr *)logout_hdr, + ISCSI_TASK_TYPE_MIDPATH); + + if (task_params->tx_io_size) + init_scsi_sgl_context(&cxt->ystorm_st_context.state.sgl_params, + &cxt->ystorm_st_context.state.data_desc, + tx_params); + + if (task_params->rx_io_size) + init_scsi_sgl_context(&cxt->mstorm_st_context.sgl_params, + &cxt->mstorm_st_context.data_desc, + rx_params); + + init_ustorm_task_contexts(&cxt->ustorm_st_context, + &cxt->ustorm_ag_context, + task_params->rx_io_size ? + rx_params->total_buffer_size : 0, + task_params->tx_io_size ? + tx_params->total_buffer_size : 0, + 0, 0); + + cxt->mstorm_st_context.rem_task_size = + cpu_to_le32(task_params->rx_io_size ? + rx_params->total_buffer_size : 0); + + init_sqe(task_params, tx_params, NULL, + (struct iscsi_common_hdr *)logout_hdr, NULL, + ISCSI_TASK_TYPE_MIDPATH, false); + + return 0; +} + +int init_initiator_tmf_request_task(struct iscsi_task_params *task_params, + struct iscsi_tmf_request_hdr *tmf_header) +{ + init_default_iscsi_task(task_params, (struct data_hdr *)tmf_header, + ISCSI_TASK_TYPE_MIDPATH); + + init_sqe(task_params, NULL, NULL, + (struct iscsi_common_hdr *)tmf_header, NULL, + ISCSI_TASK_TYPE_MIDPATH, false); + + return 0; +} + +int init_initiator_text_request_task(struct iscsi_task_params *task_params, + struct iscsi_text_request_hdr *text_header, + struct scsi_sgl_task_params *tx_params, + struct scsi_sgl_task_params *rx_params) +{ + struct iscsi_task_context *cxt; + + cxt = task_params->context; + + init_default_iscsi_task(task_params, + (struct data_hdr *)text_header, + ISCSI_TASK_TYPE_MIDPATH); + + if (task_params->tx_io_size) + init_scsi_sgl_context(&cxt->ystorm_st_context.state.sgl_params, + &cxt->ystorm_st_context.state.data_desc, + tx_params); + + if (task_params->rx_io_size) + init_scsi_sgl_context(&cxt->mstorm_st_context.sgl_params, + &cxt->mstorm_st_context.data_desc, + rx_params); + + cxt->mstorm_st_context.rem_task_size = + cpu_to_le32(task_params->rx_io_size ? + rx_params->total_buffer_size : 0); + + init_ustorm_task_contexts(&cxt->ustorm_st_context, + &cxt->ustorm_ag_context, + task_params->rx_io_size ? + rx_params->total_buffer_size : 0, + task_params->tx_io_size ? + tx_params->total_buffer_size : 0, 0, 0); + + init_sqe(task_params, tx_params, NULL, + (struct iscsi_common_hdr *)text_header, NULL, + ISCSI_TASK_TYPE_MIDPATH, false); + + return 0; +} + +int init_cleanup_task(struct iscsi_task_params *task_params) +{ + init_sqe(task_params, NULL, NULL, NULL, NULL, ISCSI_TASK_TYPE_MIDPATH, + true); + return 0; +} diff --git a/drivers/scsi/qedi/qedi_fw_iscsi.h b/drivers/scsi/qedi/qedi_fw_iscsi.h new file mode 100644 index 000000000000..b6f24f91849d --- /dev/null +++ b/drivers/scsi/qedi/qedi_fw_iscsi.h @@ -0,0 +1,117 @@ +/* + * QLogic iSCSI Offload Driver + * Copyright (c) 2016 Cavium Inc. 
+ * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ + +#ifndef _QEDI_FW_ISCSI_H_ +#define _QEDI_FW_ISCSI_H_ + +#include "qedi_fw_scsi.h" + +struct iscsi_task_params { + struct iscsi_task_context *context; + struct iscsi_wqe *sqe; + u32 tx_io_size; + u32 rx_io_size; + u16 conn_icid; + u16 itid; + u8 cq_rss_number; +}; + +struct iscsi_conn_params { + u32 first_burst_length; + u32 max_send_pdu_length; + u32 max_burst_length; + bool initial_r2t; + bool immediate_data; +}; + +/* @brief init_initiator_rw_iscsi_task - initializes iSCSI Initiator Read or + * Write task context. + * + * @param task_params - Pointer to task parameters struct + * @param conn_params - Connection Parameters + * @param cmd_params - command specific parameters + * @param cmd_pdu_header - PDU Header Parameters + * @param tx_sgl_params - Pointer to Tx SGL task params + * @param rx_sgl_params - Pointer to Rx SGL task params + * @param dif_task_params - Pointer to DIF parameters struct + */ +int init_initiator_rw_iscsi_task(struct iscsi_task_params *task_params, + struct iscsi_conn_params *conn_params, + struct scsi_initiator_cmd_params *cmd_params, + struct iscsi_cmd_hdr *cmd_pdu_header, + struct scsi_sgl_task_params *tx_sgl_params, + struct scsi_sgl_task_params *rx_sgl_params, + struct scsi_dif_task_params *dif_task_params); + +/* @brief init_initiator_login_request_task - initializes iSCSI Initiator Login + * Request task context. + * + * @param task_params - Pointer to task parameters struct + * @param login_req_pdu_header - PDU Header Parameters + * @param tx_sgl_task_params - Pointer to SGL task params + * @param rx_sgl_task_params - Pointer to SGL task params + */ +int init_initiator_login_request_task(struct iscsi_task_params *task_params, + struct iscsi_login_req_hdr *login_header, + struct scsi_sgl_task_params *tx_params, + struct scsi_sgl_task_params *rx_params); + +/* @brief init_initiator_nop_out_task - initializes iSCSI Initiator NOP Out + * task context. + * + * @param task_params - Pointer to task parameters struct + * @param nop_out_pdu_header - PDU Header Parameters + * @param tx_sgl_task_params - Pointer to SGL task params + * @param rx_sgl_task_params - Pointer to SGL task params + */ +int init_initiator_nop_out_task(struct iscsi_task_params *task_params, + struct iscsi_nop_out_hdr *nop_out_pdu_header, + struct scsi_sgl_task_params *tx_sgl_params, + struct scsi_sgl_task_params *rx_sgl_params); + +/* @brief init_initiator_logout_request_task - initializes iSCSI Initiator + * Logout Request task context. + * + * @param task_params - Pointer to task parameters struct + * @param logout_pdu_header - PDU Header Parameters + * @param tx_sgl_task_params - Pointer to SGL task params + * @param rx_sgl_task_params - Pointer to SGL task params + */ +int init_initiator_logout_request_task(struct iscsi_task_params *task_params, + struct iscsi_logout_req_hdr *logout_hdr, + struct scsi_sgl_task_params *tx_params, + struct scsi_sgl_task_params *rx_params); + +/* @brief init_initiator_tmf_request_task - initializes iSCSI Initiator TMF + * task context. + * + * @param task_params - Pointer to task parameters struct + * @param tmf_pdu_header - PDU Header Parameters + */ +int init_initiator_tmf_request_task(struct iscsi_task_params *task_params, + struct iscsi_tmf_request_hdr *tmf_header); + +/* @brief init_initiator_text_request_task - initializes iSCSI Initiator Text + * Request task context.
+ * + * @param task_params - Pointer to task parameters struct + * @param text_request_pdu_header - PDU Header Parameters + * @param tx_sgl_task_params - Pointer to Tx SGL task params + * @param rx_sgl_task_params - Pointer to Rx SGL task params + */ +int init_initiator_text_request_task(struct iscsi_task_params *task_params, + struct iscsi_text_request_hdr *text_header, + struct scsi_sgl_task_params *tx_params, + struct scsi_sgl_task_params *rx_params); + +/* @brief init_cleanup_task - initializes Cleanup task (SQE) + * + * @param task_params - Pointer to task parameters struct + */ +int init_cleanup_task(struct iscsi_task_params *task_params); +#endif diff --git a/drivers/scsi/qedi/qedi_fw_scsi.h b/drivers/scsi/qedi/qedi_fw_scsi.h new file mode 100644 index 000000000000..cdaf918f1019 --- /dev/null +++ b/drivers/scsi/qedi/qedi_fw_scsi.h @@ -0,0 +1,55 @@ +/* + * QLogic iSCSI Offload Driver + * Copyright (c) 2016 Cavium Inc. + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. + */ + +#ifndef _QEDI_FW_SCSI_H_ +#define _QEDI_FW_SCSI_H_ + +#include <linux/types.h> +#include <asm/byteorder.h> +#include "qedi_hsi.h" +#include <linux/qed/qed_if.h> + +struct scsi_sgl_task_params { + struct scsi_sge *sgl; + struct regpair sgl_phys_addr; + u32 total_buffer_size; + u16 num_sges; + bool small_mid_sge; +}; + +struct scsi_dif_task_params { + u32 initial_ref_tag; + bool initial_ref_tag_is_valid; + u16 application_tag; + u16 application_tag_mask; + u16 dif_block_size_log; + bool dif_on_network; + bool dif_on_host; + u8 host_guard_type; + u8 protection_type; + u8 ref_tag_mask; + bool crc_seed; + bool tx_dif_conn_err_en; + bool ignore_app_tag; + bool keep_ref_tag_const; + bool validate_guard; + bool validate_app_tag; + bool validate_ref_tag; + bool forward_guard; + bool forward_app_tag; + bool forward_ref_tag; + bool forward_app_tag_with_mask; + bool forward_ref_tag_with_mask; +}; + +struct scsi_initiator_cmd_params { + struct scsi_sge extended_cdb_sge; + struct regpair sense_data_buffer_phys_addr; +}; +#endif diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c index b9f79d36142d..d5eff68507e5 100644 --- a/drivers/scsi/qedi/qedi_iscsi.c +++ b/drivers/scsi/qedi/qedi_iscsi.c @@ -175,7 +175,7 @@ static void qedi_destroy_cmd_pool(struct qedi_ctx *qedi, if (cmd->io_tbl.sge_tbl) dma_free_coherent(&qedi->pdev->dev, QEDI_ISCSI_MAX_BDS_PER_CMD * - sizeof(struct iscsi_sge), + sizeof(struct scsi_sge), cmd->io_tbl.sge_tbl, cmd->io_tbl.sge_tbl_dma); @@ -191,7 +191,7 @@ static int qedi_alloc_sget(struct qedi_ctx *qedi, struct iscsi_session *session, struct qedi_cmd *cmd) { struct qedi_io_bdt *io = &cmd->io_tbl; - struct iscsi_sge *sge; + struct scsi_sge *sge; io->sge_tbl = dma_alloc_coherent(&qedi->pdev->dev, QEDI_ISCSI_MAX_BDS_PER_CMD * @@ -708,22 +708,20 @@ static void qedi_conn_get_stats(struct iscsi_cls_conn *cls_conn, static void qedi_iscsi_prep_generic_pdu_bd(struct qedi_conn *qedi_conn) { - struct iscsi_sge *bd_tbl; + struct scsi_sge *bd_tbl; - bd_tbl = (struct iscsi_sge *)qedi_conn->gen_pdu.req_bd_tbl; + bd_tbl = (struct scsi_sge *)qedi_conn->gen_pdu.req_bd_tbl; bd_tbl->sge_addr.hi = (u32)((u64)qedi_conn->gen_pdu.req_dma_addr >> 32); bd_tbl->sge_addr.lo = (u32)qedi_conn->gen_pdu.req_dma_addr; bd_tbl->sge_len = qedi_conn->gen_pdu.req_wr_ptr - qedi_conn->gen_pdu.req_buf; - bd_tbl->reserved0 = 0; - bd_tbl = (struct iscsi_sge *)qedi_conn->gen_pdu.resp_bd_tbl; + bd_tbl = (struct scsi_sge
*)qedi_conn->gen_pdu.resp_bd_tbl; bd_tbl->sge_addr.hi = (u32)((u64)qedi_conn->gen_pdu.resp_dma_addr >> 32); bd_tbl->sge_addr.lo = (u32)qedi_conn->gen_pdu.resp_dma_addr; bd_tbl->sge_len = ISCSI_DEF_MAX_RECV_SEG_LEN; - bd_tbl->reserved0 = 0; } static int qedi_iscsi_send_generic_request(struct iscsi_task *task) diff --git a/drivers/scsi/qedi/qedi_iscsi.h b/drivers/scsi/qedi/qedi_iscsi.h index d3c06bbddb4e..3247287cb0e7 100644 --- a/drivers/scsi/qedi/qedi_iscsi.h +++ b/drivers/scsi/qedi/qedi_iscsi.h @@ -102,7 +102,7 @@ struct qedi_endpoint { #define QEDI_SQ_WQES_MIN 16 struct qedi_io_bdt { - struct iscsi_sge *sge_tbl; + struct scsi_sge *sge_tbl; dma_addr_t sge_tbl_dma; u16 sge_valid; }; diff --git a/drivers/scsi/qedi/qedi_version.h b/drivers/scsi/qedi/qedi_version.h index 9543a1b139d4..d61e3ac22e67 100644 --- a/drivers/scsi/qedi/qedi_version.h +++ b/drivers/scsi/qedi/qedi_version.h @@ -7,8 +7,8 @@ * this source tree. */ -#define QEDI_MODULE_VERSION "8.10.3.0" +#define QEDI_MODULE_VERSION "8.10.4.0" #define QEDI_DRIVER_MAJOR_VER 8 #define QEDI_DRIVER_MINOR_VER 10 -#define QEDI_DRIVER_REV_VER 3 +#define QEDI_DRIVER_REV_VER 4 #define QEDI_DRIVER_ENG_VER 0 diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h index 52966b9bfde3..fbab6e0514f0 100644 --- a/include/linux/qed/common_hsi.h +++ b/include/linux/qed/common_hsi.h @@ -100,8 +100,8 @@ #define MAX_NUM_LL2_TX_STATS_COUNTERS 32 #define FW_MAJOR_VERSION 8 -#define FW_MINOR_VERSION 10 -#define FW_REVISION_VERSION 10 +#define FW_MINOR_VERSION 15 +#define FW_REVISION_VERSION 3 #define FW_ENGINEERING_VERSION 0 /***********************/ @@ -187,6 +187,9 @@ /* DEMS */ #define DQ_DEMS_LEGACY 0 +#define DQ_DEMS_TOE_MORE_TO_SEND 3 +#define DQ_DEMS_TOE_LOCAL_ADV_WND 4 +#define DQ_DEMS_ROCE_CQ_CONS 7 /* XCM agg val selection */ #define DQ_XCM_AGG_VAL_SEL_WORD2 0 @@ -214,6 +217,9 @@ #define DQ_XCM_ISCSI_MORE_TO_SEND_SEQ_CMD DQ_XCM_AGG_VAL_SEL_REG3 #define DQ_XCM_ISCSI_EXP_STAT_SN_CMD DQ_XCM_AGG_VAL_SEL_REG6 #define DQ_XCM_ROCE_SQ_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 +#define DQ_XCM_TOE_TX_BD_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 +#define DQ_XCM_TOE_MORE_TO_SEND_SEQ_CMD DQ_XCM_AGG_VAL_SEL_REG3 +#define DQ_XCM_TOE_LOCAL_ADV_WND_SEQ_CMD DQ_XCM_AGG_VAL_SEL_REG4 /* UCM agg val selection (HW) */ #define DQ_UCM_AGG_VAL_SEL_WORD0 0 @@ -269,6 +275,8 @@ #define DQ_XCM_ISCSI_DQ_FLUSH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF19) #define DQ_XCM_ISCSI_SLOW_PATH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF22) #define DQ_XCM_ISCSI_PROC_ONLY_CLEANUP_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF23) +#define DQ_XCM_TOE_DQ_FLUSH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF19) +#define DQ_XCM_TOE_SLOW_PATH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF22) /* UCM agg counter flag selection (HW) */ #define DQ_UCM_AGG_FLG_SHIFT_CF0 0 @@ -285,6 +293,9 @@ #define DQ_UCM_ETH_PMD_RX_ARM_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF5) #define DQ_UCM_ROCE_CQ_ARM_SE_CF_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF4) #define DQ_UCM_ROCE_CQ_ARM_CF_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF5) +#define DQ_UCM_TOE_TIMER_STOP_ALL_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF3) +#define DQ_UCM_TOE_SLOW_PATH_CF_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF4) +#define DQ_UCM_TOE_DQ_CF_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF5) /* TCM agg counter flag selection (HW) */ #define DQ_TCM_AGG_FLG_SHIFT_CF0 0 @@ -301,6 +312,9 @@ #define DQ_TCM_FCOE_TIMER_STOP_ALL_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF3) #define DQ_TCM_ISCSI_FLUSH_Q0_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF1) #define DQ_TCM_ISCSI_TIMER_STOP_ALL_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF3) +#define DQ_TCM_TOE_FLUSH_Q0_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF1) +#define 
DQ_TCM_TOE_TIMER_STOP_ALL_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF3) +#define DQ_TCM_IWARP_POST_RQ_CF_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF1) /* PWM address mapping */ #define DQ_PWM_OFFSET_DPM_BASE 0x0 @@ -689,6 +703,16 @@ struct iscsi_eqe_data { #define ISCSI_EQE_DATA_RESERVED0_SHIFT 7 }; +struct rdma_eqe_destroy_qp { + __le32 cid; + u8 reserved[4]; +}; + +union rdma_eqe_data { + struct regpair async_handle; + struct rdma_eqe_destroy_qp rdma_destroy_qp_data; +}; + struct malicious_vf_eqe_data { u8 vf_id; u8 err_id; @@ -705,9 +729,9 @@ union event_ring_data { u8 bytes[8]; struct vf_pf_channel_eqe_data vf_pf_channel; struct iscsi_eqe_data iscsi_info; + union rdma_eqe_data rdma_data; struct malicious_vf_eqe_data malicious_vf; struct initial_cleanup_eqe_data vf_init_cleanup; - struct regpair roce_handle; }; /* Event Ring Entry */ diff --git a/include/linux/qed/eth_common.h b/include/linux/qed/eth_common.h index 4b402fb0eaad..34d93eb5bfba 100644 --- a/include/linux/qed/eth_common.h +++ b/include/linux/qed/eth_common.h @@ -49,6 +49,9 @@ #define ETH_RX_CQE_PAGE_SIZE_BYTES 4096 #define ETH_RX_NUM_NEXT_PAGE_BDS 2 +#define ETH_MAX_TUNN_LSO_INNER_IPV4_OFFSET 253 +#define ETH_MAX_TUNN_LSO_INNER_IPV6_OFFSET 251 + #define ETH_TX_MIN_BDS_PER_NON_LSO_PKT 1 #define ETH_TX_MAX_BDS_PER_NON_LSO_PACKET 18 #define ETH_TX_MAX_BDS_PER_LSO_PACKET 255 diff --git a/include/linux/qed/fcoe_common.h b/include/linux/qed/fcoe_common.h index 2e417a45c5f7..947a635d04bb 100644 --- a/include/linux/qed/fcoe_common.h +++ b/include/linux/qed/fcoe_common.h @@ -109,13 +109,6 @@ struct fcoe_conn_terminate_ramrod_data { struct regpair terminate_params_addr; }; -struct fcoe_fast_sgl_ctx { - struct regpair sgl_start_addr; - __le32 sgl_byte_offset; - __le16 task_reuse_cnt; - __le16 init_offset_in_first_sge; -}; - struct fcoe_slow_sgl_ctx { struct regpair base_sgl_addr; __le16 curr_sge_off; @@ -124,23 +117,16 @@ struct fcoe_slow_sgl_ctx { __le16 reserved; }; -struct fcoe_sge { - struct regpair sge_addr; - __le16 size; - __le16 reserved0; - u8 reserved1[3]; - u8 is_valid_sge; -}; - -union fcoe_data_desc_ctx { - struct fcoe_fast_sgl_ctx fast; - struct fcoe_slow_sgl_ctx slow; - struct fcoe_sge single_sge; -}; - union fcoe_dix_desc_ctx { struct fcoe_slow_sgl_ctx dix_sgl; - struct fcoe_sge cached_dix_sge; + struct scsi_sge cached_dix_sge; +}; + +struct fcoe_fast_sgl_ctx { + struct regpair sgl_start_addr; + __le32 sgl_byte_offset; + __le16 task_reuse_cnt; + __le16 init_offset_in_first_sge; }; struct fcoe_fcp_cmd_payload { @@ -172,57 +158,6 @@ enum fcoe_mode_type { MAX_FCOE_MODE_TYPE }; -struct fcoe_mstorm_fcoe_task_st_ctx_fp { - __le16 flags; -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_FP_RSRV0_MASK 0x7FFF -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_FP_RSRV0_SHIFT 0 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_FP_MP_INCLUDE_FC_HEADER_MASK 0x1 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_FP_MP_INCLUDE_FC_HEADER_SHIFT 15 - __le16 difDataResidue; - __le16 parent_id; - __le16 single_sge_saved_offset; - __le32 data_2_trns_rem; - __le32 offset_in_io; - union fcoe_dix_desc_ctx dix_desc; - union fcoe_data_desc_ctx data_desc; -}; - -struct fcoe_mstorm_fcoe_task_st_ctx_non_fp { - __le16 flags; -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_HOST_INTERFACE_MASK 0x3 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_HOST_INTERFACE_SHIFT 0 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIF_TO_PEER_MASK 0x1 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIF_TO_PEER_SHIFT 2 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_VALIDATE_DIX_APP_TAG_MASK 0x1 -#define 
FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_VALIDATE_DIX_APP_TAG_SHIFT 3 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_INTERVAL_SIZE_LOG_MASK 0xF -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_INTERVAL_SIZE_LOG_SHIFT 4 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIX_BLOCK_SIZE_MASK 0x3 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIX_BLOCK_SIZE_SHIFT 8 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RESERVED_MASK 0x1 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RESERVED_SHIFT 10 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_HAS_FIRST_PACKET_ARRIVED_MASK 0x1 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_HAS_FIRST_PACKET_ARRIVED_SHIFT 11 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_VALIDATE_DIX_REF_TAG_MASK 0x1 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_VALIDATE_DIX_REF_TAG_SHIFT 12 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIX_CACHED_SGE_FLG_MASK 0x1 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIX_CACHED_SGE_FLG_SHIFT 13 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_OFFSET_IN_IO_VALID_MASK 0x1 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_OFFSET_IN_IO_VALID_SHIFT 14 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIF_SUPPORTED_MASK 0x1 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIF_SUPPORTED_SHIFT 15 - u8 tx_rx_sgl_mode; -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_TX_SGL_MODE_MASK 0x7 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_TX_SGL_MODE_SHIFT 0 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RX_SGL_MODE_MASK 0x7 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RX_SGL_MODE_SHIFT 3 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RSRV1_MASK 0x3 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RSRV1_SHIFT 6 - u8 rsrv2; - __le32 num_prm_zero_read; - struct regpair rsp_buf_addr; -}; - struct fcoe_rx_stat { struct regpair fcoe_rx_byte_cnt; struct regpair fcoe_rx_data_pkt_cnt; @@ -236,16 +171,6 @@ struct fcoe_rx_stat { __le32 rsrv; }; -enum fcoe_sgl_mode { - FCOE_SLOW_SGL, - FCOE_SINGLE_FAST_SGE, - FCOE_2_FAST_SGE, - FCOE_3_FAST_SGE, - FCOE_4_FAST_SGE, - FCOE_MUL_FAST_SGES, - MAX_FCOE_SGL_MODE -}; - struct fcoe_stat_ramrod_data { struct regpair stat_params_addr; }; @@ -328,22 +253,24 @@ union fcoe_tx_info_union_ctx { struct ystorm_fcoe_task_st_ctx { u8 task_type; u8 sgl_mode; -#define YSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE_MASK 0x7 +#define YSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE_MASK 0x1 #define YSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE_SHIFT 0 -#define YSTORM_FCOE_TASK_ST_CTX_RSRV_MASK 0x1F -#define YSTORM_FCOE_TASK_ST_CTX_RSRV_SHIFT 3 +#define YSTORM_FCOE_TASK_ST_CTX_RSRV_MASK 0x7F +#define YSTORM_FCOE_TASK_ST_CTX_RSRV_SHIFT 1 u8 cached_dix_sge; u8 expect_first_xfer; __le32 num_pbf_zero_write; union protection_info_union_ctx protection_info_union; __le32 data_2_trns_rem; + struct scsi_sgl_params sgl_params; + u8 reserved1[12]; union fcoe_tx_info_union_ctx tx_info_union; union fcoe_dix_desc_ctx dix_desc; - union fcoe_data_desc_ctx data_desc; + struct scsi_cached_sges data_desc; __le16 ox_id; __le16 rx_id; __le32 task_rety_identifier; - __le32 reserved1[2]; + u8 reserved2[8]; }; struct ystorm_fcoe_task_ag_ctx { @@ -484,22 +411,22 @@ struct tstorm_fcoe_task_ag_ctx { struct fcoe_tstorm_fcoe_task_st_ctx_read_write { union fcoe_cleanup_addr_exp_ro_union cleanup_addr_exp_ro_union; __le16 flags; -#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE_MASK 0x7 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE_MASK 0x1 #define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE_SHIFT 0 #define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME_MASK 0x1 -#define 
FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME_SHIFT 3 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME_SHIFT 1 #define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_ACTIVE_MASK 0x1 -#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_ACTIVE_SHIFT 4 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_ACTIVE_SHIFT 2 #define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_TIMEOUT_MASK 0x1 -#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_TIMEOUT_SHIFT 5 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_TIMEOUT_SHIFT 3 #define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SINGLE_PKT_IN_EX_MASK 0x1 -#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SINGLE_PKT_IN_EX_SHIFT 6 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SINGLE_PKT_IN_EX_SHIFT 4 #define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_OOO_RX_SEQ_STAT_MASK 0x1 -#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_OOO_RX_SEQ_STAT_SHIFT 7 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_OOO_RX_SEQ_STAT_SHIFT 5 #define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_CQ_ADD_ADV_MASK 0x3 -#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_CQ_ADD_ADV_SHIFT 8 -#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RSRV1_MASK 0x3F -#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RSRV1_SHIFT 10 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_CQ_ADD_ADV_SHIFT 6 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RSRV1_MASK 0xFF +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RSRV1_SHIFT 8 __le16 seq_cnt; u8 seq_id; u8 ooo_rx_seq_id; @@ -582,8 +509,34 @@ struct mstorm_fcoe_task_ag_ctx { }; struct mstorm_fcoe_task_st_ctx { - struct fcoe_mstorm_fcoe_task_st_ctx_non_fp non_fp; - struct fcoe_mstorm_fcoe_task_st_ctx_fp fp; + struct regpair rsp_buf_addr; + __le32 rsrv[2]; + struct scsi_sgl_params sgl_params; + __le32 data_2_trns_rem; + __le32 data_buffer_offset; + __le16 parent_id; + __le16 flags; +#define MSTORM_FCOE_TASK_ST_CTX_INTERVAL_SIZE_LOG_MASK 0xF +#define MSTORM_FCOE_TASK_ST_CTX_INTERVAL_SIZE_LOG_SHIFT 0 +#define MSTORM_FCOE_TASK_ST_CTX_HOST_INTERFACE_MASK 0x3 +#define MSTORM_FCOE_TASK_ST_CTX_HOST_INTERFACE_SHIFT 4 +#define MSTORM_FCOE_TASK_ST_CTX_DIF_TO_PEER_MASK 0x1 +#define MSTORM_FCOE_TASK_ST_CTX_DIF_TO_PEER_SHIFT 6 +#define MSTORM_FCOE_TASK_ST_CTX_MP_INCLUDE_FC_HEADER_MASK 0x1 +#define MSTORM_FCOE_TASK_ST_CTX_MP_INCLUDE_FC_HEADER_SHIFT 7 +#define MSTORM_FCOE_TASK_ST_CTX_DIX_BLOCK_SIZE_MASK 0x3 +#define MSTORM_FCOE_TASK_ST_CTX_DIX_BLOCK_SIZE_SHIFT 8 +#define MSTORM_FCOE_TASK_ST_CTX_VALIDATE_DIX_REF_TAG_MASK 0x1 +#define MSTORM_FCOE_TASK_ST_CTX_VALIDATE_DIX_REF_TAG_SHIFT 10 +#define MSTORM_FCOE_TASK_ST_CTX_DIX_CACHED_SGE_FLG_MASK 0x1 +#define MSTORM_FCOE_TASK_ST_CTX_DIX_CACHED_SGE_FLG_SHIFT 11 +#define MSTORM_FCOE_TASK_ST_CTX_DIF_SUPPORTED_MASK 0x1 +#define MSTORM_FCOE_TASK_ST_CTX_DIF_SUPPORTED_SHIFT 12 +#define MSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE_MASK 0x1 +#define MSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE_SHIFT 13 +#define MSTORM_FCOE_TASK_ST_CTX_RESERVED_MASK 0x3 +#define MSTORM_FCOE_TASK_ST_CTX_RESERVED_SHIFT 14 + struct scsi_cached_sges data_desc; }; struct ustorm_fcoe_task_ag_ctx { @@ -646,6 +599,7 @@ struct ustorm_fcoe_task_ag_ctx { struct fcoe_task_context { struct ystorm_fcoe_task_st_ctx ystorm_st_context; + struct regpair ystorm_st_padding[2]; struct tdif_task_context tdif_context; struct ystorm_fcoe_task_ag_ctx ystorm_ag_context; struct tstorm_fcoe_task_ag_ctx tstorm_ag_context; @@ -668,20 +622,20 @@ struct fcoe_tx_stat { struct fcoe_wqe { __le16 task_id; __le16 flags; -#define FCOE_WQE_REQ_TYPE_MASK 0xF -#define FCOE_WQE_REQ_TYPE_SHIFT 0 
-#define FCOE_WQE_SGL_MODE_MASK 0x7 -#define FCOE_WQE_SGL_MODE_SHIFT 4 -#define FCOE_WQE_CONTINUATION_MASK 0x1 -#define FCOE_WQE_CONTINUATION_SHIFT 7 -#define FCOE_WQE_INVALIDATE_PTU_MASK 0x1 -#define FCOE_WQE_INVALIDATE_PTU_SHIFT 8 -#define FCOE_WQE_SUPER_IO_MASK 0x1 -#define FCOE_WQE_SUPER_IO_SHIFT 9 -#define FCOE_WQE_SEND_AUTO_RSP_MASK 0x1 -#define FCOE_WQE_SEND_AUTO_RSP_SHIFT 10 -#define FCOE_WQE_RESERVED0_MASK 0x1F -#define FCOE_WQE_RESERVED0_SHIFT 11 +#define FCOE_WQE_REQ_TYPE_MASK 0xF +#define FCOE_WQE_REQ_TYPE_SHIFT 0 +#define FCOE_WQE_SGL_MODE_MASK 0x1 +#define FCOE_WQE_SGL_MODE_SHIFT 4 +#define FCOE_WQE_CONTINUATION_MASK 0x1 +#define FCOE_WQE_CONTINUATION_SHIFT 5 +#define FCOE_WQE_SEND_AUTO_RSP_MASK 0x1 +#define FCOE_WQE_SEND_AUTO_RSP_SHIFT 6 +#define FCOE_WQE_RESERVED_MASK 0x1 +#define FCOE_WQE_RESERVED_SHIFT 7 +#define FCOE_WQE_NUM_SGES_MASK 0xF +#define FCOE_WQE_NUM_SGES_SHIFT 8 +#define FCOE_WQE_RESERVED1_MASK 0xF +#define FCOE_WQE_RESERVED1_SHIFT 12 union fcoe_additional_info_union additional_info_union; }; diff --git a/include/linux/qed/iscsi_common.h b/include/linux/qed/iscsi_common.h index 4c5747babcf6..69949f8e354b 100644 --- a/include/linux/qed/iscsi_common.h +++ b/include/linux/qed/iscsi_common.h @@ -39,17 +39,9 @@ /* iSCSI HSI constants */ #define ISCSI_DEFAULT_MTU (1500) -/* Current iSCSI HSI version number composed of two fields (16 bit) */ -#define ISCSI_HSI_MAJOR_VERSION (0) -#define ISCSI_HSI_MINOR_VERSION (0) - /* KWQ (kernel work queue) layer codes */ #define ISCSI_SLOW_PATH_LAYER_CODE (6) -/* CQE completion status */ -#define ISCSI_EQE_COMPLETION_SUCCESS (0x0) -#define ISCSI_EQE_RST_CONN_RCVD (0x1) - /* iSCSI parameter defaults */ #define ISCSI_DEFAULT_HEADER_DIGEST (0) #define ISCSI_DEFAULT_DATA_DIGEST (0) @@ -68,6 +60,10 @@ #define ISCSI_MIN_VAL_MAX_OUTSTANDING_R2T (1) #define ISCSI_MAX_VAL_MAX_OUTSTANDING_R2T (0xff) +#define ISCSI_AHS_CNTL_SIZE 4 + +#define ISCSI_WQE_NUM_SGES_SLOWIO (0xf) + /* iSCSI reserved params */ #define ISCSI_ITT_ALL_ONES (0xffffffff) #define ISCSI_TTT_ALL_ONES (0xffffffff) @@ -173,19 +169,6 @@ struct iscsi_async_msg_hdr { __le32 reserved7; }; -struct iscsi_sge { - struct regpair sge_addr; - __le16 sge_len; - __le16 reserved0; - __le32 reserved1; -}; - -struct iscsi_cached_sge_ctx { - struct iscsi_sge sge; - struct regpair reserved; - __le32 dsgl_curr_offset[2]; -}; - struct iscsi_cmd_hdr { __le16 reserved1; u8 flags_attr; @@ -229,8 +212,13 @@ struct iscsi_common_hdr { #define ISCSI_COMMON_HDR_DATA_SEG_LEN_SHIFT 0 #define ISCSI_COMMON_HDR_TOTAL_AHS_LEN_MASK 0xFF #define ISCSI_COMMON_HDR_TOTAL_AHS_LEN_SHIFT 24 - __le32 lun_reserved[4]; - __le32 data[6]; + struct regpair lun_reserved; + __le32 itt; + __le32 ttt; + __le32 cmdstat_sn; + __le32 exp_statcmd_sn; + __le32 max_cmd_sn; + __le32 data[3]; }; struct iscsi_conn_offload_params { @@ -246,8 +234,10 @@ struct iscsi_conn_offload_params { #define ISCSI_CONN_OFFLOAD_PARAMS_TCP_ON_CHIP_1B_SHIFT 0 #define ISCSI_CONN_OFFLOAD_PARAMS_TARGET_MODE_MASK 0x1 #define ISCSI_CONN_OFFLOAD_PARAMS_TARGET_MODE_SHIFT 1 -#define ISCSI_CONN_OFFLOAD_PARAMS_RESERVED1_MASK 0x3F -#define ISCSI_CONN_OFFLOAD_PARAMS_RESERVED1_SHIFT 2 +#define ISCSI_CONN_OFFLOAD_PARAMS_RESTRICTED_MODE_MASK 0x1 +#define ISCSI_CONN_OFFLOAD_PARAMS_RESTRICTED_MODE_SHIFT 2 +#define ISCSI_CONN_OFFLOAD_PARAMS_RESERVED1_MASK 0x1F +#define ISCSI_CONN_OFFLOAD_PARAMS_RESERVED1_SHIFT 3 u8 pbl_page_size_log; u8 pbe_page_size_log; u8 default_cq; @@ -278,8 +268,12 @@ struct iscsi_conn_update_ramrod_params { #define 
ISCSI_CONN_UPDATE_RAMROD_PARAMS_INITIAL_R2T_SHIFT 2 #define ISCSI_CONN_UPDATE_RAMROD_PARAMS_IMMEDIATE_DATA_MASK 0x1 #define ISCSI_CONN_UPDATE_RAMROD_PARAMS_IMMEDIATE_DATA_SHIFT 3 -#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_RESERVED1_MASK 0xF -#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_RESERVED1_SHIFT 4 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_DIF_BLOCK_SIZE_MASK 0x1 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_DIF_BLOCK_SIZE_SHIFT 4 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_DIF_ON_HOST_EN_MASK 0x1 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_DIF_ON_HOST_EN_SHIFT 5 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_RESERVED1_MASK 0x3 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_RESERVED1_SHIFT 6 u8 reserved0[3]; __le32 max_seq_size; __le32 max_send_pdu_length; @@ -312,7 +306,7 @@ struct iscsi_ext_cdb_cmd_hdr { __le32 expected_transfer_length; __le32 cmd_sn; __le32 exp_stat_sn; - struct iscsi_sge cdb_sge; + struct scsi_sge cdb_sge; }; struct iscsi_login_req_hdr { @@ -519,8 +513,8 @@ struct iscsi_logout_response_hdr { __le32 exp_cmd_sn; __le32 max_cmd_sn; __le32 reserved4; - __le16 time2retain; - __le16 time2wait; + __le16 time_2_retain; + __le16 time_2_wait; __le32 reserved5[1]; }; @@ -602,7 +596,7 @@ struct iscsi_tmf_response_hdr { #define ISCSI_TMF_RESPONSE_HDR_TOTAL_AHS_LEN_SHIFT 24 struct regpair reserved0; __le32 itt; - __le32 rtt; + __le32 reserved1; __le32 stat_sn; __le32 exp_cmd_sn; __le32 max_cmd_sn; @@ -641,7 +635,7 @@ struct iscsi_reject_hdr { #define ISCSI_REJECT_HDR_TOTAL_AHS_LEN_MASK 0xFF #define ISCSI_REJECT_HDR_TOTAL_AHS_LEN_SHIFT 24 struct regpair reserved0; - __le32 reserved1; + __le32 all_ones; __le32 reserved2; __le32 stat_sn; __le32 exp_cmd_sn; @@ -688,7 +682,9 @@ struct iscsi_cqe_solicited { __le16 itid; u8 task_type; u8 fw_dbg_field; - __le32 reserved1[2]; + u8 caused_conn_err; + u8 reserved0[3]; + __le32 reserved1[1]; union iscsi_task_hdr iscsi_hdr; }; @@ -727,35 +723,6 @@ enum iscsi_cqe_unsolicited_type { MAX_ISCSI_CQE_UNSOLICITED_TYPE }; -struct iscsi_virt_sgl_ctx { - struct regpair sgl_base; - struct regpair dsgl_base; - __le32 sgl_initial_offset; - __le32 dsgl_initial_offset; - __le32 dsgl_curr_offset[2]; -}; - -struct iscsi_sgl_var_params { - u8 sgl_ptr; - u8 dsgl_ptr; - __le16 sge_offset; - __le16 dsge_offset; -}; - -struct iscsi_phys_sgl_ctx { - struct regpair sgl_base; - struct regpair dsgl_base; - u8 sgl_size; - u8 dsgl_size; - __le16 reserved; - struct iscsi_sgl_var_params var_params[2]; -}; - -union iscsi_data_desc_ctx { - struct iscsi_virt_sgl_ctx virt_sgl; - struct iscsi_phys_sgl_ctx phys_sgl; - struct iscsi_cached_sge_ctx cached_sge; -}; struct iscsi_debug_modes { u8 flags; @@ -771,8 +738,10 @@ struct iscsi_debug_modes { #define ISCSI_DEBUG_MODES_ASSERT_IF_RECV_REJECT_OR_ASYNC_SHIFT 4 #define ISCSI_DEBUG_MODES_ASSERT_IF_RECV_NOP_MASK 0x1 #define ISCSI_DEBUG_MODES_ASSERT_IF_RECV_NOP_SHIFT 5 -#define ISCSI_DEBUG_MODES_RESERVED0_MASK 0x3 -#define ISCSI_DEBUG_MODES_RESERVED0_SHIFT 6 +#define ISCSI_DEBUG_MODES_ASSERT_IF_DATA_DIGEST_ERROR_MASK 0x1 +#define ISCSI_DEBUG_MODES_ASSERT_IF_DATA_DIGEST_ERROR_SHIFT 6 +#define ISCSI_DEBUG_MODES_ASSERT_IF_DIF_ERROR_MASK 0x1 +#define ISCSI_DEBUG_MODES_ASSERT_IF_DIF_ERROR_SHIFT 7 }; struct iscsi_dif_flags { @@ -806,7 +775,6 @@ enum iscsi_eqe_opcode { ISCSI_EVENT_TYPE_ASYN_FIN_WAIT2, ISCSI_EVENT_TYPE_ISCSI_CONN_ERROR, ISCSI_EVENT_TYPE_TCP_CONN_ERROR, - ISCSI_EVENT_TYPE_ASYN_DELETE_OOO_ISLES, MAX_ISCSI_EQE_OPCODE }; @@ -856,31 +824,11 @@ enum iscsi_error_types { ISCSI_CONN_ERROR_PROTOCOL_ERR_DIF_TX, ISCSI_CONN_ERROR_SENSE_DATA_LENGTH, 
ISCSI_CONN_ERROR_DATA_PLACEMENT_ERROR, + ISCSI_CONN_ERROR_INVALID_ITT, ISCSI_ERROR_UNKNOWN, MAX_ISCSI_ERROR_TYPES }; -struct iscsi_mflags { - u8 mflags; -#define ISCSI_MFLAGS_SLOW_IO_MASK 0x1 -#define ISCSI_MFLAGS_SLOW_IO_SHIFT 0 -#define ISCSI_MFLAGS_SINGLE_SGE_MASK 0x1 -#define ISCSI_MFLAGS_SINGLE_SGE_SHIFT 1 -#define ISCSI_MFLAGS_RESERVED_MASK 0x3F -#define ISCSI_MFLAGS_RESERVED_SHIFT 2 -}; - -struct iscsi_sgl { - struct regpair sgl_addr; - __le16 updated_sge_size; - __le16 updated_sge_offset; - __le32 byte_offset; -}; - -union iscsi_mstorm_sgl { - struct iscsi_sgl sgl_struct; - struct iscsi_sge single_sge; -}; enum iscsi_ramrod_cmd_id { ISCSI_RAMROD_CMD_ID_UNUSED = 0, @@ -896,10 +844,10 @@ enum iscsi_ramrod_cmd_id { struct iscsi_reg1 { __le32 reg1_map; -#define ISCSI_REG1_NUM_FAST_SGES_MASK 0x7 -#define ISCSI_REG1_NUM_FAST_SGES_SHIFT 0 -#define ISCSI_REG1_RESERVED1_MASK 0x1FFFFFFF -#define ISCSI_REG1_RESERVED1_SHIFT 3 +#define ISCSI_REG1_NUM_SGES_MASK 0xF +#define ISCSI_REG1_NUM_SGES_SHIFT 0 +#define ISCSI_REG1_RESERVED1_MASK 0xFFFFFFF +#define ISCSI_REG1_RESERVED1_SHIFT 4 }; union iscsi_seq_num { @@ -967,22 +915,33 @@ struct iscsi_spe_func_init { }; struct ystorm_iscsi_task_state { - union iscsi_data_desc_ctx sgl_ctx_union; - __le32 buffer_offset[2]; - __le16 bytes_nxt_dif; - __le16 rxmit_bytes_nxt_dif; - union iscsi_seq_num seq_num_union; - u8 dif_bytes_leftover; - u8 rxmit_dif_bytes_leftover; - __le16 reuse_count; - struct iscsi_dif_flags dif_flags; - u8 local_comp; + struct scsi_cached_sges data_desc; + struct scsi_sgl_params sgl_params; __le32 exp_r2t_sn; - __le32 sgl_offset[2]; + __le32 buffer_offset; + union iscsi_seq_num seq_num; + struct iscsi_dif_flags dif_flags; + u8 flags; +#define YSTORM_ISCSI_TASK_STATE_LOCAL_COMP_MASK 0x1 +#define YSTORM_ISCSI_TASK_STATE_LOCAL_COMP_SHIFT 0 +#define YSTORM_ISCSI_TASK_STATE_SLOW_IO_MASK 0x1 +#define YSTORM_ISCSI_TASK_STATE_SLOW_IO_SHIFT 1 +#define YSTORM_ISCSI_TASK_STATE_RESERVED0_MASK 0x3F +#define YSTORM_ISCSI_TASK_STATE_RESERVED0_SHIFT 2 +}; + +struct ystorm_iscsi_task_rxmit_opt { + __le32 fast_rxmit_sge_offset; + __le32 scan_start_buffer_offset; + __le32 fast_rxmit_buffer_offset; + u8 scan_start_sgl_index; + u8 fast_rxmit_sgl_index; + __le16 reserved; }; struct ystorm_iscsi_task_st_ctx { struct ystorm_iscsi_task_state state; + struct ystorm_iscsi_task_rxmit_opt rxmit_opt; union iscsi_task_hdr pdu_hdr; }; @@ -1152,25 +1111,16 @@ struct ustorm_iscsi_task_ag_ctx { }; struct mstorm_iscsi_task_st_ctx { - union iscsi_mstorm_sgl sgl_union; - struct iscsi_dif_flags dif_flags; - struct iscsi_mflags flags; - u8 sgl_size; - u8 host_sge_index; - __le16 dix_cur_sge_offset; - __le16 dix_cur_sge_size; - __le32 data_offset_rtid; - u8 dif_offset; - u8 dix_sgl_size; - u8 dix_sge_index; + struct scsi_cached_sges data_desc; + struct scsi_sgl_params sgl_params; + __le32 rem_task_size; + __le32 data_buffer_offset; u8 task_type; + struct iscsi_dif_flags dif_flags; + u8 reserved0[2]; struct regpair sense_db; - struct regpair dix_sgl_cur_sge; - __le32 rem_task_size; - __le16 reuse_count; - __le16 dif_data_residue; - u8 reserved0[4]; - __le32 reserved1[1]; + __le32 expected_itt; + __le32 reserved1; }; struct ustorm_iscsi_task_st_ctx { @@ -1184,7 +1134,7 @@ struct ustorm_iscsi_task_st_ctx { #define USTORM_ISCSI_TASK_ST_CTX_AHS_EXIST_SHIFT 0 #define USTORM_ISCSI_TASK_ST_CTX_RESERVED1_MASK 0x7F #define USTORM_ISCSI_TASK_ST_CTX_RESERVED1_SHIFT 1 - u8 reserved2; + struct iscsi_dif_flags dif_flags; __le16 reserved3; __le32 reserved4; __le32 reserved5; @@ -1207,10 
+1157,10 @@ struct ustorm_iscsi_task_st_ctx { #define USTORM_ISCSI_TASK_ST_CTX_LOCAL_COMP_SHIFT 2 #define USTORM_ISCSI_TASK_ST_CTX_Q0_R2TQE_WRITE_MASK 0x1 #define USTORM_ISCSI_TASK_ST_CTX_Q0_R2TQE_WRITE_SHIFT 3 -#define USTORM_ISCSI_TASK_ST_CTX_TOTALDATAACKED_DONE_MASK 0x1 -#define USTORM_ISCSI_TASK_ST_CTX_TOTALDATAACKED_DONE_SHIFT 4 -#define USTORM_ISCSI_TASK_ST_CTX_HQSCANNED_DONE_MASK 0x1 -#define USTORM_ISCSI_TASK_ST_CTX_HQSCANNED_DONE_SHIFT 5 +#define USTORM_ISCSI_TASK_ST_CTX_TOTAL_DATA_ACKED_DONE_MASK 0x1 +#define USTORM_ISCSI_TASK_ST_CTX_TOTAL_DATA_ACKED_DONE_SHIFT 4 +#define USTORM_ISCSI_TASK_ST_CTX_HQ_SCANNED_DONE_MASK 0x1 +#define USTORM_ISCSI_TASK_ST_CTX_HQ_SCANNED_DONE_SHIFT 5 #define USTORM_ISCSI_TASK_ST_CTX_R2T2RECV_DONE_MASK 0x1 #define USTORM_ISCSI_TASK_ST_CTX_R2T2RECV_DONE_SHIFT 6 #define USTORM_ISCSI_TASK_ST_CTX_RESERVED0_MASK 0x1 @@ -1220,7 +1170,6 @@ struct ustorm_iscsi_task_st_ctx { struct iscsi_task_context { struct ystorm_iscsi_task_st_ctx ystorm_st_context; - struct regpair ystorm_st_padding[2]; struct ystorm_iscsi_task_ag_ctx ystorm_ag_context; struct regpair ystorm_ag_padding[2]; struct tdif_task_context tdif_context; @@ -1272,32 +1221,22 @@ struct iscsi_uhqe { #define ISCSI_UHQE_TASK_ID_LO_SHIFT 24 }; -struct iscsi_wqe_field { - __le32 contlen_cdbsize_field; -#define ISCSI_WQE_FIELD_CONT_LEN_MASK 0xFFFFFF -#define ISCSI_WQE_FIELD_CONT_LEN_SHIFT 0 -#define ISCSI_WQE_FIELD_CDB_SIZE_MASK 0xFF -#define ISCSI_WQE_FIELD_CDB_SIZE_SHIFT 24 -}; - -union iscsi_wqe_field_union { - struct iscsi_wqe_field cont_field; - __le32 prev_tid; -}; struct iscsi_wqe { __le16 task_id; u8 flags; #define ISCSI_WQE_WQE_TYPE_MASK 0x7 #define ISCSI_WQE_WQE_TYPE_SHIFT 0 -#define ISCSI_WQE_NUM_FAST_SGES_MASK 0x7 -#define ISCSI_WQE_NUM_FAST_SGES_SHIFT 3 -#define ISCSI_WQE_PTU_INVALIDATE_MASK 0x1 -#define ISCSI_WQE_PTU_INVALIDATE_SHIFT 6 +#define ISCSI_WQE_NUM_SGES_MASK 0xF +#define ISCSI_WQE_NUM_SGES_SHIFT 3 #define ISCSI_WQE_RESPONSE_MASK 0x1 #define ISCSI_WQE_RESPONSE_SHIFT 7 struct iscsi_dif_flags prot_flags; - union iscsi_wqe_field_union cont_prevtid_union; + __le32 contlen_cdbsize; +#define ISCSI_WQE_CONT_LEN_MASK 0xFFFFFF +#define ISCSI_WQE_CONT_LEN_SHIFT 0 +#define ISCSI_WQE_CDB_SIZE_MASK 0xFF +#define ISCSI_WQE_CDB_SIZE_SHIFT 24 }; enum iscsi_wqe_type { @@ -1318,17 +1257,15 @@ struct iscsi_xhqe { u8 total_ahs_length; u8 opcode; u8 flags; -#define ISCSI_XHQE_NUM_FAST_SGES_MASK 0x7 -#define ISCSI_XHQE_NUM_FAST_SGES_SHIFT 0 -#define ISCSI_XHQE_FINAL_MASK 0x1 -#define ISCSI_XHQE_FINAL_SHIFT 3 -#define ISCSI_XHQE_SUPER_IO_MASK 0x1 -#define ISCSI_XHQE_SUPER_IO_SHIFT 4 -#define ISCSI_XHQE_STATUS_BIT_MASK 0x1 -#define ISCSI_XHQE_STATUS_BIT_SHIFT 5 -#define ISCSI_XHQE_RESERVED_MASK 0x3 -#define ISCSI_XHQE_RESERVED_SHIFT 6 - union iscsi_seq_num seq_num_union; +#define ISCSI_XHQE_FINAL_MASK 0x1 +#define ISCSI_XHQE_FINAL_SHIFT 0 +#define ISCSI_XHQE_STATUS_BIT_MASK 0x1 +#define ISCSI_XHQE_STATUS_BIT_SHIFT 1 +#define ISCSI_XHQE_NUM_SGES_MASK 0xF +#define ISCSI_XHQE_NUM_SGES_SHIFT 2 +#define ISCSI_XHQE_RESERVED0_MASK 0x3 +#define ISCSI_XHQE_RESERVED0_SHIFT 6 + union iscsi_seq_num seq_num; __le16 reserved1; }; diff --git a/include/linux/qed/roce_common.h b/include/linux/qed/roce_common.h index bad02df213df..866f063026de 100644 --- a/include/linux/qed/roce_common.h +++ b/include/linux/qed/roce_common.h @@ -38,4 +38,21 @@ #define ROCE_MAX_QPS (32 * 1024) +enum roce_async_events_type { + ROCE_ASYNC_EVENT_NONE = 0, + ROCE_ASYNC_EVENT_COMM_EST = 1, + ROCE_ASYNC_EVENT_SQ_DRAINED, + ROCE_ASYNC_EVENT_SRQ_LIMIT, 
+ ROCE_ASYNC_EVENT_LAST_WQE_REACHED, + ROCE_ASYNC_EVENT_CQ_ERR, + ROCE_ASYNC_EVENT_LOCAL_INVALID_REQUEST_ERR, + ROCE_ASYNC_EVENT_LOCAL_CATASTROPHIC_ERR, + ROCE_ASYNC_EVENT_LOCAL_ACCESS_ERR, + ROCE_ASYNC_EVENT_QP_CATASTROPHIC_ERR, + ROCE_ASYNC_EVENT_CQ_OVERFLOW_ERR, + ROCE_ASYNC_EVENT_SRQ_EMPTY, + ROCE_ASYNC_EVENT_DESTROY_QP_DONE, + MAX_ROCE_ASYNC_EVENTS_TYPE +}; + #endif /* __ROCE_COMMON__ */ diff --git a/include/linux/qed/storage_common.h b/include/linux/qed/storage_common.h index 03f3e37ab059..08df82a096b6 100644 --- a/include/linux/qed/storage_common.h +++ b/include/linux/qed/storage_common.h @@ -40,6 +40,8 @@ #define BDQ_ID_IMM_DATA (1) #define BDQ_NUM_IDS (2) +#define SCSI_NUM_SGES_SLOW_SGL_THR 8 + #define BDQ_MAX_EXTERNAL_RING_SIZE (1 << 15) struct scsi_bd { @@ -52,6 +54,16 @@ struct scsi_bdq_ram_drv_data { __le16 reserved0[3]; }; +struct scsi_sge { + struct regpair sge_addr; + __le32 sge_len; + __le32 reserved; +}; + +struct scsi_cached_sges { + struct scsi_sge sge[4]; +}; + struct scsi_drv_cmdq { __le16 cmdq_cons; __le16 reserved0; @@ -99,11 +111,19 @@ struct scsi_ram_per_bdq_resource_drv_data { struct scsi_bdq_ram_drv_data drv_data_per_bdq_id[BDQ_NUM_IDS]; }; -struct scsi_sge { - struct regpair sge_addr; - __le16 sge_len; - __le16 reserved0; - __le32 reserved1; +enum scsi_sgl_mode { + SCSI_TX_SLOW_SGL, + SCSI_FAST_SGL, + MAX_SCSI_SGL_MODE +}; + +struct scsi_sgl_params { + struct regpair sgl_addr; + __le32 sgl_total_length; + __le32 sge_offset; + __le16 sgl_num_sges; + u8 sgl_index; + u8 reserved; }; struct scsi_terminate_extra_params { diff --git a/include/linux/qed/tcp_common.h b/include/linux/qed/tcp_common.h index 46fe7856f1b2..a5e843268f0e 100644 --- a/include/linux/qed/tcp_common.h +++ b/include/linux/qed/tcp_common.h @@ -173,6 +173,7 @@ enum tcp_seg_placement_event { TCP_EVENT_ADD_ISLE_RIGHT, TCP_EVENT_ADD_ISLE_LEFT, TCP_EVENT_JOIN, + TCP_EVENT_DELETE_ISLES, TCP_EVENT_NOP, MAX_TCP_SEG_PLACEMENT_EVENT }; -- cgit v1.2.3 From 8961df8950b1235cb7594e143a31bcc63757b660 Mon Sep 17 00:00:00 2001 From: Richard Genoud Date: Fri, 3 Mar 2017 15:13:44 +0100 Subject: tty/serial: atmel: move atmel_serial header into driver directory atmel_serial.h is only used by atmel_serial.c, so there's no need for it to lie in include/linux. 
Suggested-by: Joe Perches Signed-off-by: Richard Genoud Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 2 +- drivers/tty/serial/atmel_serial.c | 2 +- drivers/tty/serial/atmel_serial.h | 169 ++++++++++++++++++++++++++++++++++++++ include/linux/atmel_serial.h | 169 -------------------------------------- 4 files changed, 171 insertions(+), 171 deletions(-) create mode 100644 drivers/tty/serial/atmel_serial.h delete mode 100644 include/linux/atmel_serial.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index c776906f67a9..010ab746ea4b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8283,7 +8283,7 @@ MICROCHIP / ATMEL AT91 / AT32 SERIAL DRIVER M: Richard Genoud S: Maintained F: drivers/tty/serial/atmel_serial.c -F: include/linux/atmel_serial.h +F: drivers/tty/serial/atmel_serial.h MICROCHIP / ATMEL DMA DRIVER M: Ludovic Desroches diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c index 8cc152e67bfb..d9c05e05d896 100644 --- a/drivers/tty/serial/atmel_serial.c +++ b/drivers/tty/serial/atmel_serial.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include @@ -71,6 +70,7 @@ #include #include "serial_mctrl_gpio.h" +#include "atmel_serial.h" static void atmel_start_rx(struct uart_port *port); static void atmel_stop_rx(struct uart_port *port); diff --git a/drivers/tty/serial/atmel_serial.h b/drivers/tty/serial/atmel_serial.h new file mode 100644 index 000000000000..bd2560502f3c --- /dev/null +++ b/drivers/tty/serial/atmel_serial.h @@ -0,0 +1,169 @@ +/* + * include/linux/atmel_serial.h + * + * Copyright (C) 2005 Ivan Kokshaysky + * Copyright (C) SAN People + * + * USART registers. + * Based on AT91RM9200 datasheet revision E. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#ifndef ATMEL_SERIAL_H +#define ATMEL_SERIAL_H + +#define ATMEL_US_CR 0x00 /* Control Register */ +#define ATMEL_US_RSTRX BIT(2) /* Reset Receiver */ +#define ATMEL_US_RSTTX BIT(3) /* Reset Transmitter */ +#define ATMEL_US_RXEN BIT(4) /* Receiver Enable */ +#define ATMEL_US_RXDIS BIT(5) /* Receiver Disable */ +#define ATMEL_US_TXEN BIT(6) /* Transmitter Enable */ +#define ATMEL_US_TXDIS BIT(7) /* Transmitter Disable */ +#define ATMEL_US_RSTSTA BIT(8) /* Reset Status Bits */ +#define ATMEL_US_STTBRK BIT(9) /* Start Break */ +#define ATMEL_US_STPBRK BIT(10) /* Stop Break */ +#define ATMEL_US_STTTO BIT(11) /* Start Time-out */ +#define ATMEL_US_SENDA BIT(12) /* Send Address */ +#define ATMEL_US_RSTIT BIT(13) /* Reset Iterations */ +#define ATMEL_US_RSTNACK BIT(14) /* Reset Non Acknowledge */ +#define ATMEL_US_RETTO BIT(15) /* Rearm Time-out */ +#define ATMEL_US_DTREN BIT(16) /* Data Terminal Ready Enable */ +#define ATMEL_US_DTRDIS BIT(17) /* Data Terminal Ready Disable */ +#define ATMEL_US_RTSEN BIT(18) /* Request To Send Enable */ +#define ATMEL_US_RTSDIS BIT(19) /* Request To Send Disable */ +#define ATMEL_US_TXFCLR BIT(24) /* Transmit FIFO Clear */ +#define ATMEL_US_RXFCLR BIT(25) /* Receive FIFO Clear */ +#define ATMEL_US_TXFLCLR BIT(26) /* Transmit FIFO Lock Clear */ +#define ATMEL_US_FIFOEN BIT(30) /* FIFO enable */ +#define ATMEL_US_FIFODIS BIT(31) /* FIFO disable */ + +#define ATMEL_US_MR 0x04 /* Mode Register */ +#define ATMEL_US_USMODE GENMASK(3, 0) /* Mode of the USART */ +#define ATMEL_US_USMODE_NORMAL 0 +#define ATMEL_US_USMODE_RS485 1 +#define ATMEL_US_USMODE_HWHS 2 +#define ATMEL_US_USMODE_MODEM 3 +#define ATMEL_US_USMODE_ISO7816_T0 4 +#define ATMEL_US_USMODE_ISO7816_T1 6 +#define ATMEL_US_USMODE_IRDA 8 +#define ATMEL_US_USCLKS GENMASK(5, 4) /* Clock Selection */ +#define ATMEL_US_USCLKS_MCK (0 << 4) +#define ATMEL_US_USCLKS_MCK_DIV8 (1 << 4) +#define ATMEL_US_USCLKS_SCK (3 << 4) +#define ATMEL_US_CHRL GENMASK(7, 6) /* Character Length */ +#define ATMEL_US_CHRL_5 (0 << 6) +#define ATMEL_US_CHRL_6 (1 << 6) +#define ATMEL_US_CHRL_7 (2 << 6) +#define ATMEL_US_CHRL_8 (3 << 6) +#define ATMEL_US_SYNC BIT(8) /* Synchronous Mode Select */ +#define ATMEL_US_PAR GENMASK(11, 9) /* Parity Type */ +#define ATMEL_US_PAR_EVEN (0 << 9) +#define ATMEL_US_PAR_ODD (1 << 9) +#define ATMEL_US_PAR_SPACE (2 << 9) +#define ATMEL_US_PAR_MARK (3 << 9) +#define ATMEL_US_PAR_NONE (4 << 9) +#define ATMEL_US_PAR_MULTI_DROP (6 << 9) +#define ATMEL_US_NBSTOP GENMASK(13, 12) /* Number of Stop Bits */ +#define ATMEL_US_NBSTOP_1 (0 << 12) +#define ATMEL_US_NBSTOP_1_5 (1 << 12) +#define ATMEL_US_NBSTOP_2 (2 << 12) +#define ATMEL_US_CHMODE GENMASK(15, 14) /* Channel Mode */ +#define ATMEL_US_CHMODE_NORMAL (0 << 14) +#define ATMEL_US_CHMODE_ECHO (1 << 14) +#define ATMEL_US_CHMODE_LOC_LOOP (2 << 14) +#define ATMEL_US_CHMODE_REM_LOOP (3 << 14) +#define ATMEL_US_MSBF BIT(16) /* Bit Order */ +#define ATMEL_US_MODE9 BIT(17) /* 9-bit Character Length */ +#define ATMEL_US_CLKO BIT(18) /* Clock Output Select */ +#define ATMEL_US_OVER BIT(19) /* Oversampling Mode */ +#define ATMEL_US_INACK BIT(20) /* Inhibit Non Acknowledge */ +#define ATMEL_US_DSNACK BIT(21) /* Disable Successive NACK */ +#define ATMEL_US_MAX_ITER GENMASK(26, 24) /* Max Iterations */ +#define ATMEL_US_FILTER BIT(28) /* Infrared Receive Line Filter */ + +#define ATMEL_US_IER 0x08 /* Interrupt Enable Register */ +#define ATMEL_US_RXRDY BIT(0) /* Receiver Ready */ +#define ATMEL_US_TXRDY BIT(1) /* Transmitter Ready */ +#define ATMEL_US_RXBRK BIT(2) 
/* Break Received / End of Break */ +#define ATMEL_US_ENDRX BIT(3) /* End of Receiver Transfer */ +#define ATMEL_US_ENDTX BIT(4) /* End of Transmitter Transfer */ +#define ATMEL_US_OVRE BIT(5) /* Overrun Error */ +#define ATMEL_US_FRAME BIT(6) /* Framing Error */ +#define ATMEL_US_PARE BIT(7) /* Parity Error */ +#define ATMEL_US_TIMEOUT BIT(8) /* Receiver Time-out */ +#define ATMEL_US_TXEMPTY BIT(9) /* Transmitter Empty */ +#define ATMEL_US_ITERATION BIT(10) /* Max number of Repetitions Reached */ +#define ATMEL_US_TXBUFE BIT(11) /* Transmission Buffer Empty */ +#define ATMEL_US_RXBUFF BIT(12) /* Reception Buffer Full */ +#define ATMEL_US_NACK BIT(13) /* Non Acknowledge */ +#define ATMEL_US_RIIC BIT(16) /* Ring Indicator Input Change */ +#define ATMEL_US_DSRIC BIT(17) /* Data Set Ready Input Change */ +#define ATMEL_US_DCDIC BIT(18) /* Data Carrier Detect Input Change */ +#define ATMEL_US_CTSIC BIT(19) /* Clear to Send Input Change */ +#define ATMEL_US_RI BIT(20) /* RI */ +#define ATMEL_US_DSR BIT(21) /* DSR */ +#define ATMEL_US_DCD BIT(22) /* DCD */ +#define ATMEL_US_CTS BIT(23) /* CTS */ + +#define ATMEL_US_IDR 0x0c /* Interrupt Disable Register */ +#define ATMEL_US_IMR 0x10 /* Interrupt Mask Register */ +#define ATMEL_US_CSR 0x14 /* Channel Status Register */ +#define ATMEL_US_RHR 0x18 /* Receiver Holding Register */ +#define ATMEL_US_THR 0x1c /* Transmitter Holding Register */ +#define ATMEL_US_SYNH BIT(15) /* Transmit/Receive Sync */ + +#define ATMEL_US_BRGR 0x20 /* Baud Rate Generator Register */ +#define ATMEL_US_CD GENMASK(15, 0) /* Clock Divider */ +#define ATMEL_US_FP_OFFSET 16 /* Fractional Part */ +#define ATMEL_US_FP_MASK 0x7 + +#define ATMEL_US_RTOR 0x24 /* Receiver Time-out Register for USART */ +#define ATMEL_UA_RTOR 0x28 /* Receiver Time-out Register for UART */ +#define ATMEL_US_TO GENMASK(15, 0) /* Time-out Value */ + +#define ATMEL_US_TTGR 0x28 /* Transmitter Timeguard Register */ +#define ATMEL_US_TG GENMASK(7, 0) /* Timeguard Value */ + +#define ATMEL_US_FIDI 0x40 /* FI DI Ratio Register */ +#define ATMEL_US_NER 0x44 /* Number of Errors Register */ +#define ATMEL_US_IF 0x4c /* IrDA Filter Register */ + +#define ATMEL_US_CMPR 0x90 /* Comparaison Register */ +#define ATMEL_US_FMR 0xa0 /* FIFO Mode Register */ +#define ATMEL_US_TXRDYM(data) (((data) & 0x3) << 0) /* TX Ready Mode */ +#define ATMEL_US_RXRDYM(data) (((data) & 0x3) << 4) /* RX Ready Mode */ +#define ATMEL_US_ONE_DATA 0x0 +#define ATMEL_US_TWO_DATA 0x1 +#define ATMEL_US_FOUR_DATA 0x2 +#define ATMEL_US_FRTSC BIT(7) /* FIFO RTS pin Control */ +#define ATMEL_US_TXFTHRES(thr) (((thr) & 0x3f) << 8) /* TX FIFO Threshold */ +#define ATMEL_US_RXFTHRES(thr) (((thr) & 0x3f) << 16) /* RX FIFO Threshold */ +#define ATMEL_US_RXFTHRES2(thr) (((thr) & 0x3f) << 24) /* RX FIFO Threshold2 */ + +#define ATMEL_US_FLR 0xa4 /* FIFO Level Register */ +#define ATMEL_US_TXFL(reg) (((reg) >> 0) & 0x3f) /* TX FIFO Level */ +#define ATMEL_US_RXFL(reg) (((reg) >> 16) & 0x3f) /* RX FIFO Level */ + +#define ATMEL_US_FIER 0xa8 /* FIFO Interrupt Enable Register */ +#define ATMEL_US_FIDR 0xac /* FIFO Interrupt Disable Register */ +#define ATMEL_US_FIMR 0xb0 /* FIFO Interrupt Mask Register */ +#define ATMEL_US_FESR 0xb4 /* FIFO Event Status Register */ +#define ATMEL_US_TXFEF BIT(0) /* Transmit FIFO Empty Flag */ +#define ATMEL_US_TXFFF BIT(1) /* Transmit FIFO Full Flag */ +#define ATMEL_US_TXFTHF BIT(2) /* Transmit FIFO Threshold Flag */ +#define ATMEL_US_RXFEF BIT(3) /* Receive FIFO Empty Flag */ +#define ATMEL_US_RXFFF BIT(4) /* Receive 
FIFO Full Flag */ +#define ATMEL_US_RXFTHF BIT(5) /* Receive FIFO Threshold Flag */ +#define ATMEL_US_TXFPTEF BIT(6) /* Transmit FIFO Pointer Error Flag */ +#define ATMEL_US_RXFPTEF BIT(7) /* Receive FIFO Pointer Error Flag */ +#define ATMEL_US_TXFLOCK BIT(8) /* Transmit FIFO Lock (FESR only) */ +#define ATMEL_US_RXFTHF2 BIT(9) /* Receive FIFO Threshold Flag 2 */ + +#define ATMEL_US_NAME 0xf0 /* Ip Name */ +#define ATMEL_US_VERSION 0xfc /* Ip Version */ + +#endif diff --git a/include/linux/atmel_serial.h b/include/linux/atmel_serial.h deleted file mode 100644 index bd2560502f3c..000000000000 --- a/include/linux/atmel_serial.h +++ /dev/null @@ -1,169 +0,0 @@ -/* - * include/linux/atmel_serial.h - * - * Copyright (C) 2005 Ivan Kokshaysky - * Copyright (C) SAN People - * - * USART registers. - * Based on AT91RM9200 datasheet revision E. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#ifndef ATMEL_SERIAL_H -#define ATMEL_SERIAL_H - -#define ATMEL_US_CR 0x00 /* Control Register */ -#define ATMEL_US_RSTRX BIT(2) /* Reset Receiver */ -#define ATMEL_US_RSTTX BIT(3) /* Reset Transmitter */ -#define ATMEL_US_RXEN BIT(4) /* Receiver Enable */ -#define ATMEL_US_RXDIS BIT(5) /* Receiver Disable */ -#define ATMEL_US_TXEN BIT(6) /* Transmitter Enable */ -#define ATMEL_US_TXDIS BIT(7) /* Transmitter Disable */ -#define ATMEL_US_RSTSTA BIT(8) /* Reset Status Bits */ -#define ATMEL_US_STTBRK BIT(9) /* Start Break */ -#define ATMEL_US_STPBRK BIT(10) /* Stop Break */ -#define ATMEL_US_STTTO BIT(11) /* Start Time-out */ -#define ATMEL_US_SENDA BIT(12) /* Send Address */ -#define ATMEL_US_RSTIT BIT(13) /* Reset Iterations */ -#define ATMEL_US_RSTNACK BIT(14) /* Reset Non Acknowledge */ -#define ATMEL_US_RETTO BIT(15) /* Rearm Time-out */ -#define ATMEL_US_DTREN BIT(16) /* Data Terminal Ready Enable */ -#define ATMEL_US_DTRDIS BIT(17) /* Data Terminal Ready Disable */ -#define ATMEL_US_RTSEN BIT(18) /* Request To Send Enable */ -#define ATMEL_US_RTSDIS BIT(19) /* Request To Send Disable */ -#define ATMEL_US_TXFCLR BIT(24) /* Transmit FIFO Clear */ -#define ATMEL_US_RXFCLR BIT(25) /* Receive FIFO Clear */ -#define ATMEL_US_TXFLCLR BIT(26) /* Transmit FIFO Lock Clear */ -#define ATMEL_US_FIFOEN BIT(30) /* FIFO enable */ -#define ATMEL_US_FIFODIS BIT(31) /* FIFO disable */ - -#define ATMEL_US_MR 0x04 /* Mode Register */ -#define ATMEL_US_USMODE GENMASK(3, 0) /* Mode of the USART */ -#define ATMEL_US_USMODE_NORMAL 0 -#define ATMEL_US_USMODE_RS485 1 -#define ATMEL_US_USMODE_HWHS 2 -#define ATMEL_US_USMODE_MODEM 3 -#define ATMEL_US_USMODE_ISO7816_T0 4 -#define ATMEL_US_USMODE_ISO7816_T1 6 -#define ATMEL_US_USMODE_IRDA 8 -#define ATMEL_US_USCLKS GENMASK(5, 4) /* Clock Selection */ -#define ATMEL_US_USCLKS_MCK (0 << 4) -#define ATMEL_US_USCLKS_MCK_DIV8 (1 << 4) -#define ATMEL_US_USCLKS_SCK (3 << 4) -#define ATMEL_US_CHRL GENMASK(7, 6) /* Character Length */ -#define ATMEL_US_CHRL_5 (0 << 6) -#define ATMEL_US_CHRL_6 (1 << 6) -#define ATMEL_US_CHRL_7 (2 << 6) -#define ATMEL_US_CHRL_8 (3 << 6) -#define ATMEL_US_SYNC BIT(8) /* Synchronous Mode Select */ -#define ATMEL_US_PAR GENMASK(11, 9) /* Parity Type */ -#define ATMEL_US_PAR_EVEN (0 << 9) -#define ATMEL_US_PAR_ODD (1 << 9) -#define ATMEL_US_PAR_SPACE (2 << 9) -#define ATMEL_US_PAR_MARK (3 << 9) -#define ATMEL_US_PAR_NONE (4 << 9) -#define 
ATMEL_US_PAR_MULTI_DROP (6 << 9) -#define ATMEL_US_NBSTOP GENMASK(13, 12) /* Number of Stop Bits */ -#define ATMEL_US_NBSTOP_1 (0 << 12) -#define ATMEL_US_NBSTOP_1_5 (1 << 12) -#define ATMEL_US_NBSTOP_2 (2 << 12) -#define ATMEL_US_CHMODE GENMASK(15, 14) /* Channel Mode */ -#define ATMEL_US_CHMODE_NORMAL (0 << 14) -#define ATMEL_US_CHMODE_ECHO (1 << 14) -#define ATMEL_US_CHMODE_LOC_LOOP (2 << 14) -#define ATMEL_US_CHMODE_REM_LOOP (3 << 14) -#define ATMEL_US_MSBF BIT(16) /* Bit Order */ -#define ATMEL_US_MODE9 BIT(17) /* 9-bit Character Length */ -#define ATMEL_US_CLKO BIT(18) /* Clock Output Select */ -#define ATMEL_US_OVER BIT(19) /* Oversampling Mode */ -#define ATMEL_US_INACK BIT(20) /* Inhibit Non Acknowledge */ -#define ATMEL_US_DSNACK BIT(21) /* Disable Successive NACK */ -#define ATMEL_US_MAX_ITER GENMASK(26, 24) /* Max Iterations */ -#define ATMEL_US_FILTER BIT(28) /* Infrared Receive Line Filter */ - -#define ATMEL_US_IER 0x08 /* Interrupt Enable Register */ -#define ATMEL_US_RXRDY BIT(0) /* Receiver Ready */ -#define ATMEL_US_TXRDY BIT(1) /* Transmitter Ready */ -#define ATMEL_US_RXBRK BIT(2) /* Break Received / End of Break */ -#define ATMEL_US_ENDRX BIT(3) /* End of Receiver Transfer */ -#define ATMEL_US_ENDTX BIT(4) /* End of Transmitter Transfer */ -#define ATMEL_US_OVRE BIT(5) /* Overrun Error */ -#define ATMEL_US_FRAME BIT(6) /* Framing Error */ -#define ATMEL_US_PARE BIT(7) /* Parity Error */ -#define ATMEL_US_TIMEOUT BIT(8) /* Receiver Time-out */ -#define ATMEL_US_TXEMPTY BIT(9) /* Transmitter Empty */ -#define ATMEL_US_ITERATION BIT(10) /* Max number of Repetitions Reached */ -#define ATMEL_US_TXBUFE BIT(11) /* Transmission Buffer Empty */ -#define ATMEL_US_RXBUFF BIT(12) /* Reception Buffer Full */ -#define ATMEL_US_NACK BIT(13) /* Non Acknowledge */ -#define ATMEL_US_RIIC BIT(16) /* Ring Indicator Input Change */ -#define ATMEL_US_DSRIC BIT(17) /* Data Set Ready Input Change */ -#define ATMEL_US_DCDIC BIT(18) /* Data Carrier Detect Input Change */ -#define ATMEL_US_CTSIC BIT(19) /* Clear to Send Input Change */ -#define ATMEL_US_RI BIT(20) /* RI */ -#define ATMEL_US_DSR BIT(21) /* DSR */ -#define ATMEL_US_DCD BIT(22) /* DCD */ -#define ATMEL_US_CTS BIT(23) /* CTS */ - -#define ATMEL_US_IDR 0x0c /* Interrupt Disable Register */ -#define ATMEL_US_IMR 0x10 /* Interrupt Mask Register */ -#define ATMEL_US_CSR 0x14 /* Channel Status Register */ -#define ATMEL_US_RHR 0x18 /* Receiver Holding Register */ -#define ATMEL_US_THR 0x1c /* Transmitter Holding Register */ -#define ATMEL_US_SYNH BIT(15) /* Transmit/Receive Sync */ - -#define ATMEL_US_BRGR 0x20 /* Baud Rate Generator Register */ -#define ATMEL_US_CD GENMASK(15, 0) /* Clock Divider */ -#define ATMEL_US_FP_OFFSET 16 /* Fractional Part */ -#define ATMEL_US_FP_MASK 0x7 - -#define ATMEL_US_RTOR 0x24 /* Receiver Time-out Register for USART */ -#define ATMEL_UA_RTOR 0x28 /* Receiver Time-out Register for UART */ -#define ATMEL_US_TO GENMASK(15, 0) /* Time-out Value */ - -#define ATMEL_US_TTGR 0x28 /* Transmitter Timeguard Register */ -#define ATMEL_US_TG GENMASK(7, 0) /* Timeguard Value */ - -#define ATMEL_US_FIDI 0x40 /* FI DI Ratio Register */ -#define ATMEL_US_NER 0x44 /* Number of Errors Register */ -#define ATMEL_US_IF 0x4c /* IrDA Filter Register */ - -#define ATMEL_US_CMPR 0x90 /* Comparaison Register */ -#define ATMEL_US_FMR 0xa0 /* FIFO Mode Register */ -#define ATMEL_US_TXRDYM(data) (((data) & 0x3) << 0) /* TX Ready Mode */ -#define ATMEL_US_RXRDYM(data) (((data) & 0x3) << 4) /* RX Ready Mode */ -#define 
ATMEL_US_ONE_DATA 0x0 -#define ATMEL_US_TWO_DATA 0x1 -#define ATMEL_US_FOUR_DATA 0x2 -#define ATMEL_US_FRTSC BIT(7) /* FIFO RTS pin Control */ -#define ATMEL_US_TXFTHRES(thr) (((thr) & 0x3f) << 8) /* TX FIFO Threshold */ -#define ATMEL_US_RXFTHRES(thr) (((thr) & 0x3f) << 16) /* RX FIFO Threshold */ -#define ATMEL_US_RXFTHRES2(thr) (((thr) & 0x3f) << 24) /* RX FIFO Threshold2 */ - -#define ATMEL_US_FLR 0xa4 /* FIFO Level Register */ -#define ATMEL_US_TXFL(reg) (((reg) >> 0) & 0x3f) /* TX FIFO Level */ -#define ATMEL_US_RXFL(reg) (((reg) >> 16) & 0x3f) /* RX FIFO Level */ - -#define ATMEL_US_FIER 0xa8 /* FIFO Interrupt Enable Register */ -#define ATMEL_US_FIDR 0xac /* FIFO Interrupt Disable Register */ -#define ATMEL_US_FIMR 0xb0 /* FIFO Interrupt Mask Register */ -#define ATMEL_US_FESR 0xb4 /* FIFO Event Status Register */ -#define ATMEL_US_TXFEF BIT(0) /* Transmit FIFO Empty Flag */ -#define ATMEL_US_TXFFF BIT(1) /* Transmit FIFO Full Flag */ -#define ATMEL_US_TXFTHF BIT(2) /* Transmit FIFO Threshold Flag */ -#define ATMEL_US_RXFEF BIT(3) /* Receive FIFO Empty Flag */ -#define ATMEL_US_RXFFF BIT(4) /* Receive FIFO Full Flag */ -#define ATMEL_US_RXFTHF BIT(5) /* Receive FIFO Threshold Flag */ -#define ATMEL_US_TXFPTEF BIT(6) /* Transmit FIFO Pointer Error Flag */ -#define ATMEL_US_RXFPTEF BIT(7) /* Receive FIFO Pointer Error Flag */ -#define ATMEL_US_TXFLOCK BIT(8) /* Transmit FIFO Lock (FESR only) */ -#define ATMEL_US_RXFTHF2 BIT(9) /* Receive FIFO Threshold Flag 2 */ - -#define ATMEL_US_NAME 0xf0 /* Ip Name */ -#define ATMEL_US_VERSION 0xfc /* Ip Version */ - -#endif -- cgit v1.2.3 From 83ee102a6998f808ac4c626e8f72344f0a355527 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Mon, 13 Mar 2017 17:41:32 -0700 Subject: net: phy: bcm7xxx: add support for 28nm EPHY This commit adds support for the internal fast ethernet 10/100 PHY found in the BCM7260, BCM7268, and BCM7271 devices. Signed-off-by: Doug Berger Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/bcm7xxx.c | 215 +++++++++++++++++++++++++++++++++++++++++++++- include/linux/brcmphy.h | 3 + 2 files changed, 216 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c index d1c2614dad3a..caa9f6e17f34 100644 --- a/drivers/net/phy/bcm7xxx.c +++ b/drivers/net/phy/bcm7xxx.c @@ -1,7 +1,7 @@ /* * Broadcom BCM7xxx internal transceivers support. 
* - * Copyright (C) 2014, Broadcom Corporation + * Copyright (C) 2014-2017 Broadcom * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -19,7 +19,7 @@ /* Broadcom BCM7xxx internal PHY registers */ -/* 40nm only register definitions */ +/* EPHY only register definitions */ #define MII_BCM7XXX_100TX_AUX_CTL 0x10 #define MII_BCM7XXX_100TX_FALSE_CAR 0x13 #define MII_BCM7XXX_100TX_DISC 0x14 @@ -27,6 +27,19 @@ #define MII_BCM7XXX_64CLK_MDIO BIT(12) #define MII_BCM7XXX_TEST 0x1f #define MII_BCM7XXX_SHD_MODE_2 BIT(2) +#define MII_BCM7XXX_SHD_2_ADDR_CTRL 0xe +#define MII_BCM7XXX_SHD_2_CTRL_STAT 0xf +#define MII_BCM7XXX_SHD_2_BIAS_TRIM 0x1a +#define MII_BCM7XXX_SHD_3_AN_EEE_ADV 0x3 +#define MII_BCM7XXX_SHD_3_PCS_CTRL_2 0x6 +#define MII_BCM7XXX_PCS_CTRL_2_DEF 0x4400 +#define MII_BCM7XXX_SHD_3_AN_STAT 0xb +#define MII_BCM7XXX_AN_NULL_MSG_EN BIT(0) +#define MII_BCM7XXX_AN_EEE_EN BIT(1) +#define MII_BCM7XXX_SHD_3_EEE_THRESH 0xe +#define MII_BCM7XXX_EEE_THRESH_DEF 0x50 +#define MII_BCM7XXX_SHD_3_TL4 0x23 +#define MII_BCM7XXX_TL4_RST_MSK (BIT(2) | BIT(1)) /* 28nm only register definitions */ #define MISC_ADDR(base, channel) base, channel @@ -286,6 +299,181 @@ static int phy_set_clr_bits(struct phy_device *dev, int location, return v; } +static int bcm7xxx_28nm_ephy_01_afe_config_init(struct phy_device *phydev) +{ + int ret; + + /* set shadow mode 2 */ + ret = phy_set_clr_bits(phydev, MII_BCM7XXX_TEST, + MII_BCM7XXX_SHD_MODE_2, 0); + if (ret < 0) + return ret; + + /* Set current trim values INT_trim = -1, Ext_trim =0 */ + ret = phy_write(phydev, MII_BCM7XXX_SHD_2_BIAS_TRIM, 0x3BE0); + if (ret < 0) + goto reset_shadow_mode; + + /* Cal reset */ + ret = phy_write(phydev, MII_BCM7XXX_SHD_2_ADDR_CTRL, + MII_BCM7XXX_SHD_3_TL4); + if (ret < 0) + goto reset_shadow_mode; + ret = phy_set_clr_bits(phydev, MII_BCM7XXX_SHD_2_CTRL_STAT, + MII_BCM7XXX_TL4_RST_MSK, 0); + if (ret < 0) + goto reset_shadow_mode; + + /* Cal reset disable */ + ret = phy_write(phydev, MII_BCM7XXX_SHD_2_ADDR_CTRL, + MII_BCM7XXX_SHD_3_TL4); + if (ret < 0) + goto reset_shadow_mode; + ret = phy_set_clr_bits(phydev, MII_BCM7XXX_SHD_2_CTRL_STAT, + 0, MII_BCM7XXX_TL4_RST_MSK); + if (ret < 0) + goto reset_shadow_mode; + +reset_shadow_mode: + /* reset shadow mode 2 */ + ret = phy_set_clr_bits(phydev, MII_BCM7XXX_TEST, 0, + MII_BCM7XXX_SHD_MODE_2); + if (ret < 0) + return ret; + + return 0; +} + +/* The 28nm EPHY does not support Clause 45 (MMD) used by bcm-phy-lib */ +static int bcm7xxx_28nm_ephy_apd_enable(struct phy_device *phydev) +{ + int ret; + + /* set shadow mode 1 */ + ret = phy_set_clr_bits(phydev, MII_BRCM_FET_BRCMTEST, + MII_BRCM_FET_BT_SRE, 0); + if (ret < 0) + return ret; + + /* Enable auto-power down */ + ret = phy_set_clr_bits(phydev, MII_BRCM_FET_SHDW_AUXSTAT2, + MII_BRCM_FET_SHDW_AS2_APDE, 0); + if (ret < 0) + return ret; + + /* reset shadow mode 1 */ + ret = phy_set_clr_bits(phydev, MII_BRCM_FET_BRCMTEST, 0, + MII_BRCM_FET_BT_SRE); + if (ret < 0) + return ret; + + return 0; +} + +static int bcm7xxx_28nm_ephy_eee_enable(struct phy_device *phydev) +{ + int ret; + + /* set shadow mode 2 */ + ret = phy_set_clr_bits(phydev, MII_BCM7XXX_TEST, + MII_BCM7XXX_SHD_MODE_2, 0); + if (ret < 0) + return ret; + + /* Advertise supported modes */ + ret = phy_write(phydev, MII_BCM7XXX_SHD_2_ADDR_CTRL, + MII_BCM7XXX_SHD_3_AN_EEE_ADV); + if (ret < 0) + goto reset_shadow_mode; + ret = phy_write(phydev, MII_BCM7XXX_SHD_2_CTRL_STAT, + MDIO_EEE_100TX); + if (ret < 0) + goto 
reset_shadow_mode; + + /* Restore Defaults */ + ret = phy_write(phydev, MII_BCM7XXX_SHD_2_ADDR_CTRL, + MII_BCM7XXX_SHD_3_PCS_CTRL_2); + if (ret < 0) + goto reset_shadow_mode; + ret = phy_write(phydev, MII_BCM7XXX_SHD_2_CTRL_STAT, + MII_BCM7XXX_PCS_CTRL_2_DEF); + if (ret < 0) + goto reset_shadow_mode; + + ret = phy_write(phydev, MII_BCM7XXX_SHD_2_ADDR_CTRL, + MII_BCM7XXX_SHD_3_EEE_THRESH); + if (ret < 0) + goto reset_shadow_mode; + ret = phy_write(phydev, MII_BCM7XXX_SHD_2_CTRL_STAT, + MII_BCM7XXX_EEE_THRESH_DEF); + if (ret < 0) + goto reset_shadow_mode; + + /* Enable EEE autonegotiation */ + ret = phy_write(phydev, MII_BCM7XXX_SHD_2_ADDR_CTRL, + MII_BCM7XXX_SHD_3_AN_STAT); + if (ret < 0) + goto reset_shadow_mode; + ret = phy_write(phydev, MII_BCM7XXX_SHD_2_CTRL_STAT, + (MII_BCM7XXX_AN_NULL_MSG_EN | MII_BCM7XXX_AN_EEE_EN)); + if (ret < 0) + goto reset_shadow_mode; + +reset_shadow_mode: + /* reset shadow mode 2 */ + ret = phy_set_clr_bits(phydev, MII_BCM7XXX_TEST, 0, + MII_BCM7XXX_SHD_MODE_2); + if (ret < 0) + return ret; + + /* Restart autoneg */ + phy_write(phydev, MII_BMCR, + (BMCR_SPEED100 | BMCR_ANENABLE | BMCR_ANRESTART)); + + return 0; +} + +static int bcm7xxx_28nm_ephy_config_init(struct phy_device *phydev) +{ + u8 rev = phydev->phy_id & ~phydev->drv->phy_id_mask; + int ret = 0; + + pr_info_once("%s: %s PHY revision: 0x%02x\n", + phydev_name(phydev), phydev->drv->name, rev); + + /* Dummy read to a register to work around a possible issue upon reset + * where the internal inverter may not allow the first MDIO transaction + * to pass the MDIO management controller, making such reads return + * 0xffff. + */ + phy_read(phydev, MII_BMSR); + + /* Apply AFE software work-around if necessary */ + if (rev == 0x01) { + ret = bcm7xxx_28nm_ephy_01_afe_config_init(phydev); + if (ret) + return ret; + } + + ret = bcm7xxx_28nm_ephy_eee_enable(phydev); + if (ret) + return ret; + + return bcm7xxx_28nm_ephy_apd_enable(phydev); +} + +static int bcm7xxx_28nm_ephy_resume(struct phy_device *phydev) +{ + int ret; + + /* Re-apply workarounds when coming out of suspend/resume */ + ret = bcm7xxx_28nm_ephy_config_init(phydev); + if (ret) + return ret; + + return genphy_config_aneg(phydev); +} + static int bcm7xxx_config_init(struct phy_device *phydev) { int ret; @@ -434,6 +622,23 @@ static int bcm7xxx_28nm_probe(struct phy_device *phydev) .probe = bcm7xxx_28nm_probe, \ } +#define BCM7XXX_28NM_EPHY(_oui, _name) \ +{ \ + .phy_id = (_oui), \ + .phy_id_mask = 0xfffffff0, \ + .name = _name, \ + .features = PHY_BASIC_FEATURES, \ + .flags = PHY_IS_INTERNAL, \ + .config_init = bcm7xxx_28nm_ephy_config_init, \ + .config_aneg = genphy_config_aneg, \ + .read_status = genphy_read_status, \ + .resume = bcm7xxx_28nm_ephy_resume, \ + .get_sset_count = bcm_phy_get_sset_count, \ + .get_strings = bcm_phy_get_strings, \ + .get_stats = bcm7xxx_28nm_get_phy_stats, \ + .probe = bcm7xxx_28nm_probe, \ +} + #define BCM7XXX_40NM_EPHY(_oui, _name) \ { \ .phy_id = (_oui), \ @@ -450,6 +655,9 @@ static int bcm7xxx_28nm_probe(struct phy_device *phydev) static struct phy_driver bcm7xxx_driver[] = { BCM7XXX_28NM_GPHY(PHY_ID_BCM7250, "Broadcom BCM7250"), + BCM7XXX_28NM_EPHY(PHY_ID_BCM7260, "Broadcom BCM7260"), + BCM7XXX_28NM_EPHY(PHY_ID_BCM7268, "Broadcom BCM7268"), + BCM7XXX_28NM_EPHY(PHY_ID_BCM7271, "Broadcom BCM7271"), BCM7XXX_28NM_GPHY(PHY_ID_BCM7278, "Broadcom BCM7278"), BCM7XXX_28NM_GPHY(PHY_ID_BCM7364, "Broadcom BCM7364"), BCM7XXX_28NM_GPHY(PHY_ID_BCM7366, "Broadcom BCM7366"), @@ -466,6 +674,9 @@ static struct phy_driver bcm7xxx_driver[] = { static struct mdio_device_id __maybe_unused bcm7xxx_tbl[] = { { PHY_ID_BCM7250, 0xfffffff0, }, + { PHY_ID_BCM7260, 0xfffffff0, }, + { PHY_ID_BCM7268, 0xfffffff0, }, + { PHY_ID_BCM7271, 0xfffffff0, }, { PHY_ID_BCM7278, 0xfffffff0, }, { PHY_ID_BCM7364, 0xfffffff0, }, { PHY_ID_BCM7366, 0xfffffff0, }, diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 55e517130311..abcda9b458ab 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -25,6 +25,9 @@ #define PHY_ID_BCM57780 0x03625d90 #define PHY_ID_BCM7250 0xae025280 +#define PHY_ID_BCM7260 0xae025190 +#define PHY_ID_BCM7268 0xae025090 +#define PHY_ID_BCM7271 0xae0253b0 #define PHY_ID_BCM7278 0xae0251a0 #define PHY_ID_BCM7364 0xae025260 #define PHY_ID_BCM7366 0x600d8490 -- cgit v1.2.3
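The 28nm EPHY helpers in the patch above all follow the same indirect-access pattern: enter shadow mode 2 through the MII_BCM7XXX_TEST register, select a shadow sub-register via MII_BCM7XXX_SHD_2_ADDR_CTRL, move the data through MII_BCM7XXX_SHD_2_CTRL_STAT, and leave shadow mode again. A minimal sketch of one such write, reusing the register constants and the phy_set_clr_bits() helper from the patch; the function name shd2_write() is hypothetical and not part of the driver:

static int shd2_write(struct phy_device *phydev, u16 shadow, u16 val)
{
	int ret, ret2;

	/* enter shadow mode 2 */
	ret = phy_set_clr_bits(phydev, MII_BCM7XXX_TEST,
			       MII_BCM7XXX_SHD_MODE_2, 0);
	if (ret < 0)
		return ret;

	/* select the shadow sub-register, then push the data through it */
	ret = phy_write(phydev, MII_BCM7XXX_SHD_2_ADDR_CTRL, shadow);
	if (ret >= 0)
		ret = phy_write(phydev, MII_BCM7XXX_SHD_2_CTRL_STAT, val);

	/* always leave shadow mode 2, even if a write above failed */
	ret2 = phy_set_clr_bits(phydev, MII_BCM7XXX_TEST, 0,
				MII_BCM7XXX_SHD_MODE_2);
	if (ret >= 0 && ret2 < 0)
		ret = ret2;

	return ret < 0 ? ret : 0;
}

bcm7xxx_28nm_ephy_eee_enable() above is essentially four such writes (AN_EEE_ADV, PCS_CTRL_2, EEE_THRESH, AN_STAT) sharing a single enter/leave pair, which is why its error paths all funnel through the reset_shadow_mode label.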
From 9c79ddaa0f962d1f26537a670b0652ff509a6fe0 Mon Sep 17 00:00:00 2001 From: "Mintz, Yuval" Date: Tue, 14 Mar 2017 16:23:54 +0200 Subject: qed*: Add support for QL41xxx adapters This adds the necessary infrastructure changes for initializing and working with the new series of QL41xxx adapters. It also adds 2 new PCI device-IDs to qede: - 0x8070 for QL41xxx PFs - 0x8090 for VFs spawning from QL41xxx PFs Signed-off-by: Tomer Tayar Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed.h | 29 +++- drivers/net/ethernet/qlogic/qed/qed_debug.c | 2 +- drivers/net/ethernet/qlogic/qed/qed_dev.c | 186 ++++++++++++++++---- drivers/net/ethernet/qlogic/qed/qed_hsi.h | 61 +++++-- drivers/net/ethernet/qlogic/qed/qed_l2.c | 184 ++++++++++++-------- drivers/net/ethernet/qlogic/qed/qed_main.c | 7 +- drivers/net/ethernet/qlogic/qed/qed_mcp.h | 9 +- drivers/net/ethernet/qlogic/qed/qed_ptp.c | 12 +- drivers/net/ethernet/qlogic/qed/qed_reg_addr.h | 17 +- drivers/net/ethernet/qlogic/qed/qed_sriov.c | 30 +++- drivers/net/ethernet/qlogic/qede/qede.h | 43 +++-- drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 85 +++++---- drivers/net/ethernet/qlogic/qede/qede_main.c | 222 +++++++++++++----------- include/linux/qed/qed_if.h | 48 +++-- include/linux/qed/rdma_common.h | 3 +- 15 files changed, 630 insertions(+), 308 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h index be99092d7209..ca30a27df035 100644 --- a/drivers/net/ethernet/qlogic/qed/qed.h +++ b/drivers/net/ethernet/qlogic/qed/qed.h @@ -219,7 +219,9 @@ enum QED_PORT_MODE { QED_PORT_MODE_DE_4X20G, QED_PORT_MODE_DE_1X40G, QED_PORT_MODE_DE_2X25G, - QED_PORT_MODE_DE_1X25G + QED_PORT_MODE_DE_1X25G, + QED_PORT_MODE_DE_4X25G, + QED_PORT_MODE_DE_2X10G, }; enum qed_dev_cap { @@ -364,7 +366,8 @@ struct qed_hwfn { #define IS_LEAD_HWFN(edev) (!((edev)->my_id)) u8 rel_pf_id; /* Relative to engine*/ u8 abs_pf_id; -#define QED_PATH_ID(_p_hwfn) ((_p_hwfn)->abs_pf_id & 1) +#define QED_PATH_ID(_p_hwfn) \ + (QED_IS_K2((_p_hwfn)->cdev) ?
0 : ((_p_hwfn)->abs_pf_id & 1)) u8 port_id; bool b_active; @@ -523,9 +526,7 @@ struct qed_dev { u8 dp_level; char name[NAME_SIZE]; - u8 type; -#define QED_DEV_TYPE_BB (0 << 0) -#define QED_DEV_TYPE_AH BIT(0) + enum qed_dev_type type; /* Translate type/revision combo into the proper conditions */ #define QED_IS_BB(dev) ((dev)->type == QED_DEV_TYPE_BB) #define QED_IS_BB_A0(dev) (QED_IS_BB(dev) && \ @@ -540,6 +541,9 @@ struct qed_dev { u16 vendor_id; u16 device_id; +#define QED_DEV_ID_MASK 0xff00 +#define QED_DEV_ID_MASK_BB 0x1600 +#define QED_DEV_ID_MASK_AH 0x8000 u16 chip_num; #define CHIP_NUM_MASK 0xffff @@ -654,10 +658,16 @@ struct qed_dev { u32 rdma_max_srq_sge; }; -#define NUM_OF_VFS(dev) MAX_NUM_VFS_BB -#define NUM_OF_L2_QUEUES(dev) MAX_NUM_L2_QUEUES_BB -#define NUM_OF_SBS(dev) MAX_SB_PER_PATH_BB -#define NUM_OF_ENG_PFS(dev) MAX_NUM_PFS_BB +#define NUM_OF_VFS(dev) (QED_IS_BB(dev) ? MAX_NUM_VFS_BB \ + : MAX_NUM_VFS_K2) +#define NUM_OF_L2_QUEUES(dev) (QED_IS_BB(dev) ? MAX_NUM_L2_QUEUES_BB \ + : MAX_NUM_L2_QUEUES_K2) +#define NUM_OF_PORTS(dev) (QED_IS_BB(dev) ? MAX_NUM_PORTS_BB \ + : MAX_NUM_PORTS_K2) +#define NUM_OF_SBS(dev) (QED_IS_BB(dev) ? MAX_SB_PER_PATH_BB \ + : MAX_SB_PER_PATH_K2) +#define NUM_OF_ENG_PFS(dev) (QED_IS_BB(dev) ? MAX_NUM_PFS_BB \ + : MAX_NUM_PFS_K2) /** * @brief qed_concrete_to_sw_fid - get the sw function id from @@ -694,6 +704,7 @@ void qed_configure_vp_wfq_on_link_change(struct qed_dev *cdev, void qed_clean_wfq_db(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); #define QED_LEADING_HWFN(dev) (&dev->hwfns[0]) +int qed_device_num_engines(struct qed_dev *cdev); /* Other Linux specific common definitions */ #define DP_NAME(cdev) ((cdev)->name) diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c index 5e81e8a7a109..483241b4b05d 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_debug.c +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c @@ -1557,7 +1557,7 @@ static enum dbg_status qed_dbg_dev_init(struct qed_hwfn *p_hwfn, dev_data->mode_enable[MODE_K2] = 1; } else if (QED_IS_BB_B0(p_hwfn->cdev)) { dev_data->chip_id = CHIP_BB_B0; - dev_data->mode_enable[MODE_BB_B0] = 1; + dev_data->mode_enable[MODE_BB] = 1; } else { return DBG_STATUS_UNKNOWN_CHIP; } diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index e2a081ceaf52..bd4f43ffb5a1 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -674,11 +674,19 @@ int qed_final_cleanup(struct qed_hwfn *p_hwfn, return rc; } -static void qed_calc_hw_mode(struct qed_hwfn *p_hwfn) +static int qed_calc_hw_mode(struct qed_hwfn *p_hwfn) { int hw_mode = 0; - hw_mode = (1 << MODE_BB_B0); + if (QED_IS_BB_B0(p_hwfn->cdev)) { + hw_mode |= 1 << MODE_BB; + } else if (QED_IS_AH(p_hwfn->cdev)) { + hw_mode |= 1 << MODE_K2; + } else { + DP_NOTICE(p_hwfn, "Unknown chip type %#x\n", + p_hwfn->cdev->type); + return -EINVAL; + } switch (p_hwfn->cdev->num_ports_in_engines) { case 1: @@ -693,7 +701,7 @@ static void qed_calc_hw_mode(struct qed_hwfn *p_hwfn) default: DP_NOTICE(p_hwfn, "num_ports_in_engine = %d not supported\n", p_hwfn->cdev->num_ports_in_engines); - return; + return -EINVAL; } switch (p_hwfn->cdev->mf_mode) { @@ -719,6 +727,8 @@ static void qed_calc_hw_mode(struct qed_hwfn *p_hwfn) DP_VERBOSE(p_hwfn, (NETIF_MSG_PROBE | NETIF_MSG_IFUP), "Configuring function for hw_mode: 0x%08x\n", p_hwfn->hw_info.hw_mode); + + return 0; } /* Init run time data for all PFs on an engine. 
*/ @@ -754,10 +764,10 @@ static int qed_hw_init_common(struct qed_hwfn *p_hwfn, struct qed_qm_info *qm_info = &p_hwfn->qm_info; struct qed_qm_common_rt_init_params params; struct qed_dev *cdev = p_hwfn->cdev; + u8 vf_id, max_num_vfs; u16 num_pfs, pf_id; u32 concrete_fid; int rc = 0; - u8 vf_id; qed_init_cau_rt_data(cdev); @@ -814,7 +824,8 @@ static int qed_hw_init_common(struct qed_hwfn *p_hwfn, qed_fid_pretend(p_hwfn, p_ptt, p_hwfn->rel_pf_id); } - for (vf_id = 0; vf_id < MAX_NUM_VFS_BB; vf_id++) { + max_num_vfs = QED_IS_AH(cdev) ? MAX_NUM_VFS_K2 : MAX_NUM_VFS_BB; + for (vf_id = 0; vf_id < max_num_vfs; vf_id++) { concrete_fid = qed_vfid_to_concrete(p_hwfn, vf_id); qed_fid_pretend(p_hwfn, p_ptt, (u16) concrete_fid); qed_wr(p_hwfn, p_ptt, CCFC_REG_STRONG_ENABLE_VF, 0x1); @@ -1135,7 +1146,9 @@ int qed_hw_init(struct qed_dev *cdev, /* Enable DMAE in PXP */ rc = qed_change_pci_hwfn(p_hwfn, p_hwfn->p_main_ptt, true); - qed_calc_hw_mode(p_hwfn); + rc = qed_calc_hw_mode(p_hwfn); + if (rc) + return rc; rc = qed_mcp_load_req(p_hwfn, p_hwfn->p_main_ptt, &load_code); if (rc) { @@ -1485,10 +1498,25 @@ static void qed_hw_hwfn_free(struct qed_hwfn *p_hwfn) static void qed_hw_hwfn_prepare(struct qed_hwfn *p_hwfn) { /* clear indirect access */ - qed_wr(p_hwfn, p_hwfn->p_main_ptt, PGLUE_B_REG_PGL_ADDR_88_F0, 0); - qed_wr(p_hwfn, p_hwfn->p_main_ptt, PGLUE_B_REG_PGL_ADDR_8C_F0, 0); - qed_wr(p_hwfn, p_hwfn->p_main_ptt, PGLUE_B_REG_PGL_ADDR_90_F0, 0); - qed_wr(p_hwfn, p_hwfn->p_main_ptt, PGLUE_B_REG_PGL_ADDR_94_F0, 0); + if (QED_IS_AH(p_hwfn->cdev)) { + qed_wr(p_hwfn, p_hwfn->p_main_ptt, + PGLUE_B_REG_PGL_ADDR_E8_F0_K2, 0); + qed_wr(p_hwfn, p_hwfn->p_main_ptt, + PGLUE_B_REG_PGL_ADDR_EC_F0_K2, 0); + qed_wr(p_hwfn, p_hwfn->p_main_ptt, + PGLUE_B_REG_PGL_ADDR_F0_F0_K2, 0); + qed_wr(p_hwfn, p_hwfn->p_main_ptt, + PGLUE_B_REG_PGL_ADDR_F4_F0_K2, 0); + } else { + qed_wr(p_hwfn, p_hwfn->p_main_ptt, + PGLUE_B_REG_PGL_ADDR_88_F0_BB, 0); + qed_wr(p_hwfn, p_hwfn->p_main_ptt, + PGLUE_B_REG_PGL_ADDR_8C_F0_BB, 0); + qed_wr(p_hwfn, p_hwfn->p_main_ptt, + PGLUE_B_REG_PGL_ADDR_90_F0_BB, 0); + qed_wr(p_hwfn, p_hwfn->p_main_ptt, + PGLUE_B_REG_PGL_ADDR_94_F0_BB, 0); + } /* Clean Previous errors if such exist */ qed_wr(p_hwfn, p_hwfn->p_main_ptt, @@ -1610,6 +1638,7 @@ static u32 qed_hw_get_dflt_resc_num(struct qed_hwfn *p_hwfn, enum qed_resources res_id) { u8 num_funcs = p_hwfn->num_funcs_on_engine; + bool b_ah = QED_IS_AH(p_hwfn->cdev); struct qed_sb_cnt_info sb_cnt_info; u32 dflt_resc_num = 0; @@ -1620,17 +1649,22 @@ static u32 qed_hw_get_dflt_resc_num(struct qed_hwfn *p_hwfn, dflt_resc_num = sb_cnt_info.sb_cnt; break; case QED_L2_QUEUE: - dflt_resc_num = MAX_NUM_L2_QUEUES_BB / num_funcs; + dflt_resc_num = (b_ah ? MAX_NUM_L2_QUEUES_K2 + : MAX_NUM_L2_QUEUES_BB) / num_funcs; break; case QED_VPORT: dflt_resc_num = MAX_NUM_VPORTS_BB / num_funcs; + dflt_resc_num = (b_ah ? MAX_NUM_VPORTS_K2 + : MAX_NUM_VPORTS_BB) / num_funcs; break; case QED_RSS_ENG: - dflt_resc_num = ETH_RSS_ENGINE_NUM_BB / num_funcs; + dflt_resc_num = (b_ah ? ETH_RSS_ENGINE_NUM_K2 + : ETH_RSS_ENGINE_NUM_BB) / num_funcs; break; case QED_PQ: /* The granularity of the PQs is 8 */ - dflt_resc_num = MAX_QM_TX_QUEUES_BB / num_funcs; + dflt_resc_num = (b_ah ? 
MAX_QM_TX_QUEUES_K2 + : MAX_QM_TX_QUEUES_BB) / num_funcs; dflt_resc_num &= ~0x7; break; case QED_RL: @@ -1642,7 +1676,8 @@ static u32 qed_hw_get_dflt_resc_num(struct qed_hwfn *p_hwfn, dflt_resc_num = ETH_NUM_MAC_FILTERS / num_funcs; break; case QED_ILT: - dflt_resc_num = PXP_NUM_ILT_RECORDS_BB / num_funcs; + dflt_resc_num = (b_ah ? PXP_NUM_ILT_RECORDS_K2 + : PXP_NUM_ILT_RECORDS_BB) / num_funcs; break; case QED_LL2_QUEUE: dflt_resc_num = MAX_NUM_LL2_RX_QUEUES / num_funcs; @@ -1653,7 +1688,10 @@ static u32 qed_hw_get_dflt_resc_num(struct qed_hwfn *p_hwfn, dflt_resc_num = NUM_OF_CMDQS_CQS / num_funcs; break; case QED_RDMA_STATS_QUEUE: - dflt_resc_num = RDMA_NUM_STATISTIC_COUNTERS_BB / num_funcs; + dflt_resc_num = (b_ah ? RDMA_NUM_STATISTIC_COUNTERS_K2 + : RDMA_NUM_STATISTIC_COUNTERS_BB) / + num_funcs; + break; default: break; @@ -1780,6 +1818,7 @@ out: static int qed_hw_get_resc(struct qed_hwfn *p_hwfn) { + bool b_ah = QED_IS_AH(p_hwfn->cdev); u8 res_id; int rc; @@ -1790,7 +1829,8 @@ static int qed_hw_get_resc(struct qed_hwfn *p_hwfn) } /* Sanity for ILT */ - if ((RESC_END(p_hwfn, QED_ILT) > PXP_NUM_ILT_RECORDS_BB)) { + if ((b_ah && (RESC_END(p_hwfn, QED_ILT) > PXP_NUM_ILT_RECORDS_K2)) || + (!b_ah && (RESC_END(p_hwfn, QED_ILT) > PXP_NUM_ILT_RECORDS_BB))) { DP_NOTICE(p_hwfn, "Can't assign ILT pages [%08x,...,%08x]\n", RESC_START(p_hwfn, QED_ILT), RESC_END(p_hwfn, QED_ILT) - 1); @@ -1860,9 +1900,15 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) case NVM_CFG1_GLOB_NETWORK_PORT_MODE_2X25G: p_hwfn->hw_info.port_mode = QED_PORT_MODE_DE_2X25G; break; + case NVM_CFG1_GLOB_NETWORK_PORT_MODE_2X10G: + p_hwfn->hw_info.port_mode = QED_PORT_MODE_DE_2X10G; + break; case NVM_CFG1_GLOB_NETWORK_PORT_MODE_1X25G: p_hwfn->hw_info.port_mode = QED_PORT_MODE_DE_1X25G; break; + case NVM_CFG1_GLOB_NETWORK_PORT_MODE_4X25G: + p_hwfn->hw_info.port_mode = QED_PORT_MODE_DE_4X25G; + break; default: DP_NOTICE(p_hwfn, "Unknown port mode in 0x%08x\n", core_cfg); break; @@ -1976,8 +2022,9 @@ static void qed_get_num_funcs(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) { u8 num_funcs, enabled_func_idx = p_hwfn->rel_pf_id; u32 reg_function_hide, tmp, eng_mask, low_pfs_mask; + struct qed_dev *cdev = p_hwfn->cdev; - num_funcs = MAX_NUM_PFS_BB; + num_funcs = QED_IS_AH(cdev) ? MAX_NUM_PFS_K2 : MAX_NUM_PFS_BB; /* Bit 0 of MISCS_REG_FUNCTION_HIDE indicates whether the bypass values * in the other bits are selected. 
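The qed_hw_get_dflt_resc_num() hunks just above reduce to one rule: take the per-chip maximum for a resource (the K2 constant on AH devices, the BB constant otherwise) and split it evenly across the PFs sharing the engine, with the PQ count additionally rounded down to a multiple of 8. A standalone sketch of that arithmetic, with made-up maxima standing in for the real MAX_* constants from the HSI headers:

/* illustration only: the real limits come from the qed HSI headers */
#define EX_MAX_QM_TX_QUEUES_BB 448
#define EX_MAX_QM_TX_QUEUES_K2 512

static unsigned int ex_dflt_num_pqs(bool b_ah, unsigned int num_funcs)
{
	unsigned int n = (b_ah ? EX_MAX_QM_TX_QUEUES_K2
			 : EX_MAX_QM_TX_QUEUES_BB) / num_funcs;

	/* PQs are allocated at a granularity of 8 */
	return n & ~0x7U;
}

For example, three PFs sharing a BB engine would get 448 / 3 = 149 PQs each, rounded down to 144.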
@@ -1990,12 +2037,17 @@ static void qed_get_num_funcs(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) reg_function_hide = qed_rd(p_hwfn, p_ptt, MISCS_REG_FUNCTION_HIDE); if (reg_function_hide & 0x1) { - if (QED_PATH_ID(p_hwfn) && p_hwfn->cdev->num_hwfns == 1) { - num_funcs = 0; - eng_mask = 0xaaaa; + if (QED_IS_BB(cdev)) { + if (QED_PATH_ID(p_hwfn) && cdev->num_hwfns == 1) { + num_funcs = 0; + eng_mask = 0xaaaa; + } else { + num_funcs = 1; + eng_mask = 0x5554; + } } else { num_funcs = 1; - eng_mask = 0x5554; + eng_mask = 0xfffe; } /* Get the number of the enabled functions on the engine */ @@ -2027,24 +2079,12 @@ static void qed_get_num_funcs(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) p_hwfn->enabled_func_idx, p_hwfn->num_funcs_on_engine); } -static int -qed_get_hw_info(struct qed_hwfn *p_hwfn, - struct qed_ptt *p_ptt, - enum qed_pci_personality personality) +static void qed_hw_info_port_num_bb(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) { u32 port_mode; - int rc; - /* Since all information is common, only first hwfns should do this */ - if (IS_LEAD_HWFN(p_hwfn)) { - rc = qed_iov_hw_info(p_hwfn); - if (rc) - return rc; - } - - /* Read the port mode */ - port_mode = qed_rd(p_hwfn, p_ptt, - CNIG_REG_NW_PORT_MODE_BB_B0); + port_mode = qed_rd(p_hwfn, p_ptt, CNIG_REG_NW_PORT_MODE_BB_B0); if (port_mode < 3) { p_hwfn->cdev->num_ports_in_engines = 1; @@ -2057,6 +2097,54 @@ qed_get_hw_info(struct qed_hwfn *p_hwfn, /* Default num_ports_in_engines to something */ p_hwfn->cdev->num_ports_in_engines = 1; } +} + +static void qed_hw_info_port_num_ah(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt) +{ + u32 port; + int i; + + p_hwfn->cdev->num_ports_in_engines = 0; + + for (i = 0; i < MAX_NUM_PORTS_K2; i++) { + port = qed_rd(p_hwfn, p_ptt, + CNIG_REG_NIG_PORT0_CONF_K2 + (i * 4)); + if (port & 1) + p_hwfn->cdev->num_ports_in_engines++; + } + + if (!p_hwfn->cdev->num_ports_in_engines) { + DP_NOTICE(p_hwfn, "All NIG ports are inactive\n"); + + /* Default num_ports_in_engine to something */ + p_hwfn->cdev->num_ports_in_engines = 1; + } +} + +static void qed_hw_info_port_num(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) +{ + if (QED_IS_BB(p_hwfn->cdev)) + qed_hw_info_port_num_bb(p_hwfn, p_ptt); + else + qed_hw_info_port_num_ah(p_hwfn, p_ptt); +} + +static int +qed_get_hw_info(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, + enum qed_pci_personality personality) +{ + int rc; + + /* Since all information is common, only first hwfns should do this */ + if (IS_LEAD_HWFN(p_hwfn)) { + rc = qed_iov_hw_info(p_hwfn); + if (rc) + return rc; + } + + qed_hw_info_port_num(p_hwfn, p_ptt); qed_hw_get_nvm_info(p_hwfn, p_ptt); @@ -2096,19 +2184,33 @@ qed_get_hw_info(struct qed_hwfn *p_hwfn, static int qed_get_dev_info(struct qed_dev *cdev) { struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + u16 device_id_mask; u32 tmp; /* Read Vendor Id / Device Id */ pci_read_config_word(cdev->pdev, PCI_VENDOR_ID, &cdev->vendor_id); pci_read_config_word(cdev->pdev, PCI_DEVICE_ID, &cdev->device_id); + /* Determine type */ + device_id_mask = cdev->device_id & QED_DEV_ID_MASK; + switch (device_id_mask) { + case QED_DEV_ID_MASK_BB: + cdev->type = QED_DEV_TYPE_BB; + break; + case QED_DEV_ID_MASK_AH: + cdev->type = QED_DEV_TYPE_AH; + break; + default: + DP_NOTICE(p_hwfn, "Unknown device id 0x%x\n", cdev->device_id); + return -EBUSY; + } + cdev->chip_num = (u16)qed_rd(p_hwfn, p_hwfn->p_main_ptt, MISCS_REG_CHIP_NUM); cdev->chip_rev = (u16)qed_rd(p_hwfn, p_hwfn->p_main_ptt, MISCS_REG_CHIP_REV); MASK_FIELD(CHIP_REV, cdev->chip_rev); 
- cdev->type = QED_DEV_TYPE_BB; /* Learn number of HW-functions */ tmp = qed_rd(p_hwfn, p_hwfn->p_main_ptt, MISCS_REG_CMT_ENABLED_FOR_PAIR); @@ -2128,7 +2230,10 @@ static int qed_get_dev_info(struct qed_dev *cdev) MASK_FIELD(CHIP_METAL, cdev->chip_metal); DP_INFO(cdev->hwfns, - "Chip details - Num: %04x Rev: %04x Bond id: %04x Metal: %04x\n", + "Chip details - %s %c%d, Num: %04x Rev: %04x Bond id: %04x Metal: %04x\n", + QED_IS_BB(cdev) ? "BB" : "AH", + 'A' + cdev->chip_rev, + (int)cdev->chip_metal, cdev->chip_num, cdev->chip_rev, cdev->chip_bond_id, cdev->chip_metal); @@ -3364,3 +3469,8 @@ void qed_clean_wfq_db(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) memset(p_hwfn->qm_info.wfq_data, 0, sizeof(*p_hwfn->qm_info.wfq_data) * p_hwfn->qm_info.num_vports); } + +int qed_device_num_engines(struct qed_dev *cdev) +{ + return QED_IS_BB(cdev) ? 2 : 1; +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h index 88ca78a297a0..e9acdc96ba84 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h +++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h @@ -2502,7 +2502,7 @@ struct fw_info_location { enum init_modes { MODE_RESERVED, - MODE_BB_B0, + MODE_BB, MODE_K2, MODE_ASIC, MODE_RESERVED2, @@ -9431,11 +9431,24 @@ struct eth_stats { u64 r511; u64 r1023; u64 r1518; - u64 r1522; - u64 r2047; - u64 r4095; - u64 r9216; - u64 r16383; + + union { + struct { + u64 r1522; + u64 r2047; + u64 r4095; + u64 r9216; + u64 r16383; + } bb0; + struct { + u64 unused1; + u64 r1519_to_max; + u64 unused2; + u64 unused3; + u64 unused4; + } ah0; + } u0; + u64 rfcs; u64 rxcf; u64 rxpf; @@ -9452,14 +9465,36 @@ struct eth_stats { u64 t511; u64 t1023; u64 t1518; - u64 t2047; - u64 t4095; - u64 t9216; - u64 t16383; + + union { + struct { + u64 t2047; + u64 t4095; + u64 t9216; + u64 t16383; + } bb1; + struct { + u64 t1519_to_max; + u64 unused6; + u64 unused7; + u64 unused8; + } ah1; + } u1; + u64 txpf; u64 txpp; - u64 tlpiec; - u64 tncl; + + union { + struct { + u64 tlpiec; + u64 tncl; + } bb2; + struct { + u64 unused9; + u64 unused10; + } ah2; + } u2; + u64 rbyte; u64 rxuca; u64 rxmca; @@ -10263,6 +10298,8 @@ struct nvm_cfg1_glob { #define NVM_CFG1_GLOB_NETWORK_PORT_MODE_2X25G 0xC #define NVM_CFG1_GLOB_NETWORK_PORT_MODE_1X25G 0xD #define NVM_CFG1_GLOB_NETWORK_PORT_MODE_4X25G 0xE +#define NVM_CFG1_GLOB_NETWORK_PORT_MODE_2X10G 0xF + u32 e_lane_cfg1; u32 e_lane_cfg2; u32 f_lane_cfg1; diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c index df932be5a4e5..4385ccbb5efb 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_l2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c @@ -1470,13 +1470,20 @@ static void __qed_get_vport_pstats(struct qed_hwfn *p_hwfn, memset(&pstats, 0, sizeof(pstats)); qed_memcpy_from(p_hwfn, p_ptt, &pstats, pstats_addr, pstats_len); - p_stats->tx_ucast_bytes += HILO_64_REGPAIR(pstats.sent_ucast_bytes); - p_stats->tx_mcast_bytes += HILO_64_REGPAIR(pstats.sent_mcast_bytes); - p_stats->tx_bcast_bytes += HILO_64_REGPAIR(pstats.sent_bcast_bytes); - p_stats->tx_ucast_pkts += HILO_64_REGPAIR(pstats.sent_ucast_pkts); - p_stats->tx_mcast_pkts += HILO_64_REGPAIR(pstats.sent_mcast_pkts); - p_stats->tx_bcast_pkts += HILO_64_REGPAIR(pstats.sent_bcast_pkts); - p_stats->tx_err_drop_pkts += HILO_64_REGPAIR(pstats.error_drop_pkts); + p_stats->common.tx_ucast_bytes += + HILO_64_REGPAIR(pstats.sent_ucast_bytes); + p_stats->common.tx_mcast_bytes += + HILO_64_REGPAIR(pstats.sent_mcast_bytes); + p_stats->common.tx_bcast_bytes += + 
HILO_64_REGPAIR(pstats.sent_bcast_bytes); + p_stats->common.tx_ucast_pkts += + HILO_64_REGPAIR(pstats.sent_ucast_pkts); + p_stats->common.tx_mcast_pkts += + HILO_64_REGPAIR(pstats.sent_mcast_pkts); + p_stats->common.tx_bcast_pkts += + HILO_64_REGPAIR(pstats.sent_bcast_pkts); + p_stats->common.tx_err_drop_pkts += + HILO_64_REGPAIR(pstats.error_drop_pkts); } static void __qed_get_vport_tstats(struct qed_hwfn *p_hwfn, @@ -1502,10 +1509,10 @@ static void __qed_get_vport_tstats(struct qed_hwfn *p_hwfn, memset(&tstats, 0, sizeof(tstats)); qed_memcpy_from(p_hwfn, p_ptt, &tstats, tstats_addr, tstats_len); - p_stats->mftag_filter_discards += - HILO_64_REGPAIR(tstats.mftag_filter_discard); - p_stats->mac_filter_discards += - HILO_64_REGPAIR(tstats.eth_mac_filter_discard); + p_stats->common.mftag_filter_discards += + HILO_64_REGPAIR(tstats.mftag_filter_discard); + p_stats->common.mac_filter_discards += + HILO_64_REGPAIR(tstats.eth_mac_filter_discard); } static void __qed_get_vport_ustats_addrlen(struct qed_hwfn *p_hwfn, @@ -1539,12 +1546,15 @@ static void __qed_get_vport_ustats(struct qed_hwfn *p_hwfn, memset(&ustats, 0, sizeof(ustats)); qed_memcpy_from(p_hwfn, p_ptt, &ustats, ustats_addr, ustats_len); - p_stats->rx_ucast_bytes += HILO_64_REGPAIR(ustats.rcv_ucast_bytes); - p_stats->rx_mcast_bytes += HILO_64_REGPAIR(ustats.rcv_mcast_bytes); - p_stats->rx_bcast_bytes += HILO_64_REGPAIR(ustats.rcv_bcast_bytes); - p_stats->rx_ucast_pkts += HILO_64_REGPAIR(ustats.rcv_ucast_pkts); - p_stats->rx_mcast_pkts += HILO_64_REGPAIR(ustats.rcv_mcast_pkts); - p_stats->rx_bcast_pkts += HILO_64_REGPAIR(ustats.rcv_bcast_pkts); + p_stats->common.rx_ucast_bytes += + HILO_64_REGPAIR(ustats.rcv_ucast_bytes); + p_stats->common.rx_mcast_bytes += + HILO_64_REGPAIR(ustats.rcv_mcast_bytes); + p_stats->common.rx_bcast_bytes += + HILO_64_REGPAIR(ustats.rcv_bcast_bytes); + p_stats->common.rx_ucast_pkts += HILO_64_REGPAIR(ustats.rcv_ucast_pkts); + p_stats->common.rx_mcast_pkts += HILO_64_REGPAIR(ustats.rcv_mcast_pkts); + p_stats->common.rx_bcast_pkts += HILO_64_REGPAIR(ustats.rcv_bcast_pkts); } static void __qed_get_vport_mstats_addrlen(struct qed_hwfn *p_hwfn, @@ -1578,23 +1588,26 @@ static void __qed_get_vport_mstats(struct qed_hwfn *p_hwfn, memset(&mstats, 0, sizeof(mstats)); qed_memcpy_from(p_hwfn, p_ptt, &mstats, mstats_addr, mstats_len); - p_stats->no_buff_discards += HILO_64_REGPAIR(mstats.no_buff_discard); - p_stats->packet_too_big_discard += - HILO_64_REGPAIR(mstats.packet_too_big_discard); - p_stats->ttl0_discard += HILO_64_REGPAIR(mstats.ttl0_discard); - p_stats->tpa_coalesced_pkts += - HILO_64_REGPAIR(mstats.tpa_coalesced_pkts); - p_stats->tpa_coalesced_events += - HILO_64_REGPAIR(mstats.tpa_coalesced_events); - p_stats->tpa_aborts_num += HILO_64_REGPAIR(mstats.tpa_aborts_num); - p_stats->tpa_coalesced_bytes += - HILO_64_REGPAIR(mstats.tpa_coalesced_bytes); + p_stats->common.no_buff_discards += + HILO_64_REGPAIR(mstats.no_buff_discard); + p_stats->common.packet_too_big_discard += + HILO_64_REGPAIR(mstats.packet_too_big_discard); + p_stats->common.ttl0_discard += HILO_64_REGPAIR(mstats.ttl0_discard); + p_stats->common.tpa_coalesced_pkts += + HILO_64_REGPAIR(mstats.tpa_coalesced_pkts); + p_stats->common.tpa_coalesced_events += + HILO_64_REGPAIR(mstats.tpa_coalesced_events); + p_stats->common.tpa_aborts_num += + HILO_64_REGPAIR(mstats.tpa_aborts_num); + p_stats->common.tpa_coalesced_bytes += + HILO_64_REGPAIR(mstats.tpa_coalesced_bytes); } static void __qed_get_vport_port_stats(struct qed_hwfn *p_hwfn, struct qed_ptt 
*p_ptt,
					struct qed_eth_stats *p_stats)
 {
+	struct qed_eth_stats_common *p_common = &p_stats->common;
 	struct port_stats port_stats;
 	int j;

@@ -1605,54 +1618,75 @@ static void __qed_get_vport_port_stats(struct qed_hwfn *p_hwfn,
 			offsetof(struct public_port, stats),
 			sizeof(port_stats));

-	p_stats->rx_64_byte_packets += port_stats.eth.r64;
-	p_stats->rx_65_to_127_byte_packets += port_stats.eth.r127;
-	p_stats->rx_128_to_255_byte_packets += port_stats.eth.r255;
-	p_stats->rx_256_to_511_byte_packets += port_stats.eth.r511;
-	p_stats->rx_512_to_1023_byte_packets += port_stats.eth.r1023;
-	p_stats->rx_1024_to_1518_byte_packets += port_stats.eth.r1518;
-	p_stats->rx_1519_to_1522_byte_packets += port_stats.eth.r1522;
-	p_stats->rx_1519_to_2047_byte_packets += port_stats.eth.r2047;
-	p_stats->rx_2048_to_4095_byte_packets += port_stats.eth.r4095;
-	p_stats->rx_4096_to_9216_byte_packets += port_stats.eth.r9216;
-	p_stats->rx_9217_to_16383_byte_packets += port_stats.eth.r16383;
-	p_stats->rx_crc_errors += port_stats.eth.rfcs;
-	p_stats->rx_mac_crtl_frames += port_stats.eth.rxcf;
-	p_stats->rx_pause_frames += port_stats.eth.rxpf;
-	p_stats->rx_pfc_frames += port_stats.eth.rxpp;
-	p_stats->rx_align_errors += port_stats.eth.raln;
-	p_stats->rx_carrier_errors += port_stats.eth.rfcr;
-	p_stats->rx_oversize_packets += port_stats.eth.rovr;
-	p_stats->rx_jabbers += port_stats.eth.rjbr;
-	p_stats->rx_undersize_packets += port_stats.eth.rund;
-	p_stats->rx_fragments += port_stats.eth.rfrg;
-	p_stats->tx_64_byte_packets += port_stats.eth.t64;
-	p_stats->tx_65_to_127_byte_packets += port_stats.eth.t127;
-	p_stats->tx_128_to_255_byte_packets += port_stats.eth.t255;
-	p_stats->tx_256_to_511_byte_packets += port_stats.eth.t511;
-	p_stats->tx_512_to_1023_byte_packets += port_stats.eth.t1023;
-	p_stats->tx_1024_to_1518_byte_packets += port_stats.eth.t1518;
-	p_stats->tx_1519_to_2047_byte_packets += port_stats.eth.t2047;
-	p_stats->tx_2048_to_4095_byte_packets += port_stats.eth.t4095;
-	p_stats->tx_4096_to_9216_byte_packets += port_stats.eth.t9216;
-	p_stats->tx_9217_to_16383_byte_packets += port_stats.eth.t16383;
-	p_stats->tx_pause_frames += port_stats.eth.txpf;
-	p_stats->tx_pfc_frames += port_stats.eth.txpp;
-	p_stats->tx_lpi_entry_count += port_stats.eth.tlpiec;
-	p_stats->tx_total_collisions += port_stats.eth.tncl;
-	p_stats->rx_mac_bytes += port_stats.eth.rbyte;
-	p_stats->rx_mac_uc_packets += port_stats.eth.rxuca;
-	p_stats->rx_mac_mc_packets += port_stats.eth.rxmca;
-	p_stats->rx_mac_bc_packets += port_stats.eth.rxbca;
-	p_stats->rx_mac_frames_ok += port_stats.eth.rxpok;
-	p_stats->tx_mac_bytes += port_stats.eth.tbyte;
-	p_stats->tx_mac_uc_packets += port_stats.eth.txuca;
-	p_stats->tx_mac_mc_packets += port_stats.eth.txmca;
-	p_stats->tx_mac_bc_packets += port_stats.eth.txbca;
-	p_stats->tx_mac_ctrl_frames += port_stats.eth.txcf;
+	p_common->rx_64_byte_packets += port_stats.eth.r64;
+	p_common->rx_65_to_127_byte_packets += port_stats.eth.r127;
+	p_common->rx_128_to_255_byte_packets += port_stats.eth.r255;
+	p_common->rx_256_to_511_byte_packets += port_stats.eth.r511;
+	p_common->rx_512_to_1023_byte_packets += port_stats.eth.r1023;
+	p_common->rx_1024_to_1518_byte_packets += port_stats.eth.r1518;
+	p_common->rx_crc_errors += port_stats.eth.rfcs;
+	p_common->rx_mac_crtl_frames += port_stats.eth.rxcf;
+	p_common->rx_pause_frames += port_stats.eth.rxpf;
+	p_common->rx_pfc_frames += port_stats.eth.rxpp;
+	p_common->rx_align_errors += port_stats.eth.raln;
+	p_common->rx_carrier_errors += port_stats.eth.rfcr;
+	p_common->rx_oversize_packets += port_stats.eth.rovr;
+	p_common->rx_jabbers += port_stats.eth.rjbr;
+	p_common->rx_undersize_packets += port_stats.eth.rund;
+	p_common->rx_fragments += port_stats.eth.rfrg;
+	p_common->tx_64_byte_packets += port_stats.eth.t64;
+	p_common->tx_65_to_127_byte_packets += port_stats.eth.t127;
+	p_common->tx_128_to_255_byte_packets += port_stats.eth.t255;
+	p_common->tx_256_to_511_byte_packets += port_stats.eth.t511;
+	p_common->tx_512_to_1023_byte_packets += port_stats.eth.t1023;
+	p_common->tx_1024_to_1518_byte_packets += port_stats.eth.t1518;
+	p_common->tx_pause_frames += port_stats.eth.txpf;
+	p_common->tx_pfc_frames += port_stats.eth.txpp;
+	p_common->rx_mac_bytes += port_stats.eth.rbyte;
+	p_common->rx_mac_uc_packets += port_stats.eth.rxuca;
+	p_common->rx_mac_mc_packets += port_stats.eth.rxmca;
+	p_common->rx_mac_bc_packets += port_stats.eth.rxbca;
+	p_common->rx_mac_frames_ok += port_stats.eth.rxpok;
+	p_common->tx_mac_bytes += port_stats.eth.tbyte;
+	p_common->tx_mac_uc_packets += port_stats.eth.txuca;
+	p_common->tx_mac_mc_packets += port_stats.eth.txmca;
+	p_common->tx_mac_bc_packets += port_stats.eth.txbca;
+	p_common->tx_mac_ctrl_frames += port_stats.eth.txcf;
 	for (j = 0; j < 8; j++) {
-		p_stats->brb_truncates += port_stats.brb.brb_truncate[j];
-		p_stats->brb_discards += port_stats.brb.brb_discard[j];
+		p_common->brb_truncates += port_stats.brb.brb_truncate[j];
+		p_common->brb_discards += port_stats.brb.brb_discard[j];
+	}
+
+	if (QED_IS_BB(p_hwfn->cdev)) {
+		struct qed_eth_stats_bb *p_bb = &p_stats->bb;
+
+		p_bb->rx_1519_to_1522_byte_packets +=
+		    port_stats.eth.u0.bb0.r1522;
+		p_bb->rx_1519_to_2047_byte_packets +=
+		    port_stats.eth.u0.bb0.r2047;
+		p_bb->rx_2048_to_4095_byte_packets +=
+		    port_stats.eth.u0.bb0.r4095;
+		p_bb->rx_4096_to_9216_byte_packets +=
+		    port_stats.eth.u0.bb0.r9216;
+		p_bb->rx_9217_to_16383_byte_packets +=
+		    port_stats.eth.u0.bb0.r16383;
+		p_bb->tx_1519_to_2047_byte_packets +=
+		    port_stats.eth.u1.bb1.t2047;
+		p_bb->tx_2048_to_4095_byte_packets +=
+		    port_stats.eth.u1.bb1.t4095;
+		p_bb->tx_4096_to_9216_byte_packets +=
+		    port_stats.eth.u1.bb1.t9216;
+		p_bb->tx_9217_to_16383_byte_packets +=
+		    port_stats.eth.u1.bb1.t16383;
+		p_bb->tx_lpi_entry_count += port_stats.eth.u2.bb2.tlpiec;
+		p_bb->tx_total_collisions += port_stats.eth.u2.bb2.tncl;
+	} else {
+		struct qed_eth_stats_ah *p_ah = &p_stats->ah;
+
+		p_ah->rx_1519_to_max_byte_packets +=
+		    port_stats.eth.u0.ah0.r1519_to_max;
+		p_ah->tx_1519_to_max_byte_packets =
+		    port_stats.eth.u1.ah1.t1519_to_max;
 	}
 }

diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index eef30a598b40..766c6f39ea63 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -238,6 +238,7 @@ int qed_fill_dev_info(struct qed_dev *cdev,
 	dev_info->rdma_supported = (cdev->hwfns[0].hw_info.personality ==
 				    QED_PCI_ETH_ROCE);
 	dev_info->is_mf_default = IS_MF_DEFAULT(&cdev->hwfns[0]);
+	dev_info->dev_type = cdev->type;
 	ether_addr_copy(dev_info->hw_mac, cdev->hwfns[0].hw_info.hw_mac_addr);

 	if (IS_PF(cdev)) {
@@ -1653,8 +1654,10 @@ void qed_get_protocol_stats(struct qed_dev *cdev,
 	switch (type) {
 	case QED_MCP_LAN_STATS:
 		qed_get_vport_stats(cdev, &eth_stats);
-		stats->lan_stats.ucast_rx_pkts = eth_stats.rx_ucast_pkts;
-		stats->lan_stats.ucast_tx_pkts = eth_stats.tx_ucast_pkts;
+		stats->lan_stats.ucast_rx_pkts =
+			eth_stats.common.rx_ucast_pkts;
+		stats->lan_stats.ucast_tx_pkts =
+			eth_stats.common.tx_ucast_pkts;
 		stats->lan_stats.fcs_err = -1;
 		break;
 	case QED_MCP_FCOE_STATS:
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index 368e88de146c..bdbfd6d4485e 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -479,11 +479,10 @@ int qed_mcp_bist_nvm_test_get_image_att(struct qed_hwfn *p_hwfn,
 					    rel_pfid)
 #define MCP_PF_ID(p_hwfn) MCP_PF_ID_BY_REL(p_hwfn, (p_hwfn)->rel_pf_id)

-/* TODO - this is only correct as long as only BB is supported, and
- * no port-swapping is implemented; Afterwards we'll need to fix it.
- */
-#define MFW_PORT(_p_hwfn)	((_p_hwfn)->abs_pf_id %	\
-				 ((_p_hwfn)->cdev->num_ports_in_engines * 2))
+#define MFW_PORT(_p_hwfn)	((_p_hwfn)->abs_pf_id %			  \
+				 ((_p_hwfn)->cdev->num_ports_in_engines * \
+				  qed_device_num_engines((_p_hwfn)->cdev)))
+
 struct qed_mcp_info {
 	/* Spinlock used for protecting the access to the MFW mailbox */
 	spinlock_t lock;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ptp.c b/drivers/net/ethernet/qlogic/qed/qed_ptp.c
index d27aa85da23c..80c9c0b172dd 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ptp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ptp.c
@@ -262,12 +262,20 @@ static int qed_ptp_hw_enable(struct qed_dev *cdev)
 	qed_wr(p_hwfn, p_ptt, NIG_REG_TS_OUTPUT_ENABLE_PDA, 0x1);

 	/* Pause free running counter */
-	qed_wr(p_hwfn, p_ptt, NIG_REG_TIMESYNC_GEN_REG_BB, 2);
+	if (QED_IS_BB_B0(p_hwfn->cdev))
+		qed_wr(p_hwfn, p_ptt, NIG_REG_TIMESYNC_GEN_REG_BB, 2);
+	if (QED_IS_AH(p_hwfn->cdev))
+		qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_FREECNT_UPDATE_K2, 2);

 	qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_FREE_CNT_VALUE_LSB, 0);
 	qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_FREE_CNT_VALUE_MSB, 0);
 	/* Resume free running counter */
-	qed_wr(p_hwfn, p_ptt, NIG_REG_TIMESYNC_GEN_REG_BB, 4);
+	if (QED_IS_BB_B0(p_hwfn->cdev))
+		qed_wr(p_hwfn, p_ptt, NIG_REG_TIMESYNC_GEN_REG_BB, 4);
+	if (QED_IS_AH(p_hwfn->cdev)) {
+		qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_FREECNT_UPDATE_K2, 4);
+		qed_wr(p_hwfn, p_ptt, NIG_REG_PTP_LATCH_OSTS_PKT_TIME, 1);
+	}

 	/* Disable drift register */
 	qed_wr(p_hwfn, p_ptt, NIG_REG_TSGEN_DRIFT_CNTR_CONF, 0x0);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
index 36ae361884e0..6d4ac7e2ee83 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h
@@ -160,13 +160,13 @@
 	0x2e0704UL
 #define CCFC_REG_STRONG_ENABLE_PF \
 	0x2e0708UL
-#define PGLUE_B_REG_PGL_ADDR_88_F0 \
+#define PGLUE_B_REG_PGL_ADDR_88_F0_BB \
 	0x2aa404UL
-#define PGLUE_B_REG_PGL_ADDR_8C_F0 \
+#define PGLUE_B_REG_PGL_ADDR_8C_F0_BB \
 	0x2aa408UL
-#define PGLUE_B_REG_PGL_ADDR_90_F0 \
+#define PGLUE_B_REG_PGL_ADDR_90_F0_BB \
 	0x2aa40cUL
-#define PGLUE_B_REG_PGL_ADDR_94_F0 \
+#define PGLUE_B_REG_PGL_ADDR_94_F0_BB \
 	0x2aa410UL
 #define PGLUE_B_REG_WAS_ERROR_PF_31_0_CLR \
 	0x2aa138UL
@@ -1550,4 +1550,13 @@
 #define NIG_REG_TIMESYNC_GEN_REG_BB 0x500d00UL
 #define NIG_REG_TSGEN_FREE_CNT_VALUE_LSB 0x5088a8UL
 #define NIG_REG_TSGEN_FREE_CNT_VALUE_MSB 0x5088acUL
+#define NIG_REG_PTP_LATCH_OSTS_PKT_TIME 0x509040UL
+
+#define PGLUE_B_REG_PGL_ADDR_E8_F0_K2 0x2aaf98UL
+#define PGLUE_B_REG_PGL_ADDR_EC_F0_K2 0x2aaf9cUL
+#define PGLUE_B_REG_PGL_ADDR_F0_F0_K2 0x2aafa0UL
+#define PGLUE_B_REG_PGL_ADDR_F4_F0_K2 0x2aafa4UL
+#define NIG_REG_TSGEN_FREECNT_UPDATE_K2 0x509008UL
+#define CNIG_REG_NIG_PORT0_CONF_K2 0x218200UL
+
 #endif
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 253c2bbe1e4e..16f503c9b0af 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -557,14 +557,30 @@ int qed_iov_hw_info(struct qed_hwfn *p_hwfn)
 		return 0;
 	}

-	/* Calculate the first VF index - this is a bit tricky; Basically,
-	 * VFs start at offset 16 relative to PF0, and 2nd engine VFs begin
-	 * after the first engine's VFs.
+	/* First VF index based on offset is tricky:
+	 *  - If ARI is supported [likely], offset - (16 - pf_id) would
+	 *    provide the number for eng0. 2nd engine VFs would begin
+	 *    after the first engine's VFs.
+	 *  - If !ARI, VFs would start on the next device,
+	 *    so offset - (256 - pf_id) would provide the number.
+	 * Utilize the fact that (256 - pf_id) is achieved only by the
+	 * latter to differentiate between the two.
 	 */
-	cdev->p_iov_info->first_vf_in_pf = p_hwfn->cdev->p_iov_info->offset +
-					   p_hwfn->abs_pf_id - 16;
-	if (QED_PATH_ID(p_hwfn))
-		cdev->p_iov_info->first_vf_in_pf -= MAX_NUM_VFS_BB;
+
+	if (p_hwfn->cdev->p_iov_info->offset < (256 - p_hwfn->abs_pf_id)) {
+		u32 first = p_hwfn->cdev->p_iov_info->offset +
+			    p_hwfn->abs_pf_id - 16;
+
+		cdev->p_iov_info->first_vf_in_pf = first;
+
+		if (QED_PATH_ID(p_hwfn))
+			cdev->p_iov_info->first_vf_in_pf -= MAX_NUM_VFS_BB;
+	} else {
+		u32 first = p_hwfn->cdev->p_iov_info->offset +
+			    p_hwfn->abs_pf_id - 256;
+
+		cdev->p_iov_info->first_vf_in_pf = first;
+	}

 	DP_VERBOSE(p_hwfn, QED_MSG_IOV,
 		   "First VF in hwfn 0x%08x\n",
diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h
index 8d02fb6c19d7..e73a4a5165ee 100644
--- a/drivers/net/ethernet/qlogic/qede/qede.h
+++ b/drivers/net/ethernet/qlogic/qede/qede.h
@@ -58,7 +58,7 @@

 #define DRV_MODULE_SYM		qede

-struct qede_stats {
+struct qede_stats_common {
 	u64 no_buff_discards;
 	u64 packet_too_big_discard;
 	u64 ttl0_discard;
@@ -90,11 +90,6 @@ struct qede_stats {
 	u64 rx_256_to_511_byte_packets;
 	u64 rx_512_to_1023_byte_packets;
 	u64 rx_1024_to_1518_byte_packets;
-	u64 rx_1519_to_1522_byte_packets;
-	u64 rx_1519_to_2047_byte_packets;
-	u64 rx_2048_to_4095_byte_packets;
-	u64 rx_4096_to_9216_byte_packets;
-	u64 rx_9217_to_16383_byte_packets;
 	u64 rx_crc_errors;
 	u64 rx_mac_crtl_frames;
 	u64 rx_pause_frames;
@@ -111,17 +106,39 @@ struct qede_stats {
 	u64 tx_256_to_511_byte_packets;
 	u64 tx_512_to_1023_byte_packets;
 	u64 tx_1024_to_1518_byte_packets;
+	u64 tx_pause_frames;
+	u64 tx_pfc_frames;
+	u64 brb_truncates;
+	u64 brb_discards;
+	u64 tx_mac_ctrl_frames;
+};
+
+struct qede_stats_bb {
+	u64 rx_1519_to_1522_byte_packets;
+	u64 rx_1519_to_2047_byte_packets;
+	u64 rx_2048_to_4095_byte_packets;
+	u64 rx_4096_to_9216_byte_packets;
+	u64 rx_9217_to_16383_byte_packets;
 	u64 tx_1519_to_2047_byte_packets;
 	u64 tx_2048_to_4095_byte_packets;
 	u64 tx_4096_to_9216_byte_packets;
 	u64 tx_9217_to_16383_byte_packets;
-	u64 tx_pause_frames;
-	u64 tx_pfc_frames;
 	u64 tx_lpi_entry_count;
 	u64 tx_total_collisions;
-	u64 brb_truncates;
-	u64 brb_discards;
-	u64 tx_mac_ctrl_frames;
+};
+
+struct qede_stats_ah {
+	u64 rx_1519_to_max_byte_packets;
+	u64 tx_1519_to_max_byte_packets;
+};
+
+struct qede_stats {
+	struct qede_stats_common common;
+
+	union {
+		struct qede_stats_bb bb;
+		struct qede_stats_ah ah;
+	};
 };

 struct qede_vlan {
@@ -158,6 +175,10 @@ struct qede_dev {
 	struct qed_dev_eth_info dev_info;
 #define QEDE_MAX_RSS_CNT(edev)	((edev)->dev_info.num_queues)
 #define QEDE_MAX_TSS_CNT(edev)	((edev)->dev_info.num_queues)
+#define QEDE_IS_BB(edev) \
+	((edev)->dev_info.common.dev_type == QED_DEV_TYPE_BB)
+#define QEDE_IS_AH(edev) \
+	((edev)->dev_info.common.dev_type == QED_DEV_TYPE_AH)

 	struct qede_fastpath *fp_array;
 	u8 req_num_tx;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index 897953133245..4dcfe9614731 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -75,16 +75,33 @@ static const struct {
 	QEDE_TQSTAT(stopped_cnt),
 };

-#define QEDE_STAT_OFFSET(stat_name) (offsetof(struct qede_stats, stat_name))
-#define QEDE_STAT_STRING(stat_name) (#stat_name)
-#define _QEDE_STAT(stat_name, pf_only) \
-	 {QEDE_STAT_OFFSET(stat_name), QEDE_STAT_STRING(stat_name), pf_only}
-#define QEDE_PF_STAT(stat_name)	_QEDE_STAT(stat_name, true)
-#define QEDE_STAT(stat_name)	_QEDE_STAT(stat_name, false)
+#define QEDE_STAT_OFFSET(stat_name, type, base) \
+	(offsetof(type, stat_name) + (base))
+#define QEDE_STAT_STRING(stat_name)	(#stat_name)
+#define _QEDE_STAT(stat_name, type, base, attr) \
+	{QEDE_STAT_OFFSET(stat_name, type, base), \
+	 QEDE_STAT_STRING(stat_name), \
+	 attr}
+#define QEDE_STAT(stat_name) \
+	_QEDE_STAT(stat_name, struct qede_stats_common, 0, 0x0)
+#define QEDE_PF_STAT(stat_name) \
+	_QEDE_STAT(stat_name, struct qede_stats_common, 0, \
+		   BIT(QEDE_STAT_PF_ONLY))
+#define QEDE_PF_BB_STAT(stat_name) \
+	_QEDE_STAT(stat_name, struct qede_stats_bb, \
+		   offsetof(struct qede_stats, bb), \
+		   BIT(QEDE_STAT_PF_ONLY) | BIT(QEDE_STAT_BB_ONLY))
+#define QEDE_PF_AH_STAT(stat_name) \
+	_QEDE_STAT(stat_name, struct qede_stats_ah, \
+		   offsetof(struct qede_stats, ah), \
+		   BIT(QEDE_STAT_PF_ONLY) | BIT(QEDE_STAT_AH_ONLY))
 static const struct {
 	u64 offset;
 	char string[ETH_GSTRING_LEN];
-	bool pf_only;
+	unsigned long attr;
+#define QEDE_STAT_PF_ONLY	0
+#define QEDE_STAT_BB_ONLY	1
+#define QEDE_STAT_AH_ONLY	2
 } qede_stats_arr[] = {
 	QEDE_STAT(rx_ucast_bytes),
 	QEDE_STAT(rx_mcast_bytes),
@@ -106,22 +123,23 @@ static const struct {
 	QEDE_PF_STAT(rx_256_to_511_byte_packets),
 	QEDE_PF_STAT(rx_512_to_1023_byte_packets),
 	QEDE_PF_STAT(rx_1024_to_1518_byte_packets),
-	QEDE_PF_STAT(rx_1519_to_1522_byte_packets),
-	QEDE_PF_STAT(rx_1519_to_2047_byte_packets),
-	QEDE_PF_STAT(rx_2048_to_4095_byte_packets),
-	QEDE_PF_STAT(rx_4096_to_9216_byte_packets),
-	QEDE_PF_STAT(rx_9217_to_16383_byte_packets),
+	QEDE_PF_BB_STAT(rx_1519_to_1522_byte_packets),
+	QEDE_PF_BB_STAT(rx_1519_to_2047_byte_packets),
+	QEDE_PF_BB_STAT(rx_2048_to_4095_byte_packets),
+	QEDE_PF_BB_STAT(rx_4096_to_9216_byte_packets),
+	QEDE_PF_BB_STAT(rx_9217_to_16383_byte_packets),
+	QEDE_PF_AH_STAT(rx_1519_to_max_byte_packets),
 	QEDE_PF_STAT(tx_64_byte_packets),
 	QEDE_PF_STAT(tx_65_to_127_byte_packets),
 	QEDE_PF_STAT(tx_128_to_255_byte_packets),
 	QEDE_PF_STAT(tx_256_to_511_byte_packets),
 	QEDE_PF_STAT(tx_512_to_1023_byte_packets),
 	QEDE_PF_STAT(tx_1024_to_1518_byte_packets),
-	QEDE_PF_STAT(tx_1519_to_2047_byte_packets),
-	QEDE_PF_STAT(tx_2048_to_4095_byte_packets),
-	QEDE_PF_STAT(tx_4096_to_9216_byte_packets),
-	QEDE_PF_STAT(tx_9217_to_16383_byte_packets),
-
+	QEDE_PF_BB_STAT(tx_1519_to_2047_byte_packets),
+	QEDE_PF_BB_STAT(tx_2048_to_4095_byte_packets),
+	QEDE_PF_BB_STAT(tx_4096_to_9216_byte_packets),
+	QEDE_PF_BB_STAT(tx_9217_to_16383_byte_packets),
+	QEDE_PF_AH_STAT(tx_1519_to_max_byte_packets),
 	QEDE_PF_STAT(rx_mac_crtl_frames),
 	QEDE_PF_STAT(tx_mac_ctrl_frames),
 	QEDE_PF_STAT(rx_pause_frames),
@@ -136,8 +154,8 @@ static const struct {
 	QEDE_PF_STAT(rx_jabbers),
 	QEDE_PF_STAT(rx_undersize_packets),
 	QEDE_PF_STAT(rx_fragments),
-	QEDE_PF_STAT(tx_lpi_entry_count),
-	QEDE_PF_STAT(tx_total_collisions),
+	QEDE_PF_BB_STAT(tx_lpi_entry_count),
+	QEDE_PF_BB_STAT(tx_total_collisions),
 	QEDE_PF_STAT(brb_truncates),
 	QEDE_PF_STAT(brb_discards),
 	QEDE_STAT(no_buff_discards),
@@ -155,6 +173,12 @@ static const struct {
 };

 #define QEDE_NUM_STATS	ARRAY_SIZE(qede_stats_arr)
+#define QEDE_STAT_IS_PF_ONLY(i) \
+	test_bit(QEDE_STAT_PF_ONLY, &qede_stats_arr[i].attr)
+#define QEDE_STAT_IS_BB_ONLY(i) \
+	test_bit(QEDE_STAT_BB_ONLY, &qede_stats_arr[i].attr)
+#define QEDE_STAT_IS_AH_ONLY(i) \
+	test_bit(QEDE_STAT_AH_ONLY, &qede_stats_arr[i].attr)

 enum {
 	QEDE_PRI_FLAG_CMT,
@@ -213,6 +237,13 @@ static void qede_get_strings_stats_rxq(struct qede_dev *edev,
 	}
 }

+static bool qede_is_irrelevant_stat(struct qede_dev *edev, int stat_index)
+{
+	return (IS_VF(edev) && QEDE_STAT_IS_PF_ONLY(stat_index)) ||
+	       (QEDE_IS_BB(edev) && QEDE_STAT_IS_AH_ONLY(stat_index)) ||
+	       (QEDE_IS_AH(edev) && QEDE_STAT_IS_BB_ONLY(stat_index));
+}
+
 static void qede_get_strings_stats(struct qede_dev *edev, u8 *buf)
 {
 	struct qede_fastpath *fp;
@@ -234,7 +265,7 @@ static void qede_get_strings_stats(struct qede_dev *edev, u8 *buf)
 	/* Account for non-queue statistics */
 	for (i = 0; i < QEDE_NUM_STATS; i++) {
-		if (IS_VF(edev) && qede_stats_arr[i].pf_only)
+		if (qede_is_irrelevant_stat(edev, i))
 			continue;
 		strcpy(buf, qede_stats_arr[i].string);
 		buf += ETH_GSTRING_LEN;
@@ -309,7 +340,7 @@ static void qede_get_ethtool_stats(struct net_device *dev,
 	}

 	for (i = 0; i < QEDE_NUM_STATS; i++) {
-		if (IS_VF(edev) && qede_stats_arr[i].pf_only)
+		if (qede_is_irrelevant_stat(edev, i))
 			continue;
 		*buf = *((u64 *)(((void *)&edev->stats) +
 				 qede_stats_arr[i].offset));
@@ -323,17 +354,13 @@ static void qede_get_ethtool_stats(struct net_device *dev,
 static int qede_get_sset_count(struct net_device *dev, int stringset)
 {
 	struct qede_dev *edev = netdev_priv(dev);
-	int num_stats = QEDE_NUM_STATS;
+	int num_stats = QEDE_NUM_STATS, i;

 	switch (stringset) {
 	case ETH_SS_STATS:
-		if (IS_VF(edev)) {
-			int i;
-
-			for (i = 0; i < QEDE_NUM_STATS; i++)
-				if (qede_stats_arr[i].pf_only)
-					num_stats--;
-		}
+		for (i = 0; i < QEDE_NUM_STATS; i++)
+			if (qede_is_irrelevant_stat(edev, i))
+				num_stats--;

 		/* Account for the Regular Tx statistics */
 		num_stats += QEDE_TSS_COUNT(edev) * QEDE_NUM_TQSTATS;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 3a78c3f25157..abd99109e532 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -84,6 +84,8 @@ static const struct qed_eth_ops *qed_ops;
 #define CHIP_NUM_57980S_50		0x1654
 #define CHIP_NUM_57980S_25		0x1656
 #define CHIP_NUM_57980S_IOV		0x1664
+#define CHIP_NUM_AH			0x8070
+#define CHIP_NUM_AH_IOV			0x8090

 #ifndef PCI_DEVICE_ID_NX2_57980E
@@ -93,6 +95,9 @@ static const struct qed_eth_ops *qed_ops;
 #define PCI_DEVICE_ID_57980S_50		CHIP_NUM_57980S_50
 #define PCI_DEVICE_ID_57980S_25		CHIP_NUM_57980S_25
 #define PCI_DEVICE_ID_57980S_IOV	CHIP_NUM_57980S_IOV
+#define PCI_DEVICE_ID_AH		CHIP_NUM_AH
+#define PCI_DEVICE_ID_AH_IOV		CHIP_NUM_AH_IOV
+
 #endif

 enum qede_pci_private {
@@ -109,6 +114,10 @@ static const struct pci_device_id qede_pci_tbl[] = {
 	{PCI_VDEVICE(QLOGIC, PCI_DEVICE_ID_57980S_25), QEDE_PRIVATE_PF},
 #ifdef CONFIG_QED_SRIOV
 	{PCI_VDEVICE(QLOGIC, PCI_DEVICE_ID_57980S_IOV), QEDE_PRIVATE_VF},
+#endif
+	{PCI_VDEVICE(QLOGIC, PCI_DEVICE_ID_AH), QEDE_PRIVATE_PF},
+#ifdef CONFIG_QED_SRIOV
+	{PCI_VDEVICE(QLOGIC, PCI_DEVICE_ID_AH_IOV), QEDE_PRIVATE_VF},
 #endif
 	{ 0 }
 };

@@ -314,122 +323,135 @@ static int qede_close(struct net_device *ndev);

 void qede_fill_by_demand_stats(struct qede_dev *edev)
 {
+	struct qede_stats_common *p_common = &edev->stats.common;
 	struct qed_eth_stats stats;

 	edev->ops->get_vport_stats(edev->cdev, &stats);
-	edev->stats.no_buff_discards = stats.no_buff_discards;
-	edev->stats.packet_too_big_discard = stats.packet_too_big_discard;
-	edev->stats.ttl0_discard = stats.ttl0_discard;
-	edev->stats.rx_ucast_bytes = stats.rx_ucast_bytes;
-	edev->stats.rx_mcast_bytes = stats.rx_mcast_bytes;
-	edev->stats.rx_bcast_bytes = stats.rx_bcast_bytes;
-	edev->stats.rx_ucast_pkts = stats.rx_ucast_pkts;
-	edev->stats.rx_mcast_pkts = stats.rx_mcast_pkts;
-	edev->stats.rx_bcast_pkts = stats.rx_bcast_pkts;
-	edev->stats.mftag_filter_discards = stats.mftag_filter_discards;
-	edev->stats.mac_filter_discards = stats.mac_filter_discards;
-
-	edev->stats.tx_ucast_bytes = stats.tx_ucast_bytes;
-	edev->stats.tx_mcast_bytes = stats.tx_mcast_bytes;
-	edev->stats.tx_bcast_bytes = stats.tx_bcast_bytes;
-	edev->stats.tx_ucast_pkts = stats.tx_ucast_pkts;
-	edev->stats.tx_mcast_pkts = stats.tx_mcast_pkts;
-	edev->stats.tx_bcast_pkts = stats.tx_bcast_pkts;
-	edev->stats.tx_err_drop_pkts = stats.tx_err_drop_pkts;
-	edev->stats.coalesced_pkts = stats.tpa_coalesced_pkts;
-	edev->stats.coalesced_events = stats.tpa_coalesced_events;
-	edev->stats.coalesced_aborts_num = stats.tpa_aborts_num;
-	edev->stats.non_coalesced_pkts = stats.tpa_not_coalesced_pkts;
-	edev->stats.coalesced_bytes = stats.tpa_coalesced_bytes;
-
-	edev->stats.rx_64_byte_packets = stats.rx_64_byte_packets;
-	edev->stats.rx_65_to_127_byte_packets = stats.rx_65_to_127_byte_packets;
-	edev->stats.rx_128_to_255_byte_packets =
-				stats.rx_128_to_255_byte_packets;
-	edev->stats.rx_256_to_511_byte_packets =
-				stats.rx_256_to_511_byte_packets;
-	edev->stats.rx_512_to_1023_byte_packets =
-				stats.rx_512_to_1023_byte_packets;
-	edev->stats.rx_1024_to_1518_byte_packets =
-				stats.rx_1024_to_1518_byte_packets;
-	edev->stats.rx_1519_to_1522_byte_packets =
-				stats.rx_1519_to_1522_byte_packets;
-	edev->stats.rx_1519_to_2047_byte_packets =
-				stats.rx_1519_to_2047_byte_packets;
-	edev->stats.rx_2048_to_4095_byte_packets =
-				stats.rx_2048_to_4095_byte_packets;
-	edev->stats.rx_4096_to_9216_byte_packets =
-				stats.rx_4096_to_9216_byte_packets;
-	edev->stats.rx_9217_to_16383_byte_packets =
-				stats.rx_9217_to_16383_byte_packets;
-	edev->stats.rx_crc_errors = stats.rx_crc_errors;
-	edev->stats.rx_mac_crtl_frames = stats.rx_mac_crtl_frames;
-	edev->stats.rx_pause_frames = stats.rx_pause_frames;
-	edev->stats.rx_pfc_frames = stats.rx_pfc_frames;
-	edev->stats.rx_align_errors = stats.rx_align_errors;
-	edev->stats.rx_carrier_errors = stats.rx_carrier_errors;
-	edev->stats.rx_oversize_packets = stats.rx_oversize_packets;
-	edev->stats.rx_jabbers = stats.rx_jabbers;
-	edev->stats.rx_undersize_packets = stats.rx_undersize_packets;
-	edev->stats.rx_fragments = stats.rx_fragments;
-	edev->stats.tx_64_byte_packets = stats.tx_64_byte_packets;
-	edev->stats.tx_65_to_127_byte_packets = stats.tx_65_to_127_byte_packets;
-	edev->stats.tx_128_to_255_byte_packets =
-				stats.tx_128_to_255_byte_packets;
-	edev->stats.tx_256_to_511_byte_packets =
-				stats.tx_256_to_511_byte_packets;
-	edev->stats.tx_512_to_1023_byte_packets =
-				stats.tx_512_to_1023_byte_packets;
-	edev->stats.tx_1024_to_1518_byte_packets =
-				stats.tx_1024_to_1518_byte_packets;
-	edev->stats.tx_1519_to_2047_byte_packets =
-				stats.tx_1519_to_2047_byte_packets;
-	edev->stats.tx_2048_to_4095_byte_packets =
-				stats.tx_2048_to_4095_byte_packets;
-	edev->stats.tx_4096_to_9216_byte_packets =
-				stats.tx_4096_to_9216_byte_packets;
-	edev->stats.tx_9217_to_16383_byte_packets =
-				stats.tx_9217_to_16383_byte_packets;
-	edev->stats.tx_pause_frames = stats.tx_pause_frames;
-	edev->stats.tx_pfc_frames = stats.tx_pfc_frames;
-	edev->stats.tx_lpi_entry_count = stats.tx_lpi_entry_count;
-	edev->stats.tx_total_collisions = stats.tx_total_collisions;
-	edev->stats.brb_truncates = stats.brb_truncates;
-	edev->stats.brb_discards = stats.brb_discards;
-	edev->stats.tx_mac_ctrl_frames = stats.tx_mac_ctrl_frames;
+
+	p_common->no_buff_discards = stats.common.no_buff_discards;
+	p_common->packet_too_big_discard = stats.common.packet_too_big_discard;
+	p_common->ttl0_discard = stats.common.ttl0_discard;
+	p_common->rx_ucast_bytes = stats.common.rx_ucast_bytes;
+	p_common->rx_mcast_bytes = stats.common.rx_mcast_bytes;
+	p_common->rx_bcast_bytes = stats.common.rx_bcast_bytes;
+	p_common->rx_ucast_pkts = stats.common.rx_ucast_pkts;
+	p_common->rx_mcast_pkts = stats.common.rx_mcast_pkts;
+	p_common->rx_bcast_pkts = stats.common.rx_bcast_pkts;
+	p_common->mftag_filter_discards = stats.common.mftag_filter_discards;
+	p_common->mac_filter_discards = stats.common.mac_filter_discards;
+
+	p_common->tx_ucast_bytes = stats.common.tx_ucast_bytes;
+	p_common->tx_mcast_bytes = stats.common.tx_mcast_bytes;
+	p_common->tx_bcast_bytes = stats.common.tx_bcast_bytes;
+	p_common->tx_ucast_pkts = stats.common.tx_ucast_pkts;
+	p_common->tx_mcast_pkts = stats.common.tx_mcast_pkts;
+	p_common->tx_bcast_pkts = stats.common.tx_bcast_pkts;
+	p_common->tx_err_drop_pkts = stats.common.tx_err_drop_pkts;
+	p_common->coalesced_pkts = stats.common.tpa_coalesced_pkts;
+	p_common->coalesced_events = stats.common.tpa_coalesced_events;
+	p_common->coalesced_aborts_num = stats.common.tpa_aborts_num;
+	p_common->non_coalesced_pkts = stats.common.tpa_not_coalesced_pkts;
+	p_common->coalesced_bytes = stats.common.tpa_coalesced_bytes;
+
+	p_common->rx_64_byte_packets = stats.common.rx_64_byte_packets;
+	p_common->rx_65_to_127_byte_packets =
+	    stats.common.rx_65_to_127_byte_packets;
+	p_common->rx_128_to_255_byte_packets =
+	    stats.common.rx_128_to_255_byte_packets;
+	p_common->rx_256_to_511_byte_packets =
+	    stats.common.rx_256_to_511_byte_packets;
+	p_common->rx_512_to_1023_byte_packets =
+	    stats.common.rx_512_to_1023_byte_packets;
+	p_common->rx_1024_to_1518_byte_packets =
+	    stats.common.rx_1024_to_1518_byte_packets;
+	p_common->rx_crc_errors = stats.common.rx_crc_errors;
+	p_common->rx_mac_crtl_frames = stats.common.rx_mac_crtl_frames;
+	p_common->rx_pause_frames = stats.common.rx_pause_frames;
+	p_common->rx_pfc_frames = stats.common.rx_pfc_frames;
+	p_common->rx_align_errors = stats.common.rx_align_errors;
+	p_common->rx_carrier_errors = stats.common.rx_carrier_errors;
+	p_common->rx_oversize_packets = stats.common.rx_oversize_packets;
+	p_common->rx_jabbers = stats.common.rx_jabbers;
+	p_common->rx_undersize_packets = stats.common.rx_undersize_packets;
+	p_common->rx_fragments = stats.common.rx_fragments;
+	p_common->tx_64_byte_packets = stats.common.tx_64_byte_packets;
+	p_common->tx_65_to_127_byte_packets =
+	    stats.common.tx_65_to_127_byte_packets;
+	p_common->tx_128_to_255_byte_packets =
+	    stats.common.tx_128_to_255_byte_packets;
+	p_common->tx_256_to_511_byte_packets =
+	    stats.common.tx_256_to_511_byte_packets;
+	p_common->tx_512_to_1023_byte_packets =
+	    stats.common.tx_512_to_1023_byte_packets;
+	p_common->tx_1024_to_1518_byte_packets =
+	    stats.common.tx_1024_to_1518_byte_packets;
+	p_common->tx_pause_frames = stats.common.tx_pause_frames;
+	p_common->tx_pfc_frames = stats.common.tx_pfc_frames;
+	p_common->brb_truncates = stats.common.brb_truncates;
+	p_common->brb_discards = stats.common.brb_discards;
+	p_common->tx_mac_ctrl_frames = stats.common.tx_mac_ctrl_frames;
+
+	if (QEDE_IS_BB(edev)) {
+		struct qede_stats_bb *p_bb = &edev->stats.bb;
+
+		p_bb->rx_1519_to_1522_byte_packets =
+		    stats.bb.rx_1519_to_1522_byte_packets;
+		p_bb->rx_1519_to_2047_byte_packets =
+		    stats.bb.rx_1519_to_2047_byte_packets;
+		p_bb->rx_2048_to_4095_byte_packets =
+		    stats.bb.rx_2048_to_4095_byte_packets;
+		p_bb->rx_4096_to_9216_byte_packets =
+		    stats.bb.rx_4096_to_9216_byte_packets;
+		p_bb->rx_9217_to_16383_byte_packets =
+		    stats.bb.rx_9217_to_16383_byte_packets;
+		p_bb->tx_1519_to_2047_byte_packets =
+		    stats.bb.tx_1519_to_2047_byte_packets;
+		p_bb->tx_2048_to_4095_byte_packets =
+		    stats.bb.tx_2048_to_4095_byte_packets;
+		p_bb->tx_4096_to_9216_byte_packets =
+		    stats.bb.tx_4096_to_9216_byte_packets;
+		p_bb->tx_9217_to_16383_byte_packets =
+		    stats.bb.tx_9217_to_16383_byte_packets;
+		p_bb->tx_lpi_entry_count = stats.bb.tx_lpi_entry_count;
+		p_bb->tx_total_collisions = stats.bb.tx_total_collisions;
+	} else {
+		struct qede_stats_ah *p_ah = &edev->stats.ah;

+		p_ah->rx_1519_to_max_byte_packets =
+		    stats.ah.rx_1519_to_max_byte_packets;
+		p_ah->tx_1519_to_max_byte_packets =
+		    stats.ah.tx_1519_to_max_byte_packets;
+	}
 }

 static void qede_get_stats64(struct net_device *dev,
 			     struct rtnl_link_stats64 *stats)
 {
 	struct qede_dev *edev = netdev_priv(dev);
+	struct qede_stats_common *p_common;

 	qede_fill_by_demand_stats(edev);
+	p_common = &edev->stats.common;

-	stats->rx_packets = edev->stats.rx_ucast_pkts +
-			    edev->stats.rx_mcast_pkts +
-			    edev->stats.rx_bcast_pkts;
-	stats->tx_packets = edev->stats.tx_ucast_pkts +
-			    edev->stats.tx_mcast_pkts +
-			    edev->stats.tx_bcast_pkts;
-
-	stats->rx_bytes = edev->stats.rx_ucast_bytes +
-			  edev->stats.rx_mcast_bytes +
-			  edev->stats.rx_bcast_bytes;
+	stats->rx_packets = p_common->rx_ucast_pkts + p_common->rx_mcast_pkts +
+			    p_common->rx_bcast_pkts;
+	stats->tx_packets = p_common->tx_ucast_pkts + p_common->tx_mcast_pkts +
+			    p_common->tx_bcast_pkts;

-	stats->tx_bytes = edev->stats.tx_ucast_bytes +
-			  edev->stats.tx_mcast_bytes +
-			  edev->stats.tx_bcast_bytes;
+	stats->rx_bytes = p_common->rx_ucast_bytes + p_common->rx_mcast_bytes +
+			  p_common->rx_bcast_bytes;
+	stats->tx_bytes = p_common->tx_ucast_bytes + p_common->tx_mcast_bytes +
+			  p_common->tx_bcast_bytes;

-	stats->tx_errors = edev->stats.tx_err_drop_pkts;
-	stats->multicast = edev->stats.rx_mcast_pkts +
-			   edev->stats.rx_bcast_pkts;
+	stats->tx_errors = p_common->tx_err_drop_pkts;
+	stats->multicast = p_common->rx_mcast_pkts + p_common->rx_bcast_pkts;

-	stats->rx_fifo_errors = edev->stats.no_buff_discards;
+	stats->rx_fifo_errors = p_common->no_buff_discards;

-	stats->collisions = edev->stats.tx_total_collisions;
-	stats->rx_crc_errors = edev->stats.rx_crc_errors;
-	stats->rx_frame_errors = edev->stats.rx_align_errors;
+	if (QEDE_IS_BB(edev))
+		stats->collisions = edev->stats.bb.tx_total_collisions;
+	stats->rx_crc_errors = p_common->rx_crc_errors;
+	stats->rx_frame_errors = p_common->rx_align_errors;
 }

 #ifdef CONFIG_QED_SRIOV
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index fde56c436f71..8e0065c52857 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -300,6 +300,11 @@ struct qed_sb_info {
 	struct qed_dev *cdev;
 };

+enum qed_dev_type {
+	QED_DEV_TYPE_BB,
+	QED_DEV_TYPE_AH,
+};
+
 struct qed_dev_info {
 	unsigned long pci_mem_start;
 	unsigned long pci_mem_end;
@@ -325,6 +330,8 @@ struct qed_dev_info {
 	u16 mtu;

 	bool wol_support;
+
+	enum qed_dev_type dev_type;
 };

 enum qed_sb_type {
@@ -752,7 +759,7 @@ enum qed_mf_mode {
 	QED_MF_NPAR,
 };

-struct qed_eth_stats {
+struct qed_eth_stats_common {
 	u64 no_buff_discards;
 	u64 packet_too_big_discard;
 	u64 ttl0_discard;
@@ -784,11 +791,6 @@ struct qed_eth_stats {
 	u64 rx_256_to_511_byte_packets;
 	u64 rx_512_to_1023_byte_packets;
 	u64 rx_1024_to_1518_byte_packets;
-	u64 rx_1519_to_1522_byte_packets;
-	u64 rx_1519_to_2047_byte_packets;
-	u64 rx_2048_to_4095_byte_packets;
-	u64 rx_4096_to_9216_byte_packets;
-	u64 rx_9217_to_16383_byte_packets;
 	u64 rx_crc_errors;
 	u64 rx_mac_crtl_frames;
 	u64 rx_pause_frames;
@@ -805,14 +807,8 @@ struct qed_eth_stats {
 	u64 tx_256_to_511_byte_packets;
 	u64 tx_512_to_1023_byte_packets;
 	u64 tx_1024_to_1518_byte_packets;
-	u64 tx_1519_to_2047_byte_packets;
-	u64 tx_2048_to_4095_byte_packets;
-	u64 tx_4096_to_9216_byte_packets;
-	u64 tx_9217_to_16383_byte_packets;
 	u64 tx_pause_frames;
 	u64 tx_pfc_frames;
-	u64 tx_lpi_entry_count;
-	u64 tx_total_collisions;
 	u64 brb_truncates;
 	u64 brb_discards;
 	u64 rx_mac_bytes;
@@ -827,6 +823,34 @@ struct qed_eth_stats {
 	u64 tx_mac_ctrl_frames;
 };

+struct qed_eth_stats_bb {
+	u64 rx_1519_to_1522_byte_packets;
+	u64 rx_1519_to_2047_byte_packets;
+	u64 rx_2048_to_4095_byte_packets;
+	u64 rx_4096_to_9216_byte_packets;
+	u64 rx_9217_to_16383_byte_packets;
+	u64 tx_1519_to_2047_byte_packets;
+	u64 tx_2048_to_4095_byte_packets;
+	u64 tx_4096_to_9216_byte_packets;
+	u64 tx_9217_to_16383_byte_packets;
+	u64 tx_lpi_entry_count;
+	u64 tx_total_collisions;
+};
+
+struct qed_eth_stats_ah {
+	u64 rx_1519_to_max_byte_packets;
+	u64 tx_1519_to_max_byte_packets;
+};
+
+struct qed_eth_stats {
+	struct qed_eth_stats_common common;
+
+	union {
+		struct qed_eth_stats_bb bb;
+		struct qed_eth_stats_ah ah;
+	};
+};
+
 #define QED_SB_IDX      0x0002

 #define RX_PI           0
diff --git a/include/linux/qed/rdma_common.h b/include/linux/qed/rdma_common.h
index f773aa5e746f..72c770f9f666 100644
--- a/include/linux/qed/rdma_common.h
+++ b/include/linux/qed/rdma_common.h
@@ -52,7 +52,8 @@
 #define RDMA_MAX_PDS	(64 * 1024)

 #define RDMA_NUM_STATISTIC_COUNTERS	MAX_NUM_VPORTS
-#define RDMA_NUM_STATISTIC_COUNTERS_BB	MAX_NUM_VPORTS_BB
+#define RDMA_NUM_STATISTIC_COUNTERS_K2	MAX_NUM_VPORTS_K2
+#define RDMA_NUM_STATISTIC_COUNTERS_BB	MAX_NUM_VPORTS_BB

 #define RDMA_TASK_TYPE (PROTOCOLID_ROCE)
-- cgit v1.2.3

From a26356ab9392e0c5f8ad87d76c42e7c58c036d24 Mon Sep 17 00:00:00 2001
From: Thomas Petazzoni
Date: Tue, 28 Feb 2017 15:31:16 +0100
Subject: of/pci: Remove unused MSI controller helpers

All users of the small MSI controller API have been migrated to use the
generic MSI infrastructure instead. We no longer need a global chained
list of msi_controller. Instead, MSI controllers are now represented as
IRQ domains attached to OF nodes, and the resolution between a device
requesting an MSI and the corresponding MSI controller is done by the
generic interrupt resolution logic.

Therefore, this API is now completely useless, and can be removed from
the kernel.
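As an illustration of the generic replacement (the helper name and the
trimmed error handling below are hypothetical, not part of this patch),
a host bridge driver now resolves its MSI domain through the irqdomain
infrastructure rather than through a private list:

	#include <linux/irqdomain.h>
	#include <linux/of.h>

	/* Hypothetical sketch: look up the MSI irqdomain via "msi-parent". */
	static struct irq_domain *foo_pci_msi_domain(struct device_node *np)
	{
		struct device_node *msi_np = of_parse_phandle(np, "msi-parent", 0);
		struct irq_domain *d;

		if (!msi_np)
			return NULL;

		/* Resolution is performed by the generic interrupt logic. */
		d = irq_find_matching_host(msi_np, DOMAIN_BUS_PCI_MSI);
		of_node_put(msi_np);
		return d;
	}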
Signed-off-by: Thomas Petazzoni
Signed-off-by: Bjorn Helgaas
Acked-by: Marc Zyngier
Acked-by: Rob Herring
---
 drivers/of/of_pci.c    | 45 ---------------------------------------------
 include/linux/of_pci.h | 11 -----------
 2 files changed, 56 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/of/of_pci.c b/drivers/of/of_pci.c
index 0ee42c3e66a1..c9d4d3a7b0fe 100644
--- a/drivers/of/of_pci.c
+++ b/drivers/of/of_pci.c
@@ -285,51 +285,6 @@ parse_failed:
 EXPORT_SYMBOL_GPL(of_pci_get_host_bridge_resources);
 #endif /* CONFIG_OF_ADDRESS */

-#ifdef CONFIG_PCI_MSI
-
-static LIST_HEAD(of_pci_msi_chip_list);
-static DEFINE_MUTEX(of_pci_msi_chip_mutex);
-
-int of_pci_msi_chip_add(struct msi_controller *chip)
-{
-	if (!of_property_read_bool(chip->of_node, "msi-controller"))
-		return -EINVAL;
-
-	mutex_lock(&of_pci_msi_chip_mutex);
-	list_add(&chip->list, &of_pci_msi_chip_list);
-	mutex_unlock(&of_pci_msi_chip_mutex);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(of_pci_msi_chip_add);
-
-void of_pci_msi_chip_remove(struct msi_controller *chip)
-{
-	mutex_lock(&of_pci_msi_chip_mutex);
-	list_del(&chip->list);
-	mutex_unlock(&of_pci_msi_chip_mutex);
-}
-EXPORT_SYMBOL_GPL(of_pci_msi_chip_remove);
-
-struct msi_controller *of_pci_find_msi_chip_by_node(struct device_node *of_node)
-{
-	struct msi_controller *c;
-
-	mutex_lock(&of_pci_msi_chip_mutex);
-	list_for_each_entry(c, &of_pci_msi_chip_list, list) {
-		if (c->of_node == of_node) {
-			mutex_unlock(&of_pci_msi_chip_mutex);
-			return c;
-		}
-	}
-	mutex_unlock(&of_pci_msi_chip_mutex);
-
-	return NULL;
-}
-EXPORT_SYMBOL_GPL(of_pci_find_msi_chip_by_node);
-
-#endif /* CONFIG_PCI_MSI */
-
 /**
  * of_pci_map_rid - Translate a requester ID through a downstream mapping.
  * @np: root complex device node.
diff --git a/include/linux/of_pci.h b/include/linux/of_pci.h
index 0e0974eceb80..518c8d20647a 100644
--- a/include/linux/of_pci.h
+++ b/include/linux/of_pci.h
@@ -85,15 +85,4 @@ static inline int of_pci_get_host_bridge_resources(struct device_node *dev,
 }
 #endif

-#if defined(CONFIG_OF) && defined(CONFIG_PCI_MSI)
-int of_pci_msi_chip_add(struct msi_controller *chip);
-void of_pci_msi_chip_remove(struct msi_controller *chip);
-struct msi_controller *of_pci_find_msi_chip_by_node(struct device_node *of_node);
-#else
-static inline int of_pci_msi_chip_add(struct msi_controller *chip) { return -EINVAL; }
-static inline void of_pci_msi_chip_remove(struct msi_controller *chip) { }
-static inline struct msi_controller *
-of_pci_find_msi_chip_by_node(struct device_node *of_node) { return NULL; }
-#endif
-
 #endif
-- cgit v1.2.3

From 85c73d50e57eb8ad43955fe38714bc5fba1acd92 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Thu, 2 Mar 2017 15:48:05 +0200
Subject: gpio: acpi: Add managed variant of acpi_dev_add_driver_gpios()

Introduce a device managed variant of acpi_dev_add_driver_gpios() and
its counterpart acpi_dev_remove_driver_gpios(). These functions are in
most cases used in a driver's ->probe() and ->remove() callbacks, which
is why it is useful to have managed variants of them.

Signed-off-by: Andy Shevchenko
Acked-by: Rafael J. Wysocki
Signed-off-by: Linus Walleij
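A usage sketch of the managed variant (the driver, GPIO name and mapping
table here are hypothetical, not taken from this patch); the mapping is
dropped automatically on driver detach, so no cleanup call is needed in
->remove():

	#include <linux/acpi.h>

	static const struct acpi_gpio_params reset_gpio = { 0, 0, false };

	static const struct acpi_gpio_mapping foo_acpi_gpios[] = {
		{ "reset-gpios", &reset_gpio, 1 },
		{ }
	};

	static int foo_probe(struct device *dev)	/* hypothetical */
	{
		/* Released via devres when the driver is unbound. */
		return devm_acpi_dev_add_driver_gpios(dev, foo_acpi_gpios);
	}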
---
 drivers/gpio/gpiolib-acpi.c | 31 +++++++++++++++++++++++++++++++
 include/linux/acpi.h        | 11 +++++++++++
 2 files changed, 42 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c
index 9b37a3692b3f..108331de440b 100644
--- a/drivers/gpio/gpiolib-acpi.c
+++ b/drivers/gpio/gpiolib-acpi.c
@@ -362,6 +362,37 @@ int acpi_dev_add_driver_gpios(struct acpi_device *adev,
 }
 EXPORT_SYMBOL_GPL(acpi_dev_add_driver_gpios);

+static void devm_acpi_dev_release_driver_gpios(struct device *dev, void *res)
+{
+	acpi_dev_remove_driver_gpios(ACPI_COMPANION(dev));
+}
+
+int devm_acpi_dev_add_driver_gpios(struct device *dev,
+				   const struct acpi_gpio_mapping *gpios)
+{
+	void *res;
+	int ret;
+
+	res = devres_alloc(devm_acpi_dev_release_driver_gpios, 0, GFP_KERNEL);
+	if (!res)
+		return -ENOMEM;
+
+	ret = acpi_dev_add_driver_gpios(ACPI_COMPANION(dev), gpios);
+	if (ret) {
+		devres_free(res);
+		return ret;
+	}
+	devres_add(dev, res);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devm_acpi_dev_add_driver_gpios);
+
+void devm_acpi_dev_remove_driver_gpios(struct device *dev)
+{
+	WARN_ON(devres_release(dev, devm_acpi_dev_release_driver_gpios, NULL, NULL));
+}
+EXPORT_SYMBOL_GPL(devm_acpi_dev_remove_driver_gpios);
+
 static bool acpi_get_driver_gpio_data(struct acpi_device *adev,
 				      const char *name, int index,
 				      struct acpi_reference_args *args)
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 673acda012af..c8eaaad4a9ed 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -952,6 +952,10 @@ static inline void acpi_dev_remove_driver_gpios(struct acpi_device *adev)
 		adev->driver_gpios = NULL;
 }

+int devm_acpi_dev_add_driver_gpios(struct device *dev,
+				   const struct acpi_gpio_mapping *gpios);
+void devm_acpi_dev_remove_driver_gpios(struct device *dev);
+
 int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index);
 #else
 static inline int acpi_dev_add_driver_gpios(struct acpi_device *adev,
@@ -961,6 +965,13 @@ static inline int acpi_dev_add_driver_gpios(struct acpi_device *adev,
 }
 static inline void acpi_dev_remove_driver_gpios(struct acpi_device *adev) {}

+static inline int devm_acpi_dev_add_driver_gpios(struct device *dev,
+			      const struct acpi_gpio_mapping *gpios)
+{
+	return -ENXIO;
+}
+static inline void devm_acpi_dev_remove_driver_gpios(struct device *dev) {}
+
 static inline int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index)
 {
 	return -ENXIO;
-- cgit v1.2.3

From 22c403676dbbb7c6f186099527af7f065498ef45 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov
Date: Sun, 12 Feb 2017 17:13:55 -0800
Subject: gpio: return NULL from gpiod_get_optional when GPIOLIB is disabled

Given the intent behind gpiod_get_optional() and friends, it does not
make sense to return -ENOSYS when GPIOLIB is disabled: the driver is
expected to work just fine without the gpio, so let's behave as if the
gpio was not found. Otherwise we have to special-case -ENOSYS in
drivers.

Note that there was an objection that someone might forget to enable
GPIOLIB when dealing with a platform that has a device that actually
specifies an optional gpio, and we'll break it. I find this
unconvincing, as that would have to be the *only GPIO* in the system,
which is extremely unlikely.
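A consumer-side sketch of what this buys (the driver and GPIO names are
hypothetical): the same probe path now works whether or not GPIOLIB is
enabled, with NULL simply meaning "no such GPIO":

	#include <linux/gpio/consumer.h>

	static int foo_probe(struct device *dev)	/* hypothetical */
	{
		struct gpio_desc *reset;

		reset = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW);
		if (IS_ERR(reset))
			return PTR_ERR(reset);	/* only real errors remain */

		if (reset)	/* NULL: no reset line wired up, carry on */
			gpiod_set_value_cansleep(reset, 1);

		return 0;
	}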
Signed-off-by: Dmitry Torokhov
Signed-off-by: Linus Walleij
---
 Documentation/gpio/consumer.txt |  6 ++++++
 include/linux/gpio/consumer.h   | 12 ++++++------
 2 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/gpio/consumer.txt b/Documentation/gpio/consumer.txt
index 05676fdacfe3..912568baabb9 100644
--- a/Documentation/gpio/consumer.txt
+++ b/Documentation/gpio/consumer.txt
@@ -70,6 +70,12 @@ instead of -ENOENT if no GPIO has been assigned to the requested function:
 						 unsigned int index,
 						 enum gpiod_flags flags)

+Note that gpio_get*_optional() functions (and their managed variants), unlike
+the rest of gpiolib API, also return NULL when gpiolib support is disabled.
+This is helpful to driver authors, since they do not need to special case
+-ENOSYS return codes. System integrators should however be careful to enable
+gpiolib on systems that need it.
+
 For a function using multiple GPIOs all of those can be obtained with one call:

 	struct gpio_descs *gpiod_get_array(struct device *dev,
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h
index 2484b2fcc6eb..13daf08e25bb 100644
--- a/include/linux/gpio/consumer.h
+++ b/include/linux/gpio/consumer.h
@@ -179,14 +179,14 @@ static inline struct gpio_desc *__must_check
 gpiod_get_optional(struct device *dev, const char *con_id,
 		   enum gpiod_flags flags)
 {
-	return ERR_PTR(-ENOSYS);
+	return NULL;
 }

 static inline struct gpio_desc *__must_check
 gpiod_get_index_optional(struct device *dev, const char *con_id,
 			 unsigned int index, enum gpiod_flags flags)
 {
-	return ERR_PTR(-ENOSYS);
+	return NULL;
 }

 static inline struct gpio_descs *__must_check
@@ -200,7 +200,7 @@ static inline struct gpio_descs *__must_check
 gpiod_get_array_optional(struct device *dev, const char *con_id,
 			 enum gpiod_flags flags)
 {
-	return ERR_PTR(-ENOSYS);
+	return NULL;
 }

 static inline void gpiod_put(struct gpio_desc *desc)
@@ -240,14 +240,14 @@ static inline struct gpio_desc *__must_check
 devm_gpiod_get_optional(struct device *dev, const char *con_id,
 			enum gpiod_flags flags)
 {
-	return ERR_PTR(-ENOSYS);
+	return NULL;
 }

 static inline struct gpio_desc *__must_check
 devm_gpiod_get_index_optional(struct device *dev, const char *con_id,
 			      unsigned int index, enum gpiod_flags flags)
 {
-	return ERR_PTR(-ENOSYS);
+	return NULL;
 }

 static inline struct gpio_descs *__must_check
@@ -261,7 +261,7 @@ static inline struct gpio_descs *__must_check
 devm_gpiod_get_array_optional(struct device *dev, const char *con_id,
 			      enum gpiod_flags flags)
 {
-	return ERR_PTR(-ENOSYS);
+	return NULL;
 }

 static inline void devm_gpiod_put(struct device *dev, struct gpio_desc *desc)
-- cgit v1.2.3

From 1d585e70905e03e8c19c9aaf523ec246ae6b18a1 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao"
Date: Wed, 8 Mar 2017 13:56:06 +0530
Subject: trace/kprobes: Fix check for kretprobe offset within function entry

perf specifies an offset from _text, and since this offset is fed
directly into the arch-specific helper, the kprobes tracer rejects
installation of kretprobes through perf. Fix this by looking up the
actual offset from a function for the specified sym+offset.

Refactor and reuse existing routines to limit code duplication -- we
repurpose kprobe_addr() for determining the final kprobe address, and
we split out the function entry offset determination into a separate
generic helper.
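The new helper's intended call shape, sketched (it mirrors the check
that register_kretprobe() itself now performs; the kretprobe instance
and wrapper below are hypothetical):

	static struct kretprobe my_krp = {
		.kp.symbol_name	= "do_open",
		/* .handler and friends omitted for brevity */
	};

	static int my_register(void)		/* hypothetical */
	{
		/* Resolve sym+offset exactly as registration would, then
		 * ask the architecture whether that offset still lands
		 * within the function entry.
		 */
		if (!function_offset_within_entry(my_krp.kp.addr,
						  my_krp.kp.symbol_name,
						  my_krp.kp.offset))
			return -EINVAL;

		return register_kretprobe(&my_krp);
	}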
Before patch:

  naveen@ubuntu:~/linux/tools/perf$ sudo ./perf probe -v do_open%return
  probe-definition(0): do_open%return
  symbol:do_open file:(null) line:0 offset:0 return:1 lazy:(null)
  0 arguments
  Looking at the vmlinux_path (8 entries long)
  Using /boot/vmlinux for symbols
  Open Debuginfo file: /boot/vmlinux
  Try to find probe point from debuginfo.
  Matched function: do_open [2d0c7ff]
  Probe point found: do_open+0
  Matched function: do_open [35d76dc]
  found inline addr: 0xc0000000004ba9c4
  Failed to find "do_open%return", because do_open is an inlined function
  and has no return point.
  An error occurred in debuginfo analysis (-22).
  Trying to use symbols.
  Opening /sys/kernel/debug/tracing//README write=0
  Opening /sys/kernel/debug/tracing//kprobe_events write=1
  Writing event: r:probe/do_open _text+4469776
  Failed to write event: Invalid argument
    Error: Failed to add events. Reason: Invalid argument (Code: -22)
  naveen@ubuntu:~/linux/tools/perf$ dmesg | tail
  [   33.568656] Given offset is not valid for return probe.

After patch:

  naveen@ubuntu:~/linux/tools/perf$ sudo ./perf probe -v do_open%return
  probe-definition(0): do_open%return
  symbol:do_open file:(null) line:0 offset:0 return:1 lazy:(null)
  0 arguments
  Looking at the vmlinux_path (8 entries long)
  Using /boot/vmlinux for symbols
  Open Debuginfo file: /boot/vmlinux
  Try to find probe point from debuginfo.
  Matched function: do_open [2d0c7d6]
  Probe point found: do_open+0
  Matched function: do_open [35d76b3]
  found inline addr: 0xc0000000004ba9e4
  Failed to find "do_open%return", because do_open is an inlined function
  and has no return point.
  An error occurred in debuginfo analysis (-22).
  Trying to use symbols.
  Opening /sys/kernel/debug/tracing//README write=0
  Opening /sys/kernel/debug/tracing//kprobe_events write=1
  Writing event: r:probe/do_open _text+4469808
  Writing event: r:probe/do_open_1 _text+4956344
  Added new events:
    probe:do_open        (on do_open%return)
    probe:do_open_1      (on do_open%return)

  You can now use it in all perf tools, such as:

	perf record -e probe:do_open_1 -aR sleep 1

  naveen@ubuntu:~/linux/tools/perf$ sudo cat /sys/kernel/debug/kprobes/list
  c000000000041370  k  kretprobe_trampoline+0x0    [OPTIMIZED]
  c0000000004ba0b8  r  do_open+0x8    [DISABLED]
  c000000000443430  r  do_open+0x0    [DISABLED]

Signed-off-by: Naveen N. Rao
Acked-by: Masami Hiramatsu
Cc: Ananth N Mavinakayanahalli
Cc: Michael Ellerman
Cc: Steven Rostedt
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/d8cd1ef420ec22e3643ac332fdabcffc77319a42.1488961018.git.naveen.n.rao@linux.vnet.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo
---
 include/linux/kprobes.h     |  1 +
 kernel/kprobes.c            | 40 ++++++++++++++++++++++++++--------------
 kernel/trace/trace_kprobe.c |  2 +-
 3 files changed, 28 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 177bdf6c6aeb..47e4da5b4fa2 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -268,6 +268,7 @@ extern void show_registers(struct pt_regs *regs);
 extern void kprobes_inc_nmissed_count(struct kprobe *p);
 extern bool arch_within_kprobe_blacklist(unsigned long addr);
 extern bool arch_function_offset_within_entry(unsigned long offset);
+extern bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset);

 extern bool within_kprobe_blacklist(unsigned long addr);

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 4780ec236035..d733479a10ee 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1391,21 +1391,19 @@ bool within_kprobe_blacklist(unsigned long addr)
  * This returns encoded errors if it fails to look up symbol or invalid
  * combination of parameters.
  */
-static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
+static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr,
+			const char *symbol_name, unsigned int offset)
 {
-	kprobe_opcode_t *addr = p->addr;
-
-	if ((p->symbol_name && p->addr) ||
-	    (!p->symbol_name && !p->addr))
+	if ((symbol_name && addr) || (!symbol_name && !addr))
 		goto invalid;

-	if (p->symbol_name) {
-		kprobe_lookup_name(p->symbol_name, addr);
+	if (symbol_name) {
+		kprobe_lookup_name(symbol_name, addr);
 		if (!addr)
 			return ERR_PTR(-ENOENT);
 	}

-	addr = (kprobe_opcode_t *)(((char *)addr) + p->offset);
+	addr = (kprobe_opcode_t *)(((char *)addr) + offset);
 	if (addr)
 		return addr;

@@ -1413,6 +1411,11 @@ invalid:
 	return ERR_PTR(-EINVAL);
 }

+static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
+{
+	return _kprobe_addr(p->addr, p->symbol_name, p->offset);
+}
+
 /* Check passed kprobe is valid and return kprobe in kprobe_table. */
 static struct kprobe *__get_valid_kprobe(struct kprobe *p)
 {
@@ -1881,19 +1884,28 @@ bool __weak arch_function_offset_within_entry(unsigned long offset)
 	return !offset;
 }

+bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
+{
+	kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
+
+	if (IS_ERR(kp_addr))
+		return false;
+
+	if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) ||
+	    !arch_function_offset_within_entry(offset))
+		return false;
+
+	return true;
+}
+
 int register_kretprobe(struct kretprobe *rp)
 {
 	int ret = 0;
 	struct kretprobe_instance *inst;
 	int i;
 	void *addr;
-	unsigned long offset;
-
-	addr = kprobe_addr(&rp->kp);
-	if (!kallsyms_lookup_size_offset((unsigned long)addr, NULL, &offset))
-		return -EINVAL;

-	if (!arch_function_offset_within_entry(offset))
+	if (!function_offset_within_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
 		return -EINVAL;

 	if (kretprobe_blacklist_size) {
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 12fb540da0e5..013f4e7146d4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -697,7 +697,7 @@ static int create_trace_kprobe(int argc, char **argv)
 			return ret;
 		}
 		if (offset && is_return &&
-		    !arch_function_offset_within_entry(offset)) {
+		    !function_offset_within_entry(NULL, symbol, offset)) {
 			pr_info("Given offset is not valid for return probe.\n");
 			return -EINVAL;
 		}
-- cgit v1.2.3

From 56f36acd215cf7c28372b2fdb4f33f6900e97e05 Mon Sep 17 00:00:00 2001
From: Amritha Nambiar
Date: Wed, 15 Mar 2017 10:39:25 -0700
Subject: mqprio: Modify mqprio to pass user parameters via ndo_setup_tc.

The configurable priority to traffic class mapping and the user specified
queue ranges are used to configure the traffic class, overriding the
hardware defaults when the 'hw' option is set to 0. However, when the
'hw' option is non-zero, the hardware QoS defaults are used.

This patch makes it so that we can pass the data the user provided to
ndo_setup_tc. This allows us to pull in the queue configuration if the
user requested it, as well as any additional hardware offload type
requested by using a value other than 1 for the hw value.

Finally, it also provides a means for the device driver to return the
level supported for the offload type via the qopt->hw value. Previously
we were always assuming the value to be 1; in the future, values beyond
just 1 may be supported.

Signed-off-by: Amritha Nambiar
Signed-off-by: Alexander Duyck
Signed-off-by: David S. Miller
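A sketch of the driver-side contract after this change (hypothetical
driver; compare the real conversions in the diff below): read the user
parameters from tc->mqprio, and report the honoured offload level back
through tc->mqprio->hw.

	static int foo_setup_tc(struct net_device *dev, u32 handle,
				__be16 proto, struct tc_to_netdev *tc)
	{
		u8 num_tc;

		if (tc->type != TC_SETUP_MQPRIO)
			return -EINVAL;

		/* Tell the stack which offload level is actually honoured. */
		tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
		num_tc = tc->mqprio->num_tc;

		return foo_hw_set_tcs(dev, num_tc);	/* hypothetical */
	}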
---
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c          |  3 ++-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   |  5 ++++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c         |  4 +++-
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c    | 16 ++++++++++------
 drivers/net/ethernet/intel/fm10k/fm10k_netdev.c   |  4 +++-
 drivers/net/ethernet/intel/i40e/i40e_main.c       |  7 +++++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     |  4 +++-
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c    |  4 +++-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  4 +++-
 drivers/net/ethernet/sfc/falcon/tx.c              |  4 +++-
 drivers/net/ethernet/sfc/tx.c                     |  4 +++-
 drivers/net/ethernet/ti/netcp_core.c              | 12 ++++++++----
 include/linux/netdevice.h                         |  2 +-
 net/sched/sch_mqprio.c                            | 17 +++++++++++------
 14 files changed, 62 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index ffea9859f5a7..7ec2c9717cf1 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1854,7 +1854,8 @@ static int xgbe_setup_tc(struct net_device *netdev, u32 handle, __be16 proto,
 	if (tc_to_netdev->type != TC_SETUP_MQPRIO)
 		return -EINVAL;

-	tc = tc_to_netdev->tc;
+	tc_to_netdev->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
+	tc = tc_to_netdev->mqprio->num_tc;

 	if (tc > pdata->hw_feat.tc_cnt)
 		return -EINVAL;
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 9e8c06130c09..ad3e0631877e 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -4277,7 +4277,10 @@ int __bnx2x_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
 {
 	if (tc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;
-	return bnx2x_setup_tc(dev, tc->tc);
+
+	tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
+
+	return bnx2x_setup_tc(dev, tc->mqprio->num_tc);
 }

 /* called with rtnl_lock */
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 32de4589d16a..174ec8f84637 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -6905,7 +6905,9 @@ static int bnxt_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
 	if (ntc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;

-	return bnxt_setup_mq_tc(dev, ntc->tc);
+	ntc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
+
+	return bnxt_setup_mq_tc(dev, ntc->mqprio->num_tc);
 }

 #ifdef CONFIG_RFS_ACCEL
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
index aa769cbc7425..d4bb8bf86a45 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c
@@ -346,33 +346,37 @@ static int dpaa_setup_tc(struct net_device *net_dev, u32 handle, __be16 proto,
 			 struct tc_to_netdev *tc)
 {
 	struct dpaa_priv *priv = netdev_priv(net_dev);
+	u8 num_tc;
 	int i;

 	if (tc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;

-	if (tc->tc == priv->num_tc)
+	tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
+	num_tc = tc->mqprio->num_tc;
+
+	if (num_tc == priv->num_tc)
 		return 0;

-	if (!tc->tc) {
+	if (!num_tc) {
 		netdev_reset_tc(net_dev);
 		goto out;
 	}

-	if (tc->tc > DPAA_TC_NUM) {
+	if (num_tc > DPAA_TC_NUM) {
 		netdev_err(net_dev, "Too many traffic classes: max %d supported.\n",
 			   DPAA_TC_NUM);
 		return -EINVAL;
 	}

-	netdev_set_num_tc(net_dev, tc->tc);
+	netdev_set_num_tc(net_dev, num_tc);

-	for (i = 0; i < tc->tc; i++)
+	for (i = 0; i < num_tc; i++)
 		netdev_set_tc_queue(net_dev, i, DPAA_TC_TXQ_NUM,
 				    i * DPAA_TC_TXQ_NUM);

 out:
-	priv->num_tc = tc->tc ? tc->tc : 1;
+	priv->num_tc = num_tc ? : 1;
 	netif_set_real_num_tx_queues(net_dev, priv->num_tc * DPAA_TC_TXQ_NUM);
 	return 0;
 }
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
index 01db688cf539..72481670478c 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c
@@ -1226,7 +1226,9 @@ static int __fm10k_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
 	if (tc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;

-	return fm10k_setup_tc(dev, tc->tc);
+	tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
+
+	return fm10k_setup_tc(dev, tc->mqprio->num_tc);
 }

 static void fm10k_assign_l2_accel(struct fm10k_intfc *interface,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 113b32911f1b..9df0d86812e7 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -5611,9 +5611,12 @@ static int __i40e_setup_tc(struct net_device *netdev, u32 handle, __be16 proto,
 			   struct tc_to_netdev *tc)
 #endif
 {
-	if (handle != TC_H_ROOT || tc->type != TC_SETUP_MQPRIO)
+	if (tc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;
-	return i40e_setup_tc(netdev, tc->tc);
+
+	tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
+
+	return i40e_setup_tc(netdev, tc->mqprio->num_tc);
 }

 /**
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index a7a430a7be2c..d45477db0227 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8948,7 +8948,9 @@ static int __ixgbe_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
 	if (tc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;

-	return ixgbe_setup_tc(dev, tc->tc);
+	tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
+
+	return ixgbe_setup_tc(dev, tc->mqprio->num_tc);
 }

 #ifdef CONFIG_PCI_IOV
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 61420473fe5f..94fab20ef146 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -92,7 +92,9 @@ static int __mlx4_en_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
 	if (tc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;

-	return mlx4_en_setup_tc(dev, tc->tc);
+	tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
+
+	return mlx4_en_setup_tc(dev, tc->mqprio->num_tc);
 }

 #ifdef CONFIG_RFS_ACCEL
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 8ef64c4db2c2..f96a73ea8e0b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2737,7 +2737,9 @@ mqprio:
 	if (tc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;

-	return mlx5e_setup_tc(dev, tc->tc);
+	tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
+
+	return mlx5e_setup_tc(dev, tc->mqprio->num_tc);
 }

 static void
diff --git a/drivers/net/ethernet/sfc/falcon/tx.c b/drivers/net/ethernet/sfc/falcon/tx.c
index 104fb15a73f2..f6daf09b8627 100644
--- a/drivers/net/ethernet/sfc/falcon/tx.c
+++ b/drivers/net/ethernet/sfc/falcon/tx.c
@@ -437,11 +437,13 @@ int ef4_setup_tc(struct net_device *net_dev, u32 handle, __be16 proto,
 	if (ntc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;

-	num_tc = ntc->tc;
+	num_tc = ntc->mqprio->num_tc;

 	if (ef4_nic_rev(efx) < EF4_REV_FALCON_B0 || num_tc > EF4_MAX_TX_TC)
 		return -EINVAL;
+	ntc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
+
 	if (num_tc == net_dev->num_tc)
 		return 0;

diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index ff88d60aa6d5..3bdf87f31087 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -665,11 +665,13 @@ int efx_setup_tc(struct net_device *net_dev, u32 handle, __be16 proto,
 	if (ntc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;

-	num_tc = ntc->tc;
+	num_tc = ntc->mqprio->num_tc;

 	if (num_tc > EFX_MAX_TX_TC)
 		return -EINVAL;

+	ntc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
+
 	if (num_tc == net_dev->num_tc)
 		return 0;

diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index 7c7ae0890e90..9027c9c509b5 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -1882,6 +1882,7 @@ static u16 netcp_select_queue(struct net_device *dev, struct sk_buff *skb,
 static int netcp_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
 			  struct tc_to_netdev *tc)
 {
+	u8 num_tc;
 	int i;

 	/* setup tc must be called under rtnl lock */
@@ -1890,15 +1891,18 @@ static int netcp_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
 	if (tc->type != TC_SETUP_MQPRIO)
 		return -EINVAL;

+	tc->mqprio->hw = TC_MQPRIO_HW_OFFLOAD_TCS;
+	num_tc = tc->mqprio->num_tc;
+
 	/* Sanity-check the number of traffic classes requested */
 	if ((dev->real_num_tx_queues <= 1) ||
-	    (dev->real_num_tx_queues < tc->tc))
+	    (dev->real_num_tx_queues < num_tc))
 		return -EINVAL;

 	/* Configure traffic class to queue mappings */
-	if (tc->tc) {
-		netdev_set_num_tc(dev, tc->tc);
-		for (i = 0; i < tc->tc; i++)
+	if (num_tc) {
+		netdev_set_num_tc(dev, num_tc);
+		for (i = 0; i < num_tc; i++)
 			netdev_set_tc_queue(dev, i, 1, i);
 	} else {
 		netdev_reset_tc(dev);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 97456b2539e4..b7365b587818 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -786,11 +786,11 @@ struct tc_cls_u32_offload;
 struct tc_to_netdev {
 	unsigned int type;
 	union {
-		u8 tc;
 		struct tc_cls_u32_offload *cls_u32;
 		struct tc_cls_flower_offload *cls_flower;
 		struct tc_cls_matchall_offload *cls_mall;
 		struct tc_cls_bpf_offload *cls_bpf;
+		struct tc_mqprio_qopt *mqprio;
 	};
 	bool egress_dev;
 };
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 5f55bf149d9f..0a4cf27ea54b 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -28,7 +28,6 @@ static void mqprio_destroy(struct Qdisc *sch)
 {
 	struct net_device *dev = qdisc_dev(sch);
 	struct mqprio_sched *priv = qdisc_priv(sch);
-	struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO};
 	unsigned int ntx;

 	if (priv->qdiscs) {
@@ -39,10 +38,15 @@ static void mqprio_destroy(struct Qdisc *sch)
 		kfree(priv->qdiscs);
 	}

-	if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc)
+	if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) {
+		struct tc_mqprio_qopt offload = { 0 };
+		struct tc_to_netdev tc = { .type = TC_SETUP_MQPRIO,
+					   { .mqprio = &offload } };
+
 		dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc);
-	else
+	} else {
 		netdev_set_num_tc(dev, 0);
+	}
 }

 static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
@@ -144,14 +148,15 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 	 * supplied and verified mapping
 	 */
 	if (qopt->hw) {
-		struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO,
-					  { .tc = qopt->num_tc }};
+		struct tc_mqprio_qopt offload = *qopt;
+		struct tc_to_netdev tc = { .type = TC_SETUP_MQPRIO,
+					   { .mqprio = &offload } };
 		err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc);
 		if (err)
 			return err;

-		priv->hw_offload = qopt->hw;
+		priv->hw_offload = offload.hw;
 	} else {
 		netdev_set_num_tc(dev, qopt->num_tc);
 		for (i = 0; i < qopt->num_tc; i++)
-- cgit v1.2.3

From f4c0b0aa58d9b7e30ab0a95e33da84d53b3d764a Mon Sep 17 00:00:00 2001
From: Will Deacon
Date: Mon, 20 Feb 2017 15:33:50 +0200
Subject: perf/core: Keep AUX flags in the output handle

In preparation for adding more flags to perf AUX records, introduce a
separate API for setting the flags for a session, rather than appending
more bool arguments to perf_aux_output_end. This allows each flag to be
set at the time the corresponding condition is detected, instead of
tracking it in each driver's private state.

Signed-off-by: Will Deacon
Signed-off-by: Alexander Shishkin
Signed-off-by: Peter Zijlstra (Intel)
Cc: Arnaldo Carvalho de Melo
Cc: Jiri Olsa
Cc: Linus Torvalds
Cc: Mathieu Poirier
Cc: Peter Zijlstra
Cc: Stephane Eranian
Cc: Thomas Gleixner
Cc: Vince Weaver
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20170220133352.17995-3-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar
---
 arch/x86/events/intel/bts.c                      | 16 +++++------
 arch/x86/events/intel/pt.c                       | 17 ++++++------
 arch/x86/events/intel/pt.h                       |  1 -
 drivers/hwtracing/coresight/coresight-etb10.c    |  9 +++----
 drivers/hwtracing/coresight/coresight-etm-perf.c |  9 +++----
 drivers/hwtracing/coresight/coresight-priv.h     |  2 --
 drivers/hwtracing/coresight/coresight-tmc-etf.c  |  7 +++--
 include/linux/coresight.h                        |  2 +-
 include/linux/perf_event.h                       |  8 +++---
 kernel/events/ring_buffer.c                      | 34 ++++++++++++++++--------
 10 files changed, 56 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 982c9e31daca..8ae8c5ce3a1f 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -63,7 +63,6 @@ struct bts_buffer {
 	unsigned int	cur_buf;
 	bool		snapshot;
 	local_t		data_size;
-	local_t		lost;
 	local_t		head;
 	unsigned long	end;
 	void		**data_pages;
@@ -199,7 +198,8 @@ static void bts_update(struct bts_ctx *bts)
 			return;

 		if (ds->bts_index >= ds->bts_absolute_maximum)
-			local_inc(&buf->lost);
+			perf_aux_output_flag(&bts->handle,
+					     PERF_AUX_FLAG_TRUNCATED);

 		/*
 		 * old and head are always in the same physical buffer, so we
@@ -276,7 +276,7 @@ static void bts_event_start(struct perf_event *event, int flags)
 	return;

 fail_end_stop:
-	perf_aux_output_end(&bts->handle, 0, false);
+	perf_aux_output_end(&bts->handle, 0);

 fail_stop:
 	event->hw.state = PERF_HES_STOPPED;
@@ -319,9 +319,8 @@ static void bts_event_stop(struct perf_event *event, int flags)
 			bts->handle.head =
 				local_xchg(&buf->data_size,
 					   buf->nr_pages << PAGE_SHIFT);
-
-		perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
-				    !!local_xchg(&buf->lost, 0));
+		perf_aux_output_end(&bts->handle,
+				    local_xchg(&buf->data_size, 0));
 	}

 	cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
@@ -484,8 +483,7 @@ int intel_bts_interrupt(void)
 	if (old_head == local_read(&buf->head))
 		return handled;

-	perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
-			    !!local_xchg(&buf->lost, 0));
+	perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0));

 	buf = perf_aux_output_begin(&bts->handle, event);
 	if (buf)
@@ -500,7 +498,7 @@ int intel_bts_interrupt(void)
 		 * cleared handle::event
 		 */
 		barrier();
-		perf_aux_output_end(&bts->handle, 0, false);
+		perf_aux_output_end(&bts->handle, 0);
 	}
 }
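A condensed sketch of the resulting API pairing for an AUX-capable PMU
driver (the foo_* names are hypothetical): flags are raised as each
condition is detected, and only the collected size is reported when the
session ends.

	static void foo_pmu_stop(struct foo_pmu *pmu)	/* hypothetical */
	{
		struct foo_buf *buf = perf_get_aux(&pmu->handle);

		if (buf && foo_hw_wrapped(pmu))		/* hypothetical check */
			perf_aux_output_flag(&pmu->handle,
					     PERF_AUX_FLAG_TRUNCATED);

		/* Single size argument; no more 'truncated' bool. */
		perf_aux_output_end(&pmu->handle, foo_data_size(buf));
	}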
b/arch/x86/events/intel/pt.c index 5900471ee508..0218728be37a 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -753,7 +753,8 @@ static void pt_handle_status(struct pt *pt) */ if (!pt_cap_get(PT_CAP_topa_multiple_entries) || buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { - local_inc(&buf->lost); + perf_aux_output_flag(&pt->handle, + PERF_AUX_FLAG_TRUNCATED); advance++; } } @@ -846,8 +847,10 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, /* can't stop in the middle of an output region */ if (buf->output_off + handle->size + 1 < - sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) + sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { + perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); return -EINVAL; + } /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */ @@ -1192,8 +1195,7 @@ void intel_pt_interrupt(void) pt_update_head(pt); - perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), - local_xchg(&buf->lost, 0)); + perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0)); if (!event->hw.state) { int ret; @@ -1208,7 +1210,7 @@ void intel_pt_interrupt(void) /* snapshot counters don't use PMI, so it's safe */ ret = pt_buffer_reset_markers(buf, &pt->handle); if (ret) { - perf_aux_output_end(&pt->handle, 0, true); + perf_aux_output_end(&pt->handle, 0); return; } @@ -1280,7 +1282,7 @@ static void pt_event_start(struct perf_event *event, int mode) return; fail_end_stop: - perf_aux_output_end(&pt->handle, 0, true); + perf_aux_output_end(&pt->handle, 0); fail_stop: hwc->state = PERF_HES_STOPPED; } @@ -1321,8 +1323,7 @@ static void pt_event_stop(struct perf_event *event, int mode) pt->handle.head = local_xchg(&buf->data_size, buf->nr_pages << PAGE_SHIFT); - perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), - local_xchg(&buf->lost, 0)); + perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0)); } } diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index 53473c21b554..b528e8f373e4 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -143,7 +143,6 @@ struct pt_buffer { size_t output_off; unsigned long nr_pages; local_t data_size; - local_t lost; local64_t head; bool snapshot; unsigned long stop_pos, intr_pos; diff --git a/drivers/hwtracing/coresight/coresight-etb10.c b/drivers/hwtracing/coresight/coresight-etb10.c index d7325c6534ad..979ea6ec7902 100644 --- a/drivers/hwtracing/coresight/coresight-etb10.c +++ b/drivers/hwtracing/coresight/coresight-etb10.c @@ -321,7 +321,7 @@ static int etb_set_buffer(struct coresight_device *csdev, static unsigned long etb_reset_buffer(struct coresight_device *csdev, struct perf_output_handle *handle, - void *sink_config, bool *lost) + void *sink_config) { unsigned long size = 0; struct cs_buffers *buf = sink_config; @@ -343,7 +343,6 @@ static unsigned long etb_reset_buffer(struct coresight_device *csdev, * resetting parameters here and squaring off with the ring * buffer API in the tracer PMU is fine. 
*/ - *lost = !!local_xchg(&buf->lost, 0); size = local_xchg(&buf->data_size, 0); } @@ -385,7 +384,7 @@ static void etb_update_buffer(struct coresight_device *csdev, (unsigned long)write_ptr); write_ptr &= ~(ETB_FRAME_SIZE_WORDS - 1); - local_inc(&buf->lost); + perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); } /* @@ -396,7 +395,7 @@ static void etb_update_buffer(struct coresight_device *csdev, */ status = readl_relaxed(drvdata->base + ETB_STATUS_REG); if (status & ETB_STATUS_RAM_FULL) { - local_inc(&buf->lost); + perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); to_read = capacity; read_ptr = write_ptr; } else { @@ -429,7 +428,7 @@ static void etb_update_buffer(struct coresight_device *csdev, if (read_ptr > (drvdata->buffer_depth - 1)) read_ptr -= drvdata->buffer_depth; /* let the decoder know we've skipped ahead */ - local_inc(&buf->lost); + perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); } /* finally tell HW where we want to start reading from */ diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c index 26cfac3e6de7..288a423c1b27 100644 --- a/drivers/hwtracing/coresight/coresight-etm-perf.c +++ b/drivers/hwtracing/coresight/coresight-etm-perf.c @@ -302,7 +302,8 @@ out: return; fail_end_stop: - perf_aux_output_end(handle, 0, true); + perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); + perf_aux_output_end(handle, 0); fail: event->hw.state = PERF_HES_STOPPED; goto out; @@ -310,7 +311,6 @@ fail: static void etm_event_stop(struct perf_event *event, int mode) { - bool lost; int cpu = smp_processor_id(); unsigned long size; struct coresight_device *sink, *csdev = per_cpu(csdev_src, cpu); @@ -348,10 +348,9 @@ static void etm_event_stop(struct perf_event *event, int mode) return; size = sink_ops(sink)->reset_buffer(sink, handle, - event_data->snk_config, - &lost); + event_data->snk_config); - perf_aux_output_end(handle, size, lost); + perf_aux_output_end(handle, size); } /* Disabling the path make its elements available to other sessions */ diff --git a/drivers/hwtracing/coresight/coresight-priv.h b/drivers/hwtracing/coresight/coresight-priv.h index ef9d8e93e3b2..5f662d82052c 100644 --- a/drivers/hwtracing/coresight/coresight-priv.h +++ b/drivers/hwtracing/coresight/coresight-priv.h @@ -76,7 +76,6 @@ enum cs_mode { * @nr_pages: max number of pages granted to us * @offset: offset within the current buffer * @data_size: how much we collected in this run - * @lost: other than zero if we had a HW buffer wrap around * @snapshot: is this run in snapshot mode * @data_pages: a handle the ring buffer */ @@ -85,7 +84,6 @@ struct cs_buffers { unsigned int nr_pages; unsigned long offset; local_t data_size; - local_t lost; bool snapshot; void **data_pages; }; diff --git a/drivers/hwtracing/coresight/coresight-tmc-etf.c b/drivers/hwtracing/coresight/coresight-tmc-etf.c index 1549436e2492..aec61a6d5c63 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-etf.c +++ b/drivers/hwtracing/coresight/coresight-tmc-etf.c @@ -329,7 +329,7 @@ static int tmc_set_etf_buffer(struct coresight_device *csdev, static unsigned long tmc_reset_etf_buffer(struct coresight_device *csdev, struct perf_output_handle *handle, - void *sink_config, bool *lost) + void *sink_config) { long size = 0; struct cs_buffers *buf = sink_config; @@ -350,7 +350,6 @@ static unsigned long tmc_reset_etf_buffer(struct coresight_device *csdev, * resetting parameters here and squaring off with the ring * buffer API in the tracer PMU is fine. 
*/ - *lost = !!local_xchg(&buf->lost, 0); size = local_xchg(&buf->data_size, 0); } @@ -389,7 +388,7 @@ static void tmc_update_etf_buffer(struct coresight_device *csdev, */ status = readl_relaxed(drvdata->base + TMC_STS); if (status & TMC_STS_FULL) { - local_inc(&buf->lost); + perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); to_read = drvdata->size; } else { to_read = CIRC_CNT(write_ptr, read_ptr, drvdata->size); @@ -434,7 +433,7 @@ static void tmc_update_etf_buffer(struct coresight_device *csdev, read_ptr -= drvdata->size; /* Tell the HW */ writel_relaxed(read_ptr, drvdata->base + TMC_RRP); - local_inc(&buf->lost); + perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); } cur = buf->cur; diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 2a5982c37dfb..035c16c9a505 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -201,7 +201,7 @@ struct coresight_ops_sink { void *sink_config); unsigned long (*reset_buffer)(struct coresight_device *csdev, struct perf_output_handle *handle, - void *sink_config, bool *lost); + void *sink_config); void (*update_buffer)(struct coresight_device *csdev, struct perf_output_handle *handle, void *sink_config); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f19a82362851..b6e75c9d4791 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -801,6 +801,7 @@ struct perf_output_handle { struct ring_buffer *rb; unsigned long wakeup; unsigned long size; + u64 aux_flags; union { void *addr; unsigned long head; @@ -849,10 +850,11 @@ perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx) extern void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event); extern void perf_aux_output_end(struct perf_output_handle *handle, - unsigned long size, bool truncated); + unsigned long size); extern int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size); extern void *perf_get_aux(struct perf_output_handle *handle); +extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags); extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); extern void perf_pmu_unregister(struct pmu *pmu); @@ -1268,8 +1270,8 @@ static inline void * perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) { return NULL; } static inline void -perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, - bool truncated) { } +perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) + { } static inline int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) { return -EINVAL; } diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 257fa460b846..9654e55c38d6 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -297,6 +297,19 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) rb->paused = 1; } +void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags) +{ + /* + * OVERWRITE is determined by perf_aux_output_end() and can't + * be passed in directly. + */ + if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE)) + return; + + handle->aux_flags |= flags; +} +EXPORT_SYMBOL_GPL(perf_aux_output_flag); + /* * This is called before hardware starts writing to the AUX area to * obtain an output handle and make sure there's room in the buffer. 
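/*
 * Illustrative sketch, not part of this patch: the foo_* names below are
 * hypothetical; perf_aux_output_flag() and the two-argument
 * perf_aux_output_end() are the interfaces introduced/changed above.
 * With the flags kept in the output handle, a driver raises each flag
 * where the condition is detected and ends the transaction with just
 * the size:
 */
static void foo_pmu_drain(struct foo_pmu *pmu)
{
	unsigned long size = foo_hw_collected_bytes(pmu);	/* hypothetical */

	if (foo_hw_buffer_wrapped(pmu))				/* hypothetical */
		perf_aux_output_flag(&pmu->handle, PERF_AUX_FLAG_TRUNCATED);

	perf_aux_output_end(&pmu->handle, size);
}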
@@ -360,6 +373,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, handle->event = event; handle->head = aux_head; handle->size = 0; + handle->aux_flags = 0; /* * In overwrite mode, AUX data stores do not depend on aux_tail, @@ -408,34 +422,32 @@ err: * of the AUX buffer management code is that after pmu::stop(), the AUX * transaction must be stopped and therefore drop the AUX reference count. */ -void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, - bool truncated) +void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) { struct ring_buffer *rb = handle->rb; - bool wakeup = truncated; + bool wakeup = !!handle->aux_flags; unsigned long aux_head; - u64 flags = 0; - - if (truncated) - flags |= PERF_AUX_FLAG_TRUNCATED; /* in overwrite mode, driver provides aux_head via handle */ if (rb->aux_overwrite) { - flags |= PERF_AUX_FLAG_OVERWRITE; + handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE; aux_head = handle->head; local_set(&rb->aux_head, aux_head); } else { + handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE; + aux_head = local_read(&rb->aux_head); local_add(size, &rb->aux_head); } - if (size || flags) { + if (size || handle->aux_flags) { /* * Only send RECORD_AUX if we have something useful to communicate */ - perf_event_aux_event(handle->event, aux_head, size, flags); + perf_event_aux_event(handle->event, aux_head, size, + handle->aux_flags); } aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); @@ -446,7 +458,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, } if (wakeup) { - if (truncated) + if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) handle->event->pending_disable = 1; perf_output_wakeup(handle); } -- cgit v1.2.3 From 6419c4af777a773a45a1b1af735de0fcd9a7dcc7 Mon Sep 17 00:00:00 2001 From: "J. R. Okajima" Date: Fri, 3 Feb 2017 01:38:17 +0900 Subject: locking/lockdep: Add new check to lock_downgrade() Commit: f8319483f57f ("locking/lockdep: Provide a type check for lock_is_held") didn't fully cover rwsems as downgrade_write() was left out. Introduce lock_downgrade() and use it to add new checks. See-also: http://marc.info/?l=linux-kernel&m=148581164003149&w=2 Originally-written-by: Peter Zijlstra Signed-off-by: J. R. Okajima Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1486053497-9948-3-git-send-email-hooanon05g@gmail.com [ Rewrote the changelog. 
] Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 3 +++ kernel/locking/lockdep.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/locking/rwsem.c | 6 ++---- 3 files changed, 60 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 1e327bb80838..fffe49f188e6 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -361,6 +361,8 @@ static inline void lock_set_subclass(struct lockdep_map *lock, lock_set_class(lock, lock->name, lock->key, subclass, ip); } +extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip); + extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask); extern void lockdep_clear_current_reclaim_state(void); extern void lockdep_trace_alloc(gfp_t mask); @@ -411,6 +413,7 @@ static inline void lockdep_on(void) # define lock_acquire(l, s, t, r, c, n, i) do { } while (0) # define lock_release(l, n, i) do { } while (0) +# define lock_downgrade(l, i) do { } while (0) # define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_set_current_reclaim_state(g) do { } while (0) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index da795480e0aa..b1a1cefb8244 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3533,6 +3533,44 @@ __lock_set_class(struct lockdep_map *lock, const char *name, return 1; } +static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) +{ + struct task_struct *curr = current; + struct held_lock *hlock; + unsigned int depth; + int i; + + depth = curr->lockdep_depth; + /* + * This function is about (re)setting the class of a held lock, + * yet we're not actually holding any locks. Naughty user! + */ + if (DEBUG_LOCKS_WARN_ON(!depth)) + return 0; + + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) + return print_unlock_imbalance_bug(curr, lock, ip); + + curr->lockdep_depth = i; + curr->curr_chain_key = hlock->prev_chain_key; + + WARN(hlock->read, "downgrading a read lock"); + hlock->read = 1; + hlock->acquire_ip = ip; + + if (reacquire_held_locks(curr, depth, i)) + return 0; + + /* + * I took it apart and put it back together again, except now I have + * these 'spare' parts.. where shall I put them. + */ + if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) + return 0; + return 1; +} + /* * Remove the lock to the list of currently held locks - this gets * called on mutex_unlock()/spin_unlock*() (or on a failed @@ -3759,6 +3797,23 @@ void lock_set_class(struct lockdep_map *lock, const char *name, } EXPORT_SYMBOL_GPL(lock_set_class); +void lock_downgrade(struct lockdep_map *lock, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + current->lockdep_recursion = 1; + check_flags(flags); + if (__lock_downgrade(lock, ip)) + check_chain_key(current); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_downgrade); + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 90a74ccd85a4..4d48b1c4870d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -124,10 +124,8 @@ EXPORT_SYMBOL(up_write); */ void downgrade_write(struct rw_semaphore *sem) { - /* - * lockdep: a downgraded write will live on as a write - * dependency. 
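/*
 * Illustrative sketch, not part of this patch: foo_update_then_read() is
 * hypothetical; the rwsem API is real. After this change, a downgrade is
 * annotated via lock_downgrade(), so lockdep's held-lock state correctly
 * reflects the read-mode ownership:
 */
static void foo_update_then_read(struct rw_semaphore *sem)
{
	down_write(sem);
	/* ... modify the protected data ... */
	downgrade_write(sem);	/* now calls lock_downgrade() internally */
	/* ... read-side work; concurrent readers are allowed again ... */
	up_read(sem);
}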
- */ + lock_downgrade(&sem->dep_map, _RET_IP_); + rwsem_set_reader_owned(sem); __downgrade_write(sem); } -- cgit v1.2.3 From 383776fa7527745224446337f2dcfb0f0d1b8b56 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Feb 2017 15:37:36 +0100 Subject: locking/lockdep: Handle statically initialized PER_CPU locks properly If a PER_CPU struct which contains a spin_lock is statically initialized via: DEFINE_PER_CPU(struct foo, bla) = { .lock = __SPIN_LOCK_UNLOCKED(bla.lock) }; then lockdep assigns a separate key to each lock because the logic for assigning a key to statically initialized locks is to use the address as the key. With per CPU locks the address is obviously different on each CPU. That's wrong, because all locks should have the same key. To solve this the following modifications are required: 1) Extend the is_kernel/module_percpu_addr() functions to hand back the canonical address of the per CPU address, i.e. the per CPU address minus the per CPU offset. 2) Check the lock address with these functions and if the per CPU check matches use the returned canonical address as the lock key, so all per CPU locks have the same key. 3) Move the static_obj(key) check into look_up_lock_class() so this check can be avoided for statically initialized per CPU locks. That's required because the canonical address fails the static_obj(key) check for obvious reasons. Reported-by: Mike Galbraith Signed-off-by: Thomas Gleixner [ Merged Dan's fixups for !MODULES and !SMP into this patch. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Dan Murphy Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20170227143736.pectaimkjkan5kow@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/module.h | 6 ++++++ include/linux/percpu.h | 1 + kernel/locking/lockdep.c | 33 +++++++++++++++++++++++---------- kernel/module.c | 36 ++++++++++++++++++++++++------------ mm/percpu.c | 37 +++++++++++++++++++++++-------------- 5 files changed, 77 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 0297c5cd7cdf..9ad68561d8c2 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -493,6 +493,7 @@ static inline int module_is_live(struct module *mod) struct module *__module_text_address(unsigned long addr); struct module *__module_address(unsigned long addr); bool is_module_address(unsigned long addr); +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr); bool is_module_percpu_address(unsigned long addr); bool is_module_text_address(unsigned long addr); @@ -660,6 +661,11 @@ static inline bool is_module_percpu_address(unsigned long addr) return false; } +static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) +{ + return false; +} + static inline bool is_module_text_address(unsigned long addr) { return false; diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 56939d3f6e53..491b3f5a5f8a 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -110,6 +110,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, #endif extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align); +extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr); extern bool is_kernel_percpu_address(unsigned long addr); #if !defined(CONFIG_SMP) || !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) diff --git a/kernel/locking/lockdep.c
b/kernel/locking/lockdep.c index b1a1cefb8244..98dd6231d43b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -660,6 +660,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) struct lockdep_subclass_key *key; struct hlist_head *hash_head; struct lock_class *class; + bool is_static = false; if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { debug_locks_off(); @@ -673,10 +674,23 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) /* * Static locks do not have their class-keys yet - for them the key - * is the lock object itself: + * is the lock object itself. If the lock is in the per cpu area, + * the canonical address of the lock (per cpu offset removed) is + * used. */ - if (unlikely(!lock->key)) - lock->key = (void *)lock; + if (unlikely(!lock->key)) { + unsigned long can_addr, addr = (unsigned long)lock; + + if (__is_kernel_percpu_address(addr, &can_addr)) + lock->key = (void *)can_addr; + else if (__is_module_percpu_address(addr, &can_addr)) + lock->key = (void *)can_addr; + else if (static_obj(lock)) + lock->key = (void *)lock; + else + return ERR_PTR(-EINVAL); + is_static = true; + } /* * NOTE: the class-key must be unique. For dynamic locks, a static @@ -708,7 +722,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) } } - return NULL; + return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); } /* @@ -726,19 +740,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) DEBUG_LOCKS_WARN_ON(!irqs_disabled()); class = look_up_lock_class(lock, subclass); - if (likely(class)) + if (likely(!IS_ERR_OR_NULL(class))) goto out_set_class_cache; /* * Debug-check: all keys must be persistent! - */ - if (!static_obj(lock->key)) { + */ + if (IS_ERR(class)) { debug_locks_off(); printk("INFO: trying to register non-static key.\n"); printk("the code is fine but needs lockdep annotation.\n"); printk("turning off the locking correctness validator.\n"); dump_stack(); - return NULL; } @@ -3419,7 +3432,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) * Clearly if the lock hasn't been acquired _ever_, we're not * holding it either, so report failure. */ - if (!class) + if (IS_ERR_OR_NULL(class)) return 0; /* @@ -4225,7 +4238,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) * If the class exists we look it up and zap it: */ class = look_up_lock_class(lock, j); - if (class) + if (!IS_ERR_OR_NULL(class)) zap_class(class); } /* diff --git a/kernel/module.c b/kernel/module.c index 7eba6dea4f41..5ef618133849 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -665,16 +665,7 @@ static void percpu_modcopy(struct module *mod, memcpy(per_cpu_ptr(mod->percpu, cpu), from, size); } -/** - * is_module_percpu_address - test whether address is from module static percpu - * @addr: address to test - * - * Test whether @addr belongs to module static percpu area. 
- * - * RETURNS: - * %true if @addr is from module static percpu area - */ -bool is_module_percpu_address(unsigned long addr) +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) { struct module *mod; unsigned int cpu; @@ -688,9 +679,11 @@ bool is_module_percpu_address(unsigned long addr) continue; for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(mod->percpu, cpu); + void *va = (void *)addr; - if ((void *)addr >= start && - (void *)addr < start + mod->percpu_size) { + if (va >= start && va < start + mod->percpu_size) { + if (can_addr) + *can_addr = (unsigned long) (va - start); preempt_enable(); return true; } @@ -701,6 +694,20 @@ bool is_module_percpu_address(unsigned long addr) return false; } +/** + * is_module_percpu_address - test whether address is from module static percpu + * @addr: address to test + * + * Test whether @addr belongs to module static percpu area. + * + * RETURNS: + * %true if @addr is from module static percpu area + */ +bool is_module_percpu_address(unsigned long addr) +{ + return __is_module_percpu_address(addr, NULL); +} + #else /* ... !CONFIG_SMP */ static inline void __percpu *mod_percpu(struct module *mod) @@ -732,6 +739,11 @@ bool is_module_percpu_address(unsigned long addr) return false; } +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) +{ + return false; +} + #endif /* CONFIG_SMP */ #define MODINFO_ATTR(field) \ diff --git a/mm/percpu.c b/mm/percpu.c index 5696039b5c07..7d3b728c0254 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1281,18 +1281,7 @@ void free_percpu(void __percpu *ptr) } EXPORT_SYMBOL_GPL(free_percpu); -/** - * is_kernel_percpu_address - test whether address is from static percpu area - * @addr: address to test - * - * Test whether @addr belongs to in-kernel static percpu area. Module - * static percpu areas are not considered. For those, use - * is_module_percpu_address(). - * - * RETURNS: - * %true if @addr is from in-kernel static percpu area, %false otherwise. - */ -bool is_kernel_percpu_address(unsigned long addr) +bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr) { #ifdef CONFIG_SMP const size_t static_size = __per_cpu_end - __per_cpu_start; @@ -1301,15 +1290,35 @@ bool is_kernel_percpu_address(unsigned long addr) for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(base, cpu); + void *va = (void *)addr; - if ((void *)addr >= start && (void *)addr < start + static_size) + if (va >= start && va < start + static_size) { + if (can_addr) + *can_addr = (unsigned long) (va - start); return true; - } + } + } #endif /* on UP, can't distinguish from other static vars, always false */ return false; } +/** + * is_kernel_percpu_address - test whether address is from static percpu area + * @addr: address to test + * + * Test whether @addr belongs to in-kernel static percpu area. Module + * static percpu areas are not considered. For those, use + * is_module_percpu_address(). + * + * RETURNS: + * %true if @addr is from in-kernel static percpu area, %false otherwise. 
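/*
 * Illustrative sketch, not part of this patch: the wrapper function is
 * hypothetical, but the body mirrors the look_up_lock_class() hunk above.
 * Every CPU's copy of a statically initialized per-CPU lock has a
 * different address, so address-as-key used to create one lockdep class
 * per CPU; stripping the per-CPU offset yields one shared key:
 */
static void foo_set_percpu_lock_key(struct lockdep_map *lock)
{
	unsigned long can_addr, addr = (unsigned long)lock;

	if (__is_kernel_percpu_address(addr, &can_addr))
		lock->key = (void *)can_addr;	/* identical on every CPU */
	else if (__is_module_percpu_address(addr, &can_addr))
		lock->key = (void *)can_addr;
}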
+ */ +bool is_kernel_percpu_address(unsigned long addr) +{ + return __is_kernel_percpu_address(addr, NULL); +} + /** * per_cpu_ptr_to_phys - convert translated percpu address to physical address * @addr: the address to be converted to physical address -- cgit v1.2.3 From ef88f33fc1ee0a12a1e5eee7e4f70b7743100a19 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 2 Mar 2017 12:51:15 +0100 Subject: USB: serial: clean up endpoint and port-counter types Use unsigned-char type for the endpoint and port counters. Signed-off-by: Johan Hovold --- drivers/usb/serial/usb-serial.c | 14 +++++++------- include/linux/usb/serial.h | 11 ++++++----- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index c20d90ed1ef2..e5b859ad15c6 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -728,12 +728,12 @@ static int usb_serial_probe(struct usb_interface *interface, int buffer_size; int i; int j; - int num_interrupt_in = 0; - int num_interrupt_out = 0; - int num_bulk_in = 0; - int num_bulk_out = 0; + unsigned char num_interrupt_in = 0; + unsigned char num_interrupt_out = 0; + unsigned char num_bulk_in = 0; + unsigned char num_bulk_out = 0; int num_ports = 0; - int max_endpoints; + unsigned char max_endpoints; mutex_lock(&table_lock); type = search_serial_device(interface); @@ -879,7 +879,7 @@ static int usb_serial_probe(struct usb_interface *interface, num_ports = MAX_NUM_PORTS; } - serial->num_ports = num_ports; + serial->num_ports = (unsigned char)num_ports; serial->num_bulk_in = num_bulk_in; serial->num_bulk_out = num_bulk_out; serial->num_interrupt_in = num_interrupt_in; @@ -894,7 +894,7 @@ static int usb_serial_probe(struct usb_interface *interface, max_endpoints = max(num_bulk_in, num_bulk_out); max_endpoints = max(max_endpoints, num_interrupt_in); max_endpoints = max(max_endpoints, num_interrupt_out); - max_endpoints = max(max_endpoints, (int)serial->num_ports); + max_endpoints = max(max_endpoints, serial->num_ports); serial->num_port_pointers = max_endpoints; dev_dbg(ddev, "setting up %d port structure(s)\n", max_endpoints); diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index 704a1ab8240c..85b475933848 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -159,10 +159,10 @@ struct usb_serial { unsigned char minors_reserved:1; unsigned char num_ports; unsigned char num_port_pointers; - char num_interrupt_in; - char num_interrupt_out; - char num_bulk_in; - char num_bulk_out; + unsigned char num_interrupt_in; + unsigned char num_interrupt_out; + unsigned char num_bulk_in; + unsigned char num_bulk_out; struct usb_serial_port *port[MAX_NUM_PORTS]; struct kref kref; struct mutex disc_mutex; @@ -227,13 +227,14 @@ static inline void usb_set_serial_data(struct usb_serial *serial, void *data) struct usb_serial_driver { const char *description; const struct usb_device_id *id_table; - char num_ports; struct list_head driver_list; struct device_driver driver; struct usb_driver *usb_driver; struct usb_dynids dynids; + unsigned char num_ports; + size_t bulk_in_size; size_t bulk_out_size; -- cgit v1.2.3 From ff0c5703a4b11fca86886e5b7ce40c396bef8381 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 2 Mar 2017 12:51:17 +0100 Subject: USB: serial: allow up to 16 ports per device Raise the arbitrary limit of how many ports a single device can claim from eight to 16. 
This specifically enables the upper eight ports of some mxuport devices. Signed-off-by: Johan Hovold --- include/linux/usb/serial.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index 85b475933848..ee4394d8932f 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -20,7 +20,7 @@ #include /* The maximum number of ports one device can grab at once */ -#define MAX_NUM_PORTS 8 +#define MAX_NUM_PORTS 16 /* parity check flag */ #define RELEVANT_IFLAG(iflag) (iflag & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK)) -- cgit v1.2.3 From 92e6b2c675e1d247317ec41a078f49aaade7f716 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 2 Mar 2017 12:51:19 +0100 Subject: USB: serial: add endpoint sanity check to core Allow drivers to specify a minimum number of endpoints per type, which USB serial core will verify after subdriver probe has returned (where the current alternate setting may have been changed). Signed-off-by: Johan Hovold --- drivers/usb/serial/usb-serial.c | 17 ++++++++++++----- include/linux/usb/serial.h | 9 +++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index ccc729d17a89..747dd414bef9 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -848,21 +848,26 @@ static int usb_serial_probe(struct usb_interface *interface, if (epds->num_bulk_in == 0 || epds->num_bulk_out == 0) { dev_info(ddev, "PL-2303 hack: descriptors matched but endpoints did not\n"); retval = -ENODEV; - kfree(epds); - goto err_put_serial; + goto err_free_epds; } } /* END HORRIBLE HACK FOR PL2303 */ #endif - + if (epds->num_bulk_in < type->num_bulk_in || + epds->num_bulk_out < type->num_bulk_out || + epds->num_interrupt_in < type->num_interrupt_in || + epds->num_interrupt_out < type->num_interrupt_out) { + dev_err(ddev, "required endpoints missing\n"); + retval = -ENODEV; + goto err_free_epds; + } #ifdef CONFIG_USB_SERIAL_GENERIC if (type == &usb_serial_generic_device) { num_ports = epds->num_bulk_out; if (num_ports == 0) { dev_err(ddev, "Generic device with no bulk out, not allowed.\n"); retval = -EIO; - kfree(epds); - goto err_put_serial; + goto err_free_epds; } dev_info(ddev, "The \"generic\" usb-serial driver is only for testing and one-off prototypes.\n"); dev_info(ddev, "Tell linux-usb@vger.kernel.org to add your device to a proper driver.\n"); @@ -1085,6 +1090,8 @@ exit: probe_error: retval = -EIO; +err_free_epds: + kfree(epds); err_put_serial: usb_serial_put(serial); err_put_module: diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index ee4394d8932f..f1b8a8493762 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -188,6 +188,10 @@ static inline void usb_set_serial_data(struct usb_serial *serial, void *data) * @id_table: pointer to a list of usb_device_id structures that define all * of the devices this structure can support. * @num_ports: the number of different ports this device will have. 
+ * @num_bulk_in: minimum number of bulk-in endpoints + * @num_bulk_out: minimum number of bulk-out endpoints + * @num_interrupt_in: minimum number of interrupt-in endpoints + * @num_interrupt_out: minimum number of interrupt-out endpoints * @bulk_in_size: minimum number of bytes to allocate for bulk-in buffer * (0 = end-point size) * @bulk_out_size: bytes to allocate for bulk-out buffer (0 = end-point size) @@ -235,6 +239,11 @@ struct usb_serial_driver { unsigned char num_ports; + unsigned char num_bulk_in; + unsigned char num_bulk_out; + unsigned char num_interrupt_in; + unsigned char num_interrupt_out; + size_t bulk_in_size; size_t bulk_out_size; -- cgit v1.2.3 From 81ed18ab3098b6519274545e80a29caacb77d160 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 15 Mar 2017 18:26:42 -0700 Subject: bpf: add helper inlining infra and optimize map_array lookup Optimize bpf_call -> bpf_map_lookup_elem() -> array_map_lookup_elem() into a sequence of bpf instructions. When JIT is on the sequence of bpf instructions is the sequence of native cpu instructions with significantly faster performance than indirect call and two function's prologue/epilogue. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/linux/bpf_verifier.h | 5 ++++- include/linux/filter.h | 10 ++++++++++ kernel/bpf/arraymap.c | 29 +++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 36 +++++++++++++++++++++++++++++++++--- 5 files changed, 77 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 909fc033173a..da8c64ca8dc9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -35,6 +35,7 @@ struct bpf_map_ops { void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); void (*map_fd_put_ptr)(void *ptr); + u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); }; struct bpf_map { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index a13b031dc6b8..5efb4db44e1e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -66,7 +66,10 @@ struct bpf_verifier_state_list { }; struct bpf_insn_aux_data { - enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ + union { + enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ + struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ + }; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ diff --git a/include/linux/filter.h b/include/linux/filter.h index fbf7b39e8103..dffa072b7b79 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -693,6 +693,11 @@ static inline bool bpf_jit_is_ebpf(void) # endif } +static inline bool ebpf_jit_enabled(void) +{ + return bpf_jit_enable && bpf_jit_is_ebpf(); +} + static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) { return fp->jited && bpf_jit_is_ebpf(); @@ -753,6 +758,11 @@ void bpf_prog_kallsyms_del(struct bpf_prog *fp); #else /* CONFIG_BPF_JIT */ +static inline bool ebpf_jit_enabled(void) +{ + return false; +} + static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) { return false; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 6b6f41f0b211..bcf9955fac95 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016,2017 Facebook * * This program is free software; you can redistribute it and/or * modify 
it under the terms of version 2 of the GNU General Public @@ -113,6 +114,33 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + array->elem_size * index; } +/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ +static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_insn *insn = insn_buf; + u32 elem_size = array->elem_size; + const int ret = BPF_REG_0; + const int map_ptr = BPF_REG_1; + const int index = BPF_REG_2; + + *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); + *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, array->map.max_entries, + elem_size == 1 ? 2 : 3); + if (elem_size == 1) { + /* nop */ + } else if (is_power_of_2(elem_size)) { + *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); + } else { + *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); + } + *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); + *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *insn++ = BPF_MOV64_IMM(ret, 0); + return insn - insn_buf; +} + /* Called from eBPF program */ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) { @@ -267,6 +295,7 @@ static const struct bpf_map_ops array_ops = { .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_gen_lookup = array_map_gen_lookup, }; static struct bpf_map_type_list array_type __ro_after_init = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2990fda1c6a5..90bf46787603 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1273,7 +1273,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) } } -static int check_call(struct bpf_verifier_env *env, int func_id) +static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; @@ -1369,6 +1369,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id) } regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; + env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; } else { verbose("unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -2940,7 +2941,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - err = check_call(env, insn->imm); + err = check_call(env, insn->imm, insn_idx); if (err) return err; @@ -3268,6 +3269,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } /* fixup insn->imm field of bpf_call instructions + * and inline eligible helpers as explicit sequence of BPF instructions * * this function is called after eBPF program passed verification */ @@ -3277,7 +3279,10 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; const int insn_cnt = prog->len; - int i; + struct bpf_insn insn_buf[16]; + struct bpf_prog *new_prog; + struct bpf_map *map_ptr; + int i, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { if (insn->code != (BPF_JMP | BPF_CALL)) @@ -3300,6 +3305,31 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + if (ebpf_jit_enabled() && insn->imm == BPF_FUNC_map_lookup_elem) { + map_ptr = env->insn_aux_data[i + delta].map_ptr; + if (!map_ptr->ops->map_gen_lookup) + goto patch_call_imm; + + cnt = 
map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, + cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + + /* keep walking new program and skip insns we just inserted */ + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + +patch_call_imm: fn = prog->aux->ops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions -- cgit v1.2.3 From 2c93e790e8253552227bf9b46a8d49dca3f71b06 Mon Sep 17 00:00:00 2001 From: yuan linyu Date: Sat, 25 Feb 2017 19:20:55 +0800 Subject: usb: add CONFIG_USB_PCI for systems that have both PCI HW and non-PCI based USB HW A lot of embedded system SoCs (e.g. Freescale T2080) have both PCI and USB modules, but the USB module is controlled through registers directly and has no relationship with the PCI module. When you say N here, PCI-related code in the USB drivers will not be built. Signed-off-by: yuan linyu Acked-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/Kconfig | 12 +++++++++++- drivers/usb/Makefile | 2 +- drivers/usb/chipidea/Kconfig | 2 +- drivers/usb/core/Makefile | 2 +- drivers/usb/dwc2/Kconfig | 2 +- drivers/usb/dwc3/Kconfig | 2 +- drivers/usb/gadget/udc/Kconfig | 8 ++++---- drivers/usb/gadget/udc/bdc/Kconfig | 2 +- drivers/usb/gadget/udc/net2272.c | 8 ++++---- drivers/usb/gadget/udc/net2272.h | 2 +- drivers/usb/host/Kconfig | 10 +++++----- drivers/usb/host/Makefile | 4 +--- drivers/usb/host/ehci-dbg.c | 2 +- drivers/usb/host/ohci-hcd.c | 2 +- drivers/usb/host/ohci.h | 2 +- drivers/usb/host/pci-quirks.h | 4 ++-- drivers/usb/host/uhci-hcd.c | 2 +- drivers/usb/host/uhci-hcd.h | 2 +- drivers/usb/host/xhci.c | 2 +- drivers/usb/isp1760/isp1760-if.c | 8 ++++---- include/linux/usb/hcd.h | 4 ++-- 21 files changed, 46 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/Kconfig b/drivers/usb/Kconfig index fbe493d44e81..aba6ebd8dedf 100644 --- a/drivers/usb/Kconfig +++ b/drivers/usb/Kconfig @@ -35,7 +35,6 @@ config USB_COMMON config USB_ARCH_HAS_HCD def_bool y -# ARM SA1111 chips have a non-PCI based "OHCI-compatible" USB host interface. config USB tristate "Support for Host-side USB" depends on USB_ARCH_HAS_HCD @@ -73,6 +72,17 @@ config USB To compile this driver as a module, choose M here: the module will be called usbcore. +config USB_PCI + bool "PCI based USB host interface" + depends on PCI + default y + ---help--- + A lot of embedded system SoCs (e.g. Freescale T2080) have both + PCI and USB modules, but the USB module is controlled through + registers directly and has no relationship with the PCI module. + + When you say N here, PCI-related code in the USB drivers will not be built.
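As an illustrative aside (not part of the patch; the foo_* names are made up, while usb_hcd_pci_probe() appears in the hcd.h hunk further down), host-controller PCI glue is now guarded by the new symbol rather than by CONFIG_PCI alone:

#ifdef CONFIG_USB_PCI
/* Built only when the USB host stack actually uses PCI */
static int foo_hcd_pci_probe(struct pci_dev *dev,
			     const struct pci_device_id *id)
{
	return usb_hcd_pci_probe(dev, id);
}
#endif /* CONFIG_USB_PCI */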
+ if USB source "drivers/usb/core/Kconfig" diff --git a/drivers/usb/Makefile b/drivers/usb/Makefile index 7791af6c102c..4e1cf090fd20 100644 --- a/drivers/usb/Makefile +++ b/drivers/usb/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_USB_ISP1760) += isp1760/ obj-$(CONFIG_USB_MON) += mon/ obj-$(CONFIG_USB_MTU3) += mtu3/ -obj-$(CONFIG_PCI) += host/ +obj-$(CONFIG_USB_PCI) += host/ obj-$(CONFIG_USB_EHCI_HCD) += host/ obj-$(CONFIG_USB_ISP116X_HCD) += host/ obj-$(CONFIG_USB_OHCI_HCD) += host/ diff --git a/drivers/usb/chipidea/Kconfig b/drivers/usb/chipidea/Kconfig index fc96f5cdcb5c..51f4157bbecf 100644 --- a/drivers/usb/chipidea/Kconfig +++ b/drivers/usb/chipidea/Kconfig @@ -20,7 +20,7 @@ config USB_CHIPIDEA_OF config USB_CHIPIDEA_PCI tristate - depends on PCI + depends on USB_PCI depends on NOP_USB_XCEIV default USB_CHIPIDEA diff --git a/drivers/usb/core/Makefile b/drivers/usb/core/Makefile index b99b871c4b9d..250ec1d662d9 100644 --- a/drivers/usb/core/Makefile +++ b/drivers/usb/core/Makefile @@ -8,7 +8,7 @@ usbcore-y += devio.o notify.o generic.o quirks.o devices.o usbcore-y += port.o usbcore-$(CONFIG_OF) += of.o -usbcore-$(CONFIG_PCI) += hcd-pci.o +usbcore-$(CONFIG_USB_PCI) += hcd-pci.o usbcore-$(CONFIG_ACPI) += usb-acpi.o obj-$(CONFIG_USB) += usbcore.o diff --git a/drivers/usb/dwc2/Kconfig b/drivers/usb/dwc2/Kconfig index e838701d6dd5..b6a495e98fd8 100644 --- a/drivers/usb/dwc2/Kconfig +++ b/drivers/usb/dwc2/Kconfig @@ -54,7 +54,7 @@ endchoice config USB_DWC2_PCI tristate "DWC2 PCI" - depends on PCI + depends on USB_PCI depends on USB_GADGET || !USB_GADGET default n select NOP_USB_XCEIV diff --git a/drivers/usb/dwc3/Kconfig b/drivers/usb/dwc3/Kconfig index c5aa235863e8..4c9e56d8776a 100644 --- a/drivers/usb/dwc3/Kconfig +++ b/drivers/usb/dwc3/Kconfig @@ -70,7 +70,7 @@ config USB_DWC3_EXYNOS config USB_DWC3_PCI tristate "PCIe-based Platforms" - depends on PCI && ACPI + depends on USB_PCI && ACPI default USB_DWC3 help If you're using the DesignWare Core IP with a PCIe, please say diff --git a/drivers/usb/gadget/udc/Kconfig b/drivers/usb/gadget/udc/Kconfig index 4b69f28a9af9..c6cc9d3270ac 100644 --- a/drivers/usb/gadget/udc/Kconfig +++ b/drivers/usb/gadget/udc/Kconfig @@ -277,7 +277,7 @@ source "drivers/usb/gadget/udc/bdc/Kconfig" config USB_AMD5536UDC tristate "AMD5536 UDC" - depends on PCI + depends on USB_PCI help The AMD5536 UDC is part of the AMD Geode CS5536, an x86 southbridge. It is a USB Highspeed DMA capable USB device controller. Beside ep0 @@ -327,7 +327,7 @@ config USB_NET2272_DMA config USB_NET2280 tristate "NetChip NET228x / PLX USB3x8x" - depends on PCI + depends on USB_PCI help NetChip 2280 / 2282 is a PCI based USB peripheral controller which supports both full and high speed USB 2.0 data transfers. @@ -352,7 +352,7 @@ config USB_NET2280 config USB_GOKU tristate "Toshiba TC86C001 'Goku-S'" - depends on PCI + depends on USB_PCI help The Toshiba TC86C001 is a PCI device which includes controllers for full speed USB devices, IDE, I2C, SIO, plus a USB host (OHCI). @@ -366,7 +366,7 @@ config USB_GOKU config USB_EG20T tristate "Intel QUARK X1000/EG20T PCH/LAPIS Semiconductor IOH(ML7213/ML7831) UDC" - depends on PCI + depends on USB_PCI help This is a USB device driver for EG20T PCH. 
EG20T PCH is the platform controller hub that is used in Intel's diff --git a/drivers/usb/gadget/udc/bdc/Kconfig b/drivers/usb/gadget/udc/bdc/Kconfig index 0d7b8c9f72fd..eb8b55392360 100644 --- a/drivers/usb/gadget/udc/bdc/Kconfig +++ b/drivers/usb/gadget/udc/bdc/Kconfig @@ -14,7 +14,7 @@ if USB_BDC_UDC comment "Platform Support" config USB_BDC_PCI tristate "BDC support for PCIe based platforms" - depends on PCI + depends on USB_PCI default USB_BDC_UDC help Enable support for platforms which have BDC connected through PCIe, such as Lego3 FPGA platform. diff --git a/drivers/usb/gadget/udc/net2272.c b/drivers/usb/gadget/udc/net2272.c index 7dc0102abdfe..8f85a51bd2b3 100644 --- a/drivers/usb/gadget/udc/net2272.c +++ b/drivers/usb/gadget/udc/net2272.c @@ -653,7 +653,7 @@ net2272_request_dma(struct net2272 *dev, unsigned ep, u32 buf, dev->dma_busy = 1; /* initialize platform's dma */ -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI /* NET2272 addr, buffer addr, length, etc. */ switch (dev->dev_id) { case PCI_DEVICE_ID_RDK1: @@ -701,7 +701,7 @@ static void net2272_start_dma(struct net2272 *dev) { /* start platform's dma controller */ -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI switch (dev->dev_id) { case PCI_DEVICE_ID_RDK1: writeb((1 << CHANNEL_ENABLE) | (1 << CHANNEL_START), @@ -797,7 +797,7 @@ net2272_kick_dma(struct net2272_ep *ep, struct net2272_request *req) static void net2272_cancel_dma(struct net2272 *dev) { -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI switch (dev->dev_id) { case PCI_DEVICE_ID_RDK1: writeb(0, dev->rdk1.plx9054_base_addr + DMACSR0); @@ -2306,7 +2306,7 @@ err_add_udc: return ret; } -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI /* * wrap this driver around the specified device, but diff --git a/drivers/usb/gadget/udc/net2272.h b/drivers/usb/gadget/udc/net2272.h index 127ab03fcde3..69bc9c3c6ce4 100644 --- a/drivers/usb/gadget/udc/net2272.h +++ b/drivers/usb/gadget/udc/net2272.h @@ -472,7 +472,7 @@ struct net2272 { unsigned int base_shift; u16 __iomem *base_addr; union { -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI struct { void __iomem *plx9054_base_addr; void __iomem *epld_base_addr; diff --git a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig index 407d947b34ea..2b2eecd4c11d 100644 --- a/drivers/usb/host/Kconfig +++ b/drivers/usb/host/Kconfig @@ -30,7 +30,7 @@ if USB_XHCI_HCD config USB_XHCI_PCI tristate - depends on PCI + depends on USB_PCI default y config USB_XHCI_PLATFORM @@ -139,7 +139,7 @@ if USB_EHCI_HCD config USB_EHCI_PCI tristate - depends on PCI + depends on USB_PCI default y config USB_EHCI_HCD_PMC_MSP @@ -525,7 +525,7 @@ config USB_OHCI_HCD_PPC_OF config USB_OHCI_HCD_PCI tristate "OHCI support for PCI-bus USB controllers" - depends on PCI + depends on USB_PCI default y select USB_OHCI_LITTLE_ENDIAN ---help--- @@ -606,7 +606,7 @@ endif # USB_OHCI_HCD config USB_UHCI_HCD tristate "UHCI HCD (most Intel and VIA) support" - depends on PCI || USB_UHCI_SUPPORT_NON_PCI_HC + depends on USB_PCI || USB_UHCI_SUPPORT_NON_PCI_HC ---help--- The Universal Host Controller Interface is a standard by Intel for accessing the USB hardware in the PC (which is also called the USB @@ -739,7 +739,7 @@ config USB_RENESAS_USBHS_HCD config USB_WHCI_HCD tristate "Wireless USB Host Controller Interface (WHCI) driver" - depends on PCI && USB && UWB + depends on USB_PCI && USB && UWB select USB_WUSB select UWB_WHCI help diff --git a/drivers/usb/host/Makefile b/drivers/usb/host/Makefile index 2644537b7bcf..c77b0a38557b 100644 --- a/drivers/usb/host/Makefile +++ b/drivers/usb/host/Makefile @@ -27,9 +27,7 @@ 
endif obj-$(CONFIG_USB_WHCI_HCD) += whci/ -ifneq ($(CONFIG_USB), ) - obj-$(CONFIG_PCI) += pci-quirks.o -endif +obj-$(CONFIG_USB_PCI) += pci-quirks.o obj-$(CONFIG_USB_EHCI_HCD) += ehci-hcd.o obj-$(CONFIG_USB_EHCI_PCI) += ehci-pci.o diff --git a/drivers/usb/host/ehci-dbg.c b/drivers/usb/host/ehci-dbg.c index 1a2614aae42c..cbb9b8e12c3c 100644 --- a/drivers/usb/host/ehci-dbg.c +++ b/drivers/usb/host/ehci-dbg.c @@ -803,7 +803,7 @@ static ssize_t fill_registers_buffer(struct debug_buffer *buf) size -= temp; next += temp; -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI /* EHCI 0.96 and later may have "extended capabilities" */ if (dev_is_pci(hcd->self.controller)) { struct pci_dev *pdev; diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c index b6daf2e69989..da66ad57e370 100644 --- a/drivers/usb/host/ohci-hcd.c +++ b/drivers/usb/host/ohci-hcd.c @@ -994,7 +994,7 @@ static void ohci_stop (struct usb_hcd *hcd) /*-------------------------------------------------------------------------*/ -#if defined(CONFIG_PM) || defined(CONFIG_PCI) +#if defined(CONFIG_PM) || defined(CONFIG_USB_PCI) /* must not be called from interrupt context */ int ohci_restart(struct ohci_hcd *ohci) diff --git a/drivers/usb/host/ohci.h b/drivers/usb/host/ohci.h index 37f1725e7a46..382444c8b44c 100644 --- a/drivers/usb/host/ohci.h +++ b/drivers/usb/host/ohci.h @@ -438,7 +438,7 @@ struct ohci_hcd { }; -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI static inline int quirk_nec(struct ohci_hcd *ohci) { return ohci->flags & OHCI_QUIRK_NEC; diff --git a/drivers/usb/host/pci-quirks.h b/drivers/usb/host/pci-quirks.h index c622ddf21c94..0222195bd5b0 100644 --- a/drivers/usb/host/pci-quirks.h +++ b/drivers/usb/host/pci-quirks.h @@ -1,7 +1,7 @@ #ifndef __LINUX_USB_PCI_QUIRKS_H #define __LINUX_USB_PCI_QUIRKS_H -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI void uhci_reset_hc(struct pci_dev *pdev, unsigned long base); int uhci_check_and_reset_hc(struct pci_dev *pdev, unsigned long base); int usb_amd_find_chipset_info(void); @@ -21,6 +21,6 @@ static inline void usb_amd_quirk_pll_enable(void) {} static inline void usb_amd_dev_put(void) {} static inline void usb_disable_xhci_ports(struct pci_dev *xhci_pdev) {} static inline void sb800_prefetch(struct device *dev, int on) {} -#endif /* CONFIG_PCI */ +#endif /* CONFIG_USB_PCI */ #endif /* __LINUX_USB_PCI_QUIRKS_H */ diff --git a/drivers/usb/host/uhci-hcd.c b/drivers/usb/host/uhci-hcd.c index 683098afa93e..94b150196d4f 100644 --- a/drivers/usb/host/uhci-hcd.c +++ b/drivers/usb/host/uhci-hcd.c @@ -837,7 +837,7 @@ static int uhci_count_ports(struct usb_hcd *hcd) static const char hcd_name[] = "uhci_hcd"; -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI #include "uhci-pci.c" #define PCI_DRIVER uhci_pci_driver #endif diff --git a/drivers/usb/host/uhci-hcd.h b/drivers/usb/host/uhci-hcd.h index 6f986d82472d..7fa318a3091d 100644 --- a/drivers/usb/host/uhci-hcd.h +++ b/drivers/usb/host/uhci-hcd.h @@ -530,7 +530,7 @@ static inline void uhci_writeb(const struct uhci_hcd *uhci, u8 val, int reg) #else /* Support non-PCI host controllers */ -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI /* Support PCI and non-PCI host controllers */ #define uhci_has_pci_registers(u) ((u)->io_addr != 0) #else diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 50aee8b7718b..bb0becf8561c 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -216,7 +216,7 @@ int xhci_reset(struct xhci_hcd *xhci) return ret; } -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI static int xhci_free_msi(struct xhci_hcd *xhci) { int i; 
diff --git a/drivers/usb/isp1760/isp1760-if.c b/drivers/usb/isp1760/isp1760-if.c index 79205b31e4a9..bc68bbab7fa1 100644 --- a/drivers/usb/isp1760/isp1760-if.c +++ b/drivers/usb/isp1760/isp1760-if.c @@ -21,11 +21,11 @@ #include "isp1760-core.h" #include "isp1760-regs.h" -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI #include #endif -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI static int isp1761_pci_init(struct pci_dev *dev) { resource_size_t mem_start; @@ -286,7 +286,7 @@ static int __init isp1760_init(void) ret = platform_driver_register(&isp1760_plat_driver); if (!ret) any_ret = 0; -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI ret = pci_register_driver(&isp1761_pci_driver); if (!ret) any_ret = 0; @@ -301,7 +301,7 @@ module_init(isp1760_init); static void __exit isp1760_exit(void) { platform_driver_unregister(&isp1760_plat_driver); -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI pci_unregister_driver(&isp1761_pci_driver); #endif isp1760_deinit_kmem_cache(); diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 40edf6a8533e..dff130151235 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -453,7 +453,7 @@ extern int usb_hcd_find_raw_port_number(struct usb_hcd *hcd, int port1); struct platform_device; extern void usb_hcd_platform_shutdown(struct platform_device *dev); -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI struct pci_dev; struct pci_device_id; extern int usb_hcd_pci_probe(struct pci_dev *dev, @@ -466,7 +466,7 @@ extern int usb_hcd_amd_remote_wakeup_quirk(struct pci_dev *dev); #ifdef CONFIG_PM extern const struct dev_pm_ops usb_hcd_pci_pm_ops; #endif -#endif /* CONFIG_PCI */ +#endif /* CONFIG_USB_PCI */ /* pci-ish (pdev null is ok) buffer alloc/mapping support */ void usb_init_pool_max(void); -- cgit v1.2.3 From 5095cb89c62acc78b4cfaeb9a4072979d010510a Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Tue, 21 Feb 2017 19:59:47 +0900 Subject: usb: of: add functions to bind a companion controller EHCI controllers will have a companion controller. However, on the platform bus it was difficult to bind them with the previous code. So, this patch adds helper functions to bind them using a "companion" property. Signed-off-by: Yoshihiro Shimoda Acked-by: Rob Herring Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/usb/generic.txt | 1 + drivers/usb/core/of.c | 23 +++++++++++++++++++++++ include/linux/usb/of.h | 5 +++++ 3 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/usb/generic.txt b/Documentation/devicetree/bindings/usb/generic.txt index bfadeb1c3bab..0a74ab8dfdc2 100644 --- a/Documentation/devicetree/bindings/usb/generic.txt +++ b/Documentation/devicetree/bindings/usb/generic.txt @@ -22,6 +22,7 @@ Optional properties: property is used if any real OTG features(HNP/SRP/ADP) is enabled, if ADP is required, otg-rev should be 0x0200 or above. + - companion: phandle of a companion - hnp-disable: tells OTG controllers we want to disable OTG HNP, normally HNP is the basic function of real OTG except you want it to be a srp-capable only B device.
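A minimal usage sketch (illustrative, not part of the patch; foo_ehci_probe() is hypothetical, while usb_of_get_companion_dev() is the helper added in the hunks below):

static int foo_ehci_probe(struct platform_device *pdev)
{
	/* Resolve the "companion" phandle to its platform device, if any */
	struct device *companion = usb_of_get_companion_dev(&pdev->dev);

	if (companion)
		dev_info(&pdev->dev, "bound companion %s\n",
			 dev_name(companion));

	/* ... usual EHCI hcd setup continues here ... */
	return 0;
}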
diff --git a/drivers/usb/core/of.c b/drivers/usb/core/of.c index 3de4f8873984..d787f195a9a6 100644 --- a/drivers/usb/core/of.c +++ b/drivers/usb/core/of.c @@ -18,6 +18,7 @@ */ #include +#include #include /** @@ -46,3 +47,25 @@ struct device_node *usb_of_get_child_node(struct device_node *parent, } EXPORT_SYMBOL_GPL(usb_of_get_child_node); +/** + * usb_of_get_companion_dev - Find the companion device + * @dev: the device pointer to find a companion + * + * Find the companion device from platform bus. + * + * Return: On success, a pointer to the companion device, %NULL on failure. + */ +struct device *usb_of_get_companion_dev(struct device *dev) +{ + struct device_node *node; + struct platform_device *pdev = NULL; + + node = of_parse_phandle(dev->of_node, "companion", 0); + if (node) + pdev = of_find_device_by_node(node); + + of_node_put(node); + + return pdev ? &pdev->dev : NULL; +} +EXPORT_SYMBOL_GPL(usb_of_get_companion_dev); diff --git a/include/linux/usb/of.h b/include/linux/usb/of.h index 5ff9032ee1b4..4031f47629ec 100644 --- a/include/linux/usb/of.h +++ b/include/linux/usb/of.h @@ -18,6 +18,7 @@ int of_usb_update_otg_caps(struct device_node *np, struct usb_otg_caps *otg_caps); struct device_node *usb_of_get_child_node(struct device_node *parent, int portnum); +struct device *usb_of_get_companion_dev(struct device *dev); #else static inline enum usb_dr_mode of_usb_get_dr_mode_by_phy(struct device_node *np, int arg0) @@ -38,6 +39,10 @@ static inline struct device_node *usb_of_get_child_node { return NULL; } +static inline struct device *usb_of_get_companion_dev(struct device *dev) +{ + return NULL; +} #endif #if IS_ENABLED(CONFIG_OF) && IS_ENABLED(CONFIG_USB_SUPPORT) -- cgit v1.2.3 From 9d1d994d33fe4863c8caeeaac264664815f4c321 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Thu, 16 Mar 2017 08:14:16 -0700 Subject: linux/serdev.h: Replace 'ctrl->serdev' with 'serdev' Replace 'ctrl->serdev' with 'serdev' in serdev_controller_write_wakeup() and serdev_controller_receive_buf(). Cc: Rob Herring Cc: cphealy@gmail.com Cc: linux-serial@vger.kernel.org Cc: linux-kernel@vger.kernel.org Acked-by: Rob Herring Signed-off-by: Andrey Smirnov Signed-off-by: Greg Kroah-Hartman --- include/linux/serdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index 9519da6253a8..5176cdc2057f 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -165,7 +165,7 @@ static inline void serdev_controller_write_wakeup(struct serdev_controller *ctrl if (!serdev || !serdev->ops->write_wakeup) return; - serdev->ops->write_wakeup(ctrl->serdev); + serdev->ops->write_wakeup(serdev); } static inline int serdev_controller_receive_buf(struct serdev_controller *ctrl, @@ -177,7 +177,7 @@ static inline int serdev_controller_receive_buf(struct serdev_controller *ctrl, if (!serdev || !serdev->ops->receive_buf) return -EINVAL; - return serdev->ops->receive_buf(ctrl->serdev, data, count); + return serdev->ops->receive_buf(serdev, data, count); } #if IS_ENABLED(CONFIG_SERIAL_DEV_BUS) -- cgit v1.2.3 From 0f4f0c8ff1da9171bca0dc01ce5551e8b6d2f0f3 Mon Sep 17 00:00:00 2001 From: Moritz Fischer Date: Mon, 27 Feb 2017 09:19:00 -0600 Subject: fpga: Add flag to indicate bitstream needs decrypting Add a flag that is passed to the write_init() callback, indicating that the bitstream is encrypted. The low-level driver will deal with the flag, or return an error, if encrypted bitstreams are not supported. 
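An illustrative sketch of the intended low-level check (not from the patch; the foo_* helpers are hypothetical and the write_init() callback wiring is elided, only the FPGA_MGR_ENCRYPTED_BITSTREAM flag comes from the hunk below):

/* Called early from a manager's write_init() implementation */
static int foo_validate_flags(struct fpga_manager *mgr, u32 flags)
{
	/* Refuse encrypted images when the device has no decryptor */
	if ((flags & FPGA_MGR_ENCRYPTED_BITSTREAM) &&
	    !foo_hw_has_decryptor(mgr))
		return -EINVAL;

	return 0;
}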
Signed-off-by: Moritz Fischer Acked-by: Michal Simek Signed-off-by: Alan Tull Signed-off-by: Greg Kroah-Hartman --- include/linux/fpga/fpga-mgr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fpga/fpga-mgr.h b/include/linux/fpga/fpga-mgr.h index 57beb5d09bfc..e2ef94fd37af 100644 --- a/include/linux/fpga/fpga-mgr.h +++ b/include/linux/fpga/fpga-mgr.h @@ -70,6 +70,7 @@ enum fpga_mgr_states { */ #define FPGA_MGR_PARTIAL_RECONFIG BIT(0) #define FPGA_MGR_EXTERNAL_CONFIG BIT(1) +#define FPGA_MGR_ENCRYPTED_BITSTREAM BIT(2) /** * struct fpga_image_info - information specific to a FPGA image -- cgit v1.2.3 From 8b1f91fb4c1a8a860b8edc0c383821b2ff8a1ece Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sat, 4 Mar 2017 18:27:12 -0700 Subject: vmbus: remove useless return's No need for empty return at end of void function Signed-off-by: Stephen Hemminger Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- drivers/hv/hv_balloon.c | 2 -- drivers/hv/hv_fcopy.c | 2 -- drivers/hv/hv_kvp.c | 2 -- drivers/hv/hv_snapshot.c | 2 -- drivers/hv/ring_buffer.c | 2 -- drivers/hv/vmbus_drv.c | 2 -- include/linux/hyperv.h | 2 -- 7 files changed, 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 5fd03e59cee5..f5728deff893 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -722,8 +722,6 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, 5*HZ); post_status(&dm_device); } - - return; } static void hv_online_page(struct page *pg) diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c index 9aee6014339d..3ce7559d7b41 100644 --- a/drivers/hv/hv_fcopy.c +++ b/drivers/hv/hv_fcopy.c @@ -187,8 +187,6 @@ static void fcopy_send_data(struct work_struct *dummy) } } kfree(smsg_out); - - return; } /* diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index de263712e247..a65b7f88d7aa 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -484,8 +484,6 @@ kvp_send_key(struct work_struct *dummy) } kfree(message); - - return; } /* diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c index bcc03f0748d6..216d02277759 100644 --- a/drivers/hv/hv_snapshot.c +++ b/drivers/hv/hv_snapshot.c @@ -213,8 +213,6 @@ static void vss_send_op(void) } kfree(vss_msg); - - return; } static void vss_handle_request(struct work_struct *dummy) diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index 87799e81af97..d0ff5b41161a 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -73,8 +73,6 @@ static void hv_signal_on_write(u32 old_write, struct vmbus_channel *channel) */ if (old_write == READ_ONCE(rbi->ring_buffer->read_index)) vmbus_setevent(channel); - - return; } /* Get the next write location for the specified ring buffer. 
*/ diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index da6b59ba5940..5ca5004861c6 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -787,8 +787,6 @@ static void vmbus_shutdown(struct device *child_device) if (drv->shutdown) drv->shutdown(dev); - - return; } diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 62bbf3c1aa4a..2b1ed66824be 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1500,8 +1500,6 @@ static inline void hv_signal_on_read(struct vmbus_channel *channel) cached_write_sz = hv_get_cached_bytes_to_write(rbi); if (cached_write_sz < pending_sz) vmbus_setevent(channel); - - return; } static inline void -- cgit v1.2.3 From 2a9d7de2038e87bb2a1085ac73c4246c260263f0 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sat, 4 Mar 2017 18:27:17 -0700 Subject: vmbus: cleanup header file style Minor changes to align hyper-v vmbus include files with current linux kernel style. Signed-off-by: Stephen Hemminger Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- drivers/hv/hyperv_vmbus.h | 16 ++++++++-------- include/linux/hyperv.h | 12 ++++++------ 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index b552c3a4dd3c..a69b52de8d56 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -218,8 +218,8 @@ struct hv_per_cpu_context { struct hv_context { /* We only support running on top of Hyper-V - * So at this point this really can only contain the Hyper-V ID - */ + * So at this point this really can only contain the Hyper-V ID + */ u64 guestid; void *tsc_page; @@ -403,17 +403,17 @@ int vmbus_post_msg(void *buffer, size_t buflen, bool can_sleep); void vmbus_on_event(unsigned long data); void vmbus_on_msg_dpc(unsigned long data); -int hv_kvp_init(struct hv_util_service *); +int hv_kvp_init(struct hv_util_service *srv); void hv_kvp_deinit(void); -void hv_kvp_onchannelcallback(void *); +void hv_kvp_onchannelcallback(void *context); -int hv_vss_init(struct hv_util_service *); +int hv_vss_init(struct hv_util_service *srv); void hv_vss_deinit(void); -void hv_vss_onchannelcallback(void *); +void hv_vss_onchannelcallback(void *context); -int hv_fcopy_init(struct hv_util_service *); +int hv_fcopy_init(struct hv_util_service *srv); void hv_fcopy_deinit(void); -void hv_fcopy_onchannelcallback(void *); +void hv_fcopy_onchannelcallback(void *context); void vmbus_initiate_unload(bool crash); static inline void hv_poll_channel(struct vmbus_channel *channel, diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 2b1ed66824be..de9b80ff6698 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -524,10 +524,10 @@ struct vmbus_channel_open_channel { u32 target_vp; /* - * The upstream ring buffer begins at offset zero in the memory - * described by RingBufferGpadlHandle. The downstream ring buffer - * follows it at this offset (in pages). - */ + * The upstream ring buffer begins at offset zero in the memory + * described by RingBufferGpadlHandle. The downstream ring buffer + * follows it at this offset (in pages). + */ u32 downstream_ringbuffer_pageoffset; /* User-specific data to be passed along to the server endpoint. 
*/ @@ -1006,7 +1006,7 @@ extern int vmbus_open(struct vmbus_channel *channel, u32 recv_ringbuffersize, void *userdata, u32 userdatalen, - void(*onchannel_callback)(void *context), + void (*onchannel_callback)(void *context), void *context); extern void vmbus_close(struct vmbus_channel *channel); @@ -1421,7 +1421,7 @@ struct hyperv_service_callback { char *log_msg; uuid_le data; struct vmbus_channel *channel; - void (*callback) (void *context); + void (*callback)(void *context); }; #define MAX_SRV_VER 0x7ffffff -- cgit v1.2.3 From 4827ee1dca5691c9fc568883170a568db94f9b38 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sat, 4 Mar 2017 18:27:18 -0700 Subject: vmbus: expose debug info for drivers Allow driver to get debug information about state of the ring. Signed-off-by: Stephen Hemminger Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman --- drivers/hv/hyperv_vmbus.h | 11 ----------- drivers/hv/ring_buffer.c | 1 + include/linux/hyperv.h | 17 +++++++++++++++++ 3 files changed, 18 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index a69b52de8d56..6113e915c50e 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -248,14 +248,6 @@ struct hv_context { extern struct hv_context hv_context; -struct hv_ring_buffer_debug_info { - u32 current_interrupt_mask; - u32 current_read_index; - u32 current_write_index; - u32 bytes_avail_toread; - u32 bytes_avail_towrite; -}; - /* Hv Interface */ extern int hv_init(void); @@ -289,9 +281,6 @@ int hv_ringbuffer_read(struct vmbus_channel *channel, void *buffer, u32 buflen, u32 *buffer_actual_len, u64 *requestid, bool raw); -void hv_ringbuffer_get_debuginfo(const struct hv_ring_buffer_info *ring_info, - struct hv_ring_buffer_debug_info *debug_info); - /* * Maximum channels is determined by the size of the interrupt page * which is PAGE_SIZE. 1/2 of PAGE_SIZE is for send endpoint interrupt diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index 8a249740b4b4..cfacca566e3f 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -206,6 +206,7 @@ void hv_ringbuffer_get_debuginfo(const struct hv_ring_buffer_info *ring_info, ring_info->ring_buffer->interrupt_mask; } } +EXPORT_SYMBOL_GPL(hv_ringbuffer_get_debuginfo); /* Initialize the ring buffer. 
*/ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info,
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index de9b80ff6698..1fa727fe5f93 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -491,6 +491,12 @@ struct vmbus_channel_rescind_offer { u32 child_relid; } __packed; +static inline u32 +hv_ringbuffer_pending_size(const struct hv_ring_buffer_info *rbi) +{ + return rbi->ring_buffer->pending_send_sz; +} + /* * Request Offer -- no parameters, SynIC message contains the partition ID * Set Snoop -- no parameters, SynIC message contains the partition ID @@ -1148,6 +1154,17 @@ static inline void *hv_get_drvdata(struct hv_device *dev) return dev_get_drvdata(&dev->device); } +struct hv_ring_buffer_debug_info { + u32 current_interrupt_mask; + u32 current_read_index; + u32 current_write_index; + u32 bytes_avail_toread; + u32 bytes_avail_towrite; +}; + +void hv_ringbuffer_get_debuginfo(const struct hv_ring_buffer_info *ring_info, + struct hv_ring_buffer_debug_info *debug_info); + /* Vmbus interface */ #define vmbus_driver_register(driver) \ __vmbus_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) -- cgit v1.2.3
From b5bc980a4929bb2a449fef3e0b7131466815d0b1 Mon Sep 17 00:00:00 2001 From: Martyn Welch Date: Sat, 4 Mar 2017 00:34:29 +0000 Subject: docs: Add kernel-doc comments to VME driver API Add kernel-doc comments to the VME driver API and structures. This documentation will be integrated into the RST documentation in a later patch. Signed-off-by: Martyn Welch Signed-off-by: Greg Kroah-Hartman --- drivers/vme/vme.c | 469 ++++++++++++++++++++++++++++++++++++++++++++++++---- include/linux/vme.h | 12 +- 2 files changed, 449 insertions(+), 32 deletions(-) (limited to 'include/linux')
diff --git a/drivers/vme/vme.c b/drivers/vme/vme.c index 0035cf79760a..6a3ead42aba8 100644 --- a/drivers/vme/vme.c +++ b/drivers/vme/vme.c @@ -76,9 +76,16 @@ static struct vme_bridge *find_bridge(struct vme_resource *resource) } } -/* +/** + * vme_alloc_consistent - Allocate contiguous memory. + * @resource: Pointer to VME resource. + * @size: Size of allocation required. + * @dma: Pointer to variable to store physical address of allocation. + * * Allocate a contiguous block of memory for use by the driver. This is used to * create the buffers for the slave windows. + * + * Return: Virtual address of allocation on success, NULL on failure. */ void *vme_alloc_consistent(struct vme_resource *resource, size_t size, dma_addr_t *dma) @@ -111,8 +118,14 @@ void *vme_alloc_consistent(struct vme_resource *resource, size_t size, } EXPORT_SYMBOL(vme_alloc_consistent); -/* - * Free previously allocated contiguous block of memory. +/** + * vme_free_consistent - Free previously allocated memory. + * @resource: Pointer to VME resource. + * @size: Size of allocation to free. + * @vaddr: Virtual address of allocation. + * @dma: Physical address of allocation. + * + * Free previously allocated block of contiguous memory. */ void vme_free_consistent(struct vme_resource *resource, size_t size, void *vaddr, dma_addr_t dma) @@ -145,6 +158,16 @@ void vme_free_consistent(struct vme_resource *resource, size_t size, } EXPORT_SYMBOL(vme_free_consistent); +/** + * vme_get_size - Helper function returning size of a VME window + * @resource: Pointer to VME slave or master resource. + * + * Determine the size of the VME window provided. This is a helper + * function, wrapping the call to vme_master_get or vme_slave_get + * depending on the type of window resource handed to it.
+ * + * Return: Size of the window on success, zero on failure. + */ size_t vme_get_size(struct vme_resource *resource) { int enabled, retval; @@ -259,9 +282,16 @@ static u32 vme_get_aspace(int am) return 0; } -/* - * Request a slave image with specific attributes, return some unique - * identifier. +/** + * vme_slave_request - Request a VME slave window resource. + * @vdev: Pointer to VME device struct vme_dev assigned to driver instance. + * @address: Required VME address space. + * @cycle: Required VME data transfer cycle type. + * + * Request use of a VME window resource capable of being set for the requested + * address space and data transfer cycle. + * + * Return: Pointer to VME resource on success, NULL on failure. */ struct vme_resource *vme_slave_request(struct vme_dev *vdev, u32 address, u32 cycle) @@ -327,6 +357,23 @@ err_bus: } EXPORT_SYMBOL(vme_slave_request); +/** + * vme_slave_set - Set VME slave window configuration. + * @resource: Pointer to VME slave resource. + * @enabled: State to which the window should be configured. + * @vme_base: Base address for the window. + * @size: Size of the VME window. + * @buf_base: Based address of buffer used to provide VME slave window storage. + * @aspace: VME address space for the VME window. + * @cycle: VME data transfer cycle type for the VME window. + * + * Set configuration for provided VME slave window. + * + * Return: Zero on success, -EINVAL if operation is not supported on this + * device, if an invalid resource has been provided or invalid + * attributes are provided. Hardware specific errors may also be + * returned. + */ int vme_slave_set(struct vme_resource *resource, int enabled, unsigned long long vme_base, unsigned long long size, dma_addr_t buf_base, u32 aspace, u32 cycle) @@ -362,6 +409,21 @@ int vme_slave_set(struct vme_resource *resource, int enabled, } EXPORT_SYMBOL(vme_slave_set); +/** + * vme_slave_get - Retrieve VME slave window configuration. + * @resource: Pointer to VME slave resource. + * @enabled: Pointer to variable for storing state. + * @vme_base: Pointer to variable for storing window base address. + * @size: Pointer to variable for storing window size. + * @buf_base: Pointer to variable for storing slave buffer base address. + * @aspace: Pointer to variable for storing VME address space. + * @cycle: Pointer to variable for storing VME data transfer cycle type. + * + * Return configuration for provided VME slave window. + * + * Return: Zero on success, -EINVAL if operation is not supported on this + * device or if an invalid resource has been provided. + */ int vme_slave_get(struct vme_resource *resource, int *enabled, unsigned long long *vme_base, unsigned long long *size, dma_addr_t *buf_base, u32 *aspace, u32 *cycle) @@ -386,6 +448,12 @@ int vme_slave_get(struct vme_resource *resource, int *enabled, } EXPORT_SYMBOL(vme_slave_get); +/** + * vme_slave_free - Free VME slave window + * @resource: Pointer to VME slave resource. + * + * Free the provided slave resource so that it may be reallocated. + */ void vme_slave_free(struct vme_resource *resource) { struct vme_slave_resource *slave_image; @@ -415,9 +483,17 @@ void vme_slave_free(struct vme_resource *resource) } EXPORT_SYMBOL(vme_slave_free); -/* - * Request a master image with specific attributes, return some unique - * identifier. +/** + * vme_master_request - Request a VME master window resource. + * @vdev: Pointer to VME device struct vme_dev assigned to driver instance. + * @address: Required VME address space. 
+ * @cycle: Required VME data transfer cycle type. + * @dwidth: Required VME data transfer width. + * + * Request use of a VME window resource capable of being set for the requested + * address space, data transfer cycle and width. + * + * Return: Pointer to VME resource on success, NULL on failure. */ struct vme_resource *vme_master_request(struct vme_dev *vdev, u32 address, u32 cycle, u32 dwidth) @@ -486,6 +562,23 @@ err_bus: } EXPORT_SYMBOL(vme_master_request);
+/** + * vme_master_set - Set VME master window configuration. + * @resource: Pointer to VME master resource. + * @enabled: State to which the window should be configured. + * @vme_base: Base address for the window. + * @size: Size of the VME window. + * @aspace: VME address space for the VME window. + * @cycle: VME data transfer cycle type for the VME window. + * @dwidth: VME data transfer width for the VME window. + * + * Set configuration for provided VME master window. + * + * Return: Zero on success, -EINVAL if operation is not supported on this + * device, if an invalid resource has been provided or invalid + * attributes are provided. Hardware specific errors may also be + * returned. + */ int vme_master_set(struct vme_resource *resource, int enabled, unsigned long long vme_base, unsigned long long size, u32 aspace, u32 cycle, u32 dwidth) @@ -522,6 +615,21 @@ int vme_master_set(struct vme_resource *resource, int enabled, } EXPORT_SYMBOL(vme_master_set);
+/** + * vme_master_get - Retrieve VME master window configuration. + * @resource: Pointer to VME master resource. + * @enabled: Pointer to variable for storing state. + * @vme_base: Pointer to variable for storing window base address. + * @size: Pointer to variable for storing window size. + * @aspace: Pointer to variable for storing VME address space. + * @cycle: Pointer to variable for storing VME data transfer cycle type. + * @dwidth: Pointer to variable for storing VME data transfer width. + * + * Return configuration for provided VME master window. + * + * Return: Zero on success, -EINVAL if operation is not supported on this + * device or if an invalid resource has been provided. + */ int vme_master_get(struct vme_resource *resource, int *enabled, unsigned long long *vme_base, unsigned long long *size, u32 *aspace, u32 *cycle, u32 *dwidth) @@ -546,8 +654,20 @@ int vme_master_get(struct vme_resource *resource, int *enabled, } EXPORT_SYMBOL(vme_master_get);
-/* - * Read data out of VME space into a buffer. +/** + * vme_master_read - Read data from VME space into a buffer. + * @resource: Pointer to VME master resource. + * @buf: Pointer to buffer where data should be transferred. + * @count: Number of bytes to transfer. + * @offset: Offset into VME master window at which to start transfer. + * + * Perform read of count bytes of data from location on VME bus which maps into + * the VME master window at offset to buf. + * + * Return: Number of bytes read, -EINVAL if resource is not a VME master + * resource or read operation is not supported. -EFAULT returned if + * invalid offset is provided. Hardware specific errors may also be + * returned. */ ssize_t vme_master_read(struct vme_resource *resource, void *buf, size_t count, loff_t offset) @@ -583,8 +703,20 @@ ssize_t vme_master_read(struct vme_resource *resource, void *buf, size_t count, } EXPORT_SYMBOL(vme_master_read);
-/* - * Write data out to VME space from a buffer. +/** + * vme_master_write - Write data out to VME space from a buffer. + * @resource: Pointer to VME master resource.
+ * @buf: Pointer to buffer holding data to transfer. + * @count: Number of bytes to transfer. + * @offset: Offset into VME master window at which to start transfer. + * + * Perform write of count bytes of data from buf to location on VME bus which + * maps into the VME master window at offset. + * + * Return: Number of bytes written, -EINVAL if resource is not a VME master + * resource or write operation is not supported. -EFAULT returned if + * invalid offset is provided. Hardware specific errors may also be + * returned. */ ssize_t vme_master_write(struct vme_resource *resource, void *buf, size_t count, loff_t offset) @@ -619,8 +751,24 @@ ssize_t vme_master_write(struct vme_resource *resource, void *buf, } EXPORT_SYMBOL(vme_master_write); -/* - * Perform RMW cycle to provided location. +/** + * vme_master_rmw - Perform read-modify-write cycle. + * @resource: Pointer to VME master resource. + * @mask: Bits to be compared and swapped in operation. + * @compare: Bits to be compared with data read from offset. + * @swap: Bits to be swapped in data read from offset. + * @offset: Offset into VME master window at which to perform operation. + * + * Perform read-modify-write cycle on provided location: + * - Location on VME bus is read. + * - Bits selected by mask are compared with compare. + * - Where a selected bit matches that in compare and are selected in swap, + * the bit is swapped. + * - Result written back to location on VME bus. + * + * Return: Bytes written on success, -EINVAL if resource is not a VME master + * resource or RMW operation is not supported. Hardware specific + * errors may also be returned. */ unsigned int vme_master_rmw(struct vme_resource *resource, unsigned int mask, unsigned int compare, unsigned int swap, loff_t offset) @@ -644,6 +792,17 @@ unsigned int vme_master_rmw(struct vme_resource *resource, unsigned int mask, } EXPORT_SYMBOL(vme_master_rmw); +/** + * vme_master_mmap - Mmap region of VME master window. + * @resource: Pointer to VME master resource. + * @vma: Pointer to definition of user mapping. + * + * Memory map a region of the VME master window into user space. + * + * Return: Zero on success, -EINVAL if resource is not a VME master + * resource or -EFAULT if map exceeds window size. Other generic mmap + * errors may also be returned. + */ int vme_master_mmap(struct vme_resource *resource, struct vm_area_struct *vma) { struct vme_master_resource *image; @@ -670,6 +829,12 @@ int vme_master_mmap(struct vme_resource *resource, struct vm_area_struct *vma) } EXPORT_SYMBOL(vme_master_mmap); +/** + * vme_master_free - Free VME master window + * @resource: Pointer to VME master resource. + * + * Free the provided master resource so that it may be reallocated. + */ void vme_master_free(struct vme_resource *resource) { struct vme_master_resource *master_image; @@ -699,9 +864,15 @@ void vme_master_free(struct vme_resource *resource) } EXPORT_SYMBOL(vme_master_free); -/* - * Request a DMA controller with specific attributes, return some unique - * identifier. +/** + * vme_dma_request - Request a DMA controller. + * @vdev: Pointer to VME device struct vme_dev assigned to driver instance. + * @route: Required src/destination combination. + * + * Request a VME DMA controller with capability to perform transfers bewteen + * requested source/destination combination. + * + * Return: Pointer to VME DMA resource on success, NULL on failure. 
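+ *
+ * For example (illustrative only), a driver that wants to move data from
+ * PCI memory out onto the VME bus would request:
+ *
+ *	resource = vme_dma_request(vdev, VME_DMA_MEM_TO_VME);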
*/ struct vme_resource *vme_dma_request(struct vme_dev *vdev, u32 route) { @@ -768,8 +939,15 @@ err_bus: } EXPORT_SYMBOL(vme_dma_request); -/* - * Start new list +/** + * vme_new_dma_list - Create new VME DMA list. + * @resource: Pointer to VME DMA resource. + * + * Create a new VME DMA list. It is the responsibility of the user to free + * the list once it is no longer required with vme_dma_list_free(). + * + * Return: Pointer to new VME DMA list, NULL on allocation failure or invalid + * VME DMA resource. */ struct vme_dma_list *vme_new_dma_list(struct vme_resource *resource) { @@ -796,8 +974,16 @@ struct vme_dma_list *vme_new_dma_list(struct vme_resource *resource) } EXPORT_SYMBOL(vme_new_dma_list); -/* - * Create "Pattern" type attributes +/** + * vme_dma_pattern_attribute - Create "Pattern" type VME DMA list attribute. + * @pattern: Value to use used as pattern + * @type: Type of pattern to be written. + * + * Create VME DMA list attribute for pattern generation. It is the + * responsibility of the user to free used attributes using + * vme_dma_free_attribute(). + * + * Return: Pointer to VME DMA attribute, NULL on failure. */ struct vme_dma_attr *vme_dma_pattern_attribute(u32 pattern, u32 type) { @@ -831,8 +1017,15 @@ err_attr: } EXPORT_SYMBOL(vme_dma_pattern_attribute); -/* - * Create "PCI" type attributes +/** + * vme_dma_pci_attribute - Create "PCI" type VME DMA list attribute. + * @address: PCI base address for DMA transfer. + * + * Create VME DMA list attribute pointing to a location on PCI for DMA + * transfers. It is the responsibility of the user to free used attributes + * using vme_dma_free_attribute(). + * + * Return: Pointer to VME DMA attribute, NULL on failure. */ struct vme_dma_attr *vme_dma_pci_attribute(dma_addr_t address) { @@ -869,8 +1062,18 @@ err_attr: } EXPORT_SYMBOL(vme_dma_pci_attribute); -/* - * Create "VME" type attributes +/** + * vme_dma_vme_attribute - Create "VME" type VME DMA list attribute. + * @address: VME base address for DMA transfer. + * @aspace: VME address space to use for DMA transfer. + * @cycle: VME bus cycle to use for DMA transfer. + * @dwidth: VME data width to use for DMA transfer. + * + * Create VME DMA list attribute pointing to a location on the VME bus for DMA + * transfers. It is the responsibility of the user to free used attributes + * using vme_dma_free_attribute(). + * + * Return: Pointer to VME DMA attribute, NULL on failure. */ struct vme_dma_attr *vme_dma_vme_attribute(unsigned long long address, u32 aspace, u32 cycle, u32 dwidth) @@ -908,8 +1111,12 @@ err_attr: } EXPORT_SYMBOL(vme_dma_vme_attribute); -/* - * Free attribute +/** + * vme_dma_free_attribute - Free DMA list attribute. + * @attributes: Pointer to DMA list attribute. + * + * Free VME DMA list attribute. VME DMA list attributes can be safely freed + * once vme_dma_list_add() has returned. */ void vme_dma_free_attribute(struct vme_dma_attr *attributes) { @@ -918,6 +1125,23 @@ void vme_dma_free_attribute(struct vme_dma_attr *attributes) } EXPORT_SYMBOL(vme_dma_free_attribute); +/** + * vme_dma_list_add - Add enty to a VME DMA list. + * @list: Pointer to VME list. + * @src: Pointer to DMA list attribute to use as source. + * @dest: Pointer to DMA list attribute to use as destination. + * @count: Number of bytes to transfer. + * + * Add an entry to the provided VME DMA list. Entry requires pointers to source + * and destination DMA attributes and a count. + * + * Please note, the attributes supported as source and destinations for + * transfers are hardware dependent. 
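+ *
+ * A purely illustrative sequence (error handling omitted) might be:
+ *
+ *	list = vme_new_dma_list(resource);
+ *	src = vme_dma_pci_attribute(pci_addr);
+ *	dest = vme_dma_vme_attribute(vme_addr, VME_A32, VME_SCT, VME_D32);
+ *	vme_dma_list_add(list, src, dest, count);
+ *	vme_dma_list_exec(list);
+ *	vme_dma_free_attribute(src);
+ *	vme_dma_free_attribute(dest);
+ *	vme_dma_list_free(list);
+ *
+ * (VME_A32, VME_SCT and VME_D32 are example address space, cycle and
+ * width values only; supported combinations are bridge dependent.)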
+ * + * Return: Zero on success, -EINVAL if operation is not supported on this + * device or if the link list has already been submitted for execution. + * Hardware specific errors also possible. + */ int vme_dma_list_add(struct vme_dma_list *list, struct vme_dma_attr *src, struct vme_dma_attr *dest, size_t count) { @@ -942,6 +1166,16 @@ int vme_dma_list_add(struct vme_dma_list *list, struct vme_dma_attr *src, } EXPORT_SYMBOL(vme_dma_list_add); +/** + * vme_dma_list_exec - Queue a VME DMA list for execution. + * @list: Pointer to VME list. + * + * Queue the provided VME DMA list for execution. The call will return once the + * list has been executed. + * + * Return: Zero on success, -EINVAL if operation is not supported on this + * device. Hardware specific errors also possible. + */ int vme_dma_list_exec(struct vme_dma_list *list) { struct vme_bridge *bridge = list->parent->parent; @@ -962,6 +1196,15 @@ int vme_dma_list_exec(struct vme_dma_list *list) } EXPORT_SYMBOL(vme_dma_list_exec); +/** + * vme_dma_list_free - Free a VME DMA list. + * @list: Pointer to VME list. + * + * Free the provided DMA list and all its entries. + * + * Return: Zero on success, -EINVAL on invalid VME resource, -EBUSY if resource + * is still in use. Hardware specific errors also possible. + */ int vme_dma_list_free(struct vme_dma_list *list) { struct vme_bridge *bridge = list->parent->parent; @@ -994,6 +1237,15 @@ int vme_dma_list_free(struct vme_dma_list *list) } EXPORT_SYMBOL(vme_dma_list_free); +/** + * vme_dma_free - Free a VME DMA resource. + * @resource: Pointer to VME DMA resource. + * + * Free the provided DMA resource so that it may be reallocated. + * + * Return: Zero on success, -EINVAL on invalid VME resource, -EBUSY if resource + * is still active. + */ int vme_dma_free(struct vme_resource *resource) { struct vme_dma_resource *ctrlr; @@ -1099,6 +1351,22 @@ void vme_irq_handler(struct vme_bridge *bridge, int level, int statid) } EXPORT_SYMBOL(vme_irq_handler); +/** + * vme_irq_request - Request a specific VME interrupt. + * @vdev: Pointer to VME device struct vme_dev assigned to driver instance. + * @level: Interrupt priority being requested. + * @statid: Interrupt vector being requested. + * @callback: Pointer to callback function called when VME interrupt/vector + * received. + * @priv_data: Generic pointer that will be passed to the callback function. + * + * Request callback to be attached as a handler for VME interrupts with provided + * level and statid. + * + * Return: Zero on success, -EINVAL on invalid vme device, level or if the + * function is not supported, -EBUSY if the level/statid combination is + * already in use. Hardware specific errors also possible. + */ int vme_irq_request(struct vme_dev *vdev, int level, int statid, void (*callback)(int, int, void *), void *priv_data) @@ -1142,6 +1410,14 @@ int vme_irq_request(struct vme_dev *vdev, int level, int statid, } EXPORT_SYMBOL(vme_irq_request); +/** + * vme_irq_free - Free a VME interrupt. + * @vdev: Pointer to VME device struct vme_dev assigned to driver instance. + * @level: Interrupt priority of interrupt being freed. + * @statid: Interrupt vector of interrupt being freed. + * + * Remove previously attached callback from VME interrupt priority/vector. + */ void vme_irq_free(struct vme_dev *vdev, int level, int statid) { struct vme_bridge *bridge; @@ -1177,6 +1453,18 @@ void vme_irq_free(struct vme_dev *vdev, int level, int statid) } EXPORT_SYMBOL(vme_irq_free); +/** + * vme_irq_generate - Generate VME interrupt. 
+ * @vdev: Pointer to VME device struct vme_dev assigned to driver instance. + * @level: Interrupt priority at which to assert the interrupt. + * @statid: Interrupt vector to associate with the interrupt. + * + * Generate a VME interrupt of the provided level and with the provided + * statid. + * + * Return: Zero on success, -EINVAL on invalid vme device, level or if the + * function is not supported. Hardware specific errors also possible. + */ int vme_irq_generate(struct vme_dev *vdev, int level, int statid) { struct vme_bridge *bridge; @@ -1201,8 +1489,15 @@ int vme_irq_generate(struct vme_dev *vdev, int level, int statid) } EXPORT_SYMBOL(vme_irq_generate); -/* - * Request the location monitor, return resource or NULL +/** + * vme_lm_request - Request a VME location monitor + * @vdev: Pointer to VME device struct vme_dev assigned to driver instance. + * + * Allocate a location monitor resource to the driver. A location monitor + * allows the driver to monitor accesses to a contiguous number of + * addresses on the VME bus. + * + * Return: Pointer to a VME resource on success or NULL on failure. */ struct vme_resource *vme_lm_request(struct vme_dev *vdev) { @@ -1218,7 +1513,7 @@ struct vme_resource *vme_lm_request(struct vme_dev *vdev) goto err_bus; } - /* Loop through DMA resources */ + /* Loop through LM resources */ list_for_each(lm_pos, &bridge->lm_resources) { lm = list_entry(lm_pos, struct vme_lm_resource, list); @@ -1264,6 +1559,17 @@ err_bus: } EXPORT_SYMBOL(vme_lm_request); +/** + * vme_lm_count - Determine number of VME Addresses monitored + * @resource: Pointer to VME location monitor resource. + * + * The number of contiguous addresses monitored is hardware dependent. + * Return the number of contiguous addresses monitored by the + * location monitor. + * + * Return: Count of addresses monitored or -EINVAL when provided with an + * invalid location monitor resource. + */ int vme_lm_count(struct vme_resource *resource) { struct vme_lm_resource *lm; @@ -1279,6 +1585,20 @@ int vme_lm_count(struct vme_resource *resource) } EXPORT_SYMBOL(vme_lm_count); +/** + * vme_lm_set - Configure location monitor + * @resource: Pointer to VME location monitor resource. + * @lm_base: Base address to monitor. + * @aspace: VME address space to monitor. + * @cycle: VME bus cycle type to monitor. + * + * Set the base address, address space and cycle type of accesses to be + * monitored by the location monitor. + * + * Return: Zero on success, -EINVAL when provided with an invalid location + * monitor resource or function is not supported. Hardware specific + * errors may also be returned. + */ int vme_lm_set(struct vme_resource *resource, unsigned long long lm_base, u32 aspace, u32 cycle) { @@ -1301,6 +1621,20 @@ int vme_lm_set(struct vme_resource *resource, unsigned long long lm_base, } EXPORT_SYMBOL(vme_lm_set); +/** + * vme_lm_get - Retrieve location monitor settings + * @resource: Pointer to VME location monitor resource. + * @lm_base: Pointer used to output the base address monitored. + * @aspace: Pointer used to output the address space monitored. + * @cycle: Pointer used to output the VME bus cycle type monitored. + * + * Retrieve the base address, address space and cycle type of accesses to + * be monitored by the location monitor. + * + * Return: Zero on success, -EINVAL when provided with an invalid location + * monitor resource or function is not supported. Hardware specific + * errors may also be returned. 
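+ *
+ * The settings retrieved here are those previously programmed with
+ * vme_lm_set(). A rough, illustrative setup sequence:
+ *
+ *	lm = vme_lm_request(vdev);
+ *	vme_lm_set(lm, lm_base, VME_A32, VME_SCT | VME_USER | VME_DATA);
+ *	vme_lm_attach(lm, 0, lm_callback, priv_data);
+ *
+ * (The address space and cycle values are examples only.)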
+ */ int vme_lm_get(struct vme_resource *resource, unsigned long long *lm_base, u32 *aspace, u32 *cycle) { @@ -1323,6 +1657,21 @@ int vme_lm_get(struct vme_resource *resource, unsigned long long *lm_base, } EXPORT_SYMBOL(vme_lm_get); +/** + * vme_lm_attach - Provide callback for location monitor address + * @resource: Pointer to VME location monitor resource. + * @monitor: Offset to which callback should be attached. + * @callback: Pointer to callback function called when triggered. + * @data: Generic pointer that will be passed to the callback function. + * + * Attach a callback to the specificed offset into the location monitors + * monitored addresses. A generic pointer is provided to allow data to be + * passed to the callback when called. + * + * Return: Zero on success, -EINVAL when provided with an invalid location + * monitor resource or function is not supported. Hardware specific + * errors may also be returned. + */ int vme_lm_attach(struct vme_resource *resource, int monitor, void (*callback)(void *), void *data) { @@ -1345,6 +1694,18 @@ int vme_lm_attach(struct vme_resource *resource, int monitor, } EXPORT_SYMBOL(vme_lm_attach); +/** + * vme_lm_detach - Remove callback for location monitor address + * @resource: Pointer to VME location monitor resource. + * @monitor: Offset to which callback should be removed. + * + * Remove the callback associated with the specificed offset into the + * location monitors monitored addresses. + * + * Return: Zero on success, -EINVAL when provided with an invalid location + * monitor resource or function is not supported. Hardware specific + * errors may also be returned. + */ int vme_lm_detach(struct vme_resource *resource, int monitor) { struct vme_bridge *bridge = find_bridge(resource); @@ -1366,6 +1727,18 @@ int vme_lm_detach(struct vme_resource *resource, int monitor) } EXPORT_SYMBOL(vme_lm_detach); +/** + * vme_lm_free - Free allocated VME location monitor + * @resource: Pointer to VME location monitor resource. + * + * Free allocation of a VME location monitor. + * + * WARNING: This function currently expects that any callbacks that have + * been attached to the location monitor have been removed. + * + * Return: Zero on success, -EINVAL when provided with an invalid location + * monitor resource. + */ void vme_lm_free(struct vme_resource *resource) { struct vme_lm_resource *lm; @@ -1392,6 +1765,16 @@ void vme_lm_free(struct vme_resource *resource) } EXPORT_SYMBOL(vme_lm_free); +/** + * vme_slot_num - Retrieve slot ID + * @vdev: Pointer to VME device struct vme_dev assigned to driver instance. + * + * Retrieve the slot ID associated with the provided VME device. + * + * Return: The slot ID on success, -EINVAL if VME bridge cannot be determined + * or the function is not supported. Hardware specific errors may also + * be returned. + */ int vme_slot_num(struct vme_dev *vdev) { struct vme_bridge *bridge; @@ -1411,6 +1794,15 @@ int vme_slot_num(struct vme_dev *vdev) } EXPORT_SYMBOL(vme_slot_num); +/** + * vme_bus_num - Retrieve bus number + * @vdev: Pointer to VME device struct vme_dev assigned to driver instance. + * + * Retrieve the bus enumeration associated with the provided VME device. + * + * Return: The bus number on success, -EINVAL if VME bridge cannot be + * determined. 
+ */ int vme_bus_num(struct vme_dev *vdev) { struct vme_bridge *bridge; @@ -1556,6 +1948,15 @@ static int __vme_register_driver(struct vme_driver *drv, unsigned int ndevs) return err; } +/** + * vme_register_driver - Register a VME driver + * @drv: Pointer to VME driver structure to register. + * @ndevs: Maximum number of devices to allow to be enumerated. + * + * Register a VME device driver with the VME subsystem. + * + * Return: Zero on success, error value on registration failure. + */ int vme_register_driver(struct vme_driver *drv, unsigned int ndevs) { int err; @@ -1576,6 +1977,12 @@ int vme_register_driver(struct vme_driver *drv, unsigned int ndevs) } EXPORT_SYMBOL(vme_register_driver); +/** + * vme_unregister_driver - Unregister a VME driver + * @drv: Pointer to VME driver structure to unregister. + * + * Unregister a VME device driver from the VME subsystem. + */ void vme_unregister_driver(struct vme_driver *drv) { struct vme_dev *dev, *dev_tmp; diff --git a/include/linux/vme.h b/include/linux/vme.h index ec5e8bf6118e..25874da3f2e1 100644 --- a/include/linux/vme.h +++ b/include/linux/vme.h @@ -92,7 +92,7 @@ extern struct bus_type vme_bus_type; #define VME_SLOT_ALL -2 /** - * Structure representing a VME device + * struct vme_dev - Structure representing a VME device * @num: The device number * @bridge: Pointer to the bridge device this device is on * @dev: Internal device structure @@ -107,6 +107,16 @@ struct vme_dev { struct list_head bridge_list; }; +/** + * struct vme_driver - Structure representing a VME driver + * @name: Driver name, should be unique among VME drivers and usually the same + * as the module name. + * @match: Callback used to determine whether probe should be run. + * @probe: Callback for device binding, called when new device is detected. + * @remove: Callback, called on device removal. + * @driver: Underlying generic device driver structure. + * @devices: List of VME devices (struct vme_dev) associated with this driver. + */ struct vme_driver { const char *name; int (*match)(struct vme_dev *); -- cgit v1.2.3 From 77f88796cee819b9c4562b0b6b44691b3b7755b1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 16 Mar 2017 16:54:24 -0400 Subject: cgroup, kthread: close race window where new kthreads can be migrated to non-root cgroups Creation of a kthread goes through a couple interlocked stages between the kthread itself and its creator. Once the new kthread starts running, it initializes itself and wakes up the creator. The creator then can further configure the kthread and then let it start doing its job by waking it up. In this configuration-by-creator stage, the creator is the only one that can wake it up but the kthread is visible to userland. When altering the kthread's attributes from userland is allowed, this is fine; however, for cases where CPU affinity is critical, kthread_bind() is used to first disable affinity changes from userland and then set the affinity. This also prevents the kthread from being migrated into non-root cgroups as that can affect the CPU affinity and many other things. Unfortunately, the cgroup side of protection is racy. While the PF_NO_SETAFFINITY flag prevents further migrations, userland can win the race before the creator sets the flag with kthread_bind() and put the kthread in a non-root cgroup, which can lead to all sorts of problems including incorrect CPU affinity and starvation. This bug got triggered by userland which periodically tries to migrate all processes in the root cpuset cgroup to a non-root one. 
Per-cpu workqueue workers got caught while being created and ended up with incorrected CPU affinity breaking concurrency management and sometimes stalling workqueue execution. This patch adds task->no_cgroup_migration which disallows the task to be migrated by userland. kthreadd starts with the flag set making every child kthread start in the root cgroup with migration disallowed. The flag is cleared after the kthread finishes initialization by which time PF_NO_SETAFFINITY is set if the kthread should stay in the root cgroup. It'd be better to wait for the initialization instead of failing but I couldn't think of a way of implementing that without adding either a new PF flag, or sleeping and retrying from waiting side. Even if userland depends on changing cgroup membership of a kthread, it either has to be synchronized with kthread_create() or periodically repeat, so it's unlikely that this would break anything. v2: Switch to a simpler implementation using a new task_struct bit field suggested by Oleg. Signed-off-by: Tejun Heo Suggested-by: Oleg Nesterov Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra (Intel) Cc: Thomas Gleixner Reported-and-debugged-by: Chris Mason Cc: stable@vger.kernel.org # v4.3+ (we can't close the race on < v4.3) Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 21 +++++++++++++++++++++ include/linux/sched.h | 4 ++++ kernel/cgroup/cgroup.c | 9 +++++---- kernel/kthread.c | 3 +++ 4 files changed, 33 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f6b43fbb141c..af9c86e958bd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -570,6 +570,25 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } +static inline void cgroup_init_kthreadd(void) +{ + /* + * kthreadd is inherited by all kthreads, keep it in the root so + * that the new kthreads are guaranteed to stay in the root until + * initialization is finished. + */ + current->no_cgroup_migration = 1; +} + +static inline void cgroup_kthread_ready(void) +{ + /* + * This kthread finished initialization. The creator should have + * set PF_NO_SETAFFINITY if this kthread should stay in the root. + */ + current->no_cgroup_migration = 0; +} + #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; @@ -590,6 +609,8 @@ static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } +static inline void cgroup_init_kthreadd(void) {} +static inline void cgroup_kthread_ready(void) {} static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) diff --git a/include/linux/sched.h b/include/linux/sched.h index d67eee84fd43..4cf9a59a4d08 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -604,6 +604,10 @@ struct task_struct { #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif +#ifdef CONFIG_CGROUPS + /* disallow userland-initiated cgroup migration */ + unsigned no_cgroup_migration:1; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. 
*/ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0125589c7428..638ef7568495 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2425,11 +2425,12 @@ ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, tsk = tsk->group_leader; /* - * Workqueue threads may acquire PF_NO_SETAFFINITY and become - * trapped in a cpuset, or RT worker may be born in a cgroup - * with no rt_runtime allocated. Just say no. + * kthreads may acquire PF_NO_SETAFFINITY during initialization. + * If userland migrates such a kthread to a non-root cgroup, it can + * become trapped in a cpuset, or RT kthread may be born in a + * cgroup with no rt_runtime allocated. Just say no. */ - if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { + if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; goto out_unlock_rcu; } diff --git a/kernel/kthread.c b/kernel/kthread.c index 2f26adea0f84..26db528c1d88 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -20,6 +20,7 @@ #include #include #include +#include #include static DEFINE_SPINLOCK(kthread_create_lock); @@ -225,6 +226,7 @@ static int kthread(void *_create) ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { + cgroup_kthread_ready(); __kthread_parkme(self); ret = threadfn(data); } @@ -538,6 +540,7 @@ int kthreadd(void *unused) set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; + cgroup_init_kthreadd(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); -- cgit v1.2.3 From 016da20148a1692e34d35d1f1787400a2a2d2c58 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 16 Mar 2017 18:08:13 -0700 Subject: hrtimer: Remove hrtimer_peek_ahead_timers() leftovers This function was removed in commit c6eb3f70d448 (hrtimer: Get rid of hrtimer softirq, 2015-04-14) but the prototype wasn't ever deleted. Delete it now. Signed-off-by: Stephen Boyd Link: http://lkml.kernel.org/r/20170317010814.2591-1-sboyd@codeaurora.org Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 4 ---- kernel/time/hrtimer.c | 5 +---- 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 249e579ecd4c..23d58fcd4d9a 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -276,8 +276,6 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer) return timer->base->cpu_base->hres_active; } -extern void hrtimer_peek_ahead_timers(void); - /* * The resolution of the clocks. The resolution value is returned in * the clock_getres() system call to give application programmers an @@ -300,8 +298,6 @@ extern unsigned int hrtimer_resolution; #define hrtimer_resolution (unsigned int)LOW_RES_NSEC -static inline void hrtimer_peek_ahead_timers(void) { } - static inline int hrtimer_is_hres_active(struct hrtimer *timer) { return 0; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index ec08f527d7ee..1ef82cdb61ff 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1368,10 +1368,7 @@ retry: ktime_to_ns(delta)); } -/* - * local version of hrtimer_peek_ahead_timers() called with interrupts - * disabled. 
- */ +/* called with interrupts disabled */ static inline void __hrtimer_peek_ahead_timers(void) { struct tick_device *td; -- cgit v1.2.3 From d5b72a2123dfaf9416b1a1177b4be041f8a8b6d4 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Fri, 17 Mar 2017 17:34:49 +0100 Subject: dma-fence: add dma_fence_match_context helper Add a helper to check if all fences in a fence array are from a given context. For convenience, the function can also handle being given a non-array fence. Signed-off-by: Philipp Zabel Reviewed-by: Gustavo Padovan Acked-by: Sumit Semwal Signed-off-by: Sumit Semwal Link: http://patchwork.freedesktop.org/patch/msgid/1489768492-25190-1-git-send-email-p.zabel@pengutronix.de --- drivers/dma-buf/dma-fence-array.c | 26 ++++++++++++++++++++++++++ include/linux/dma-fence-array.h | 2 ++ 2 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-fence-array.c b/drivers/dma-buf/dma-fence-array.c index 67eb7c8fb88c..0350829ba62e 100644 --- a/drivers/dma-buf/dma-fence-array.c +++ b/drivers/dma-buf/dma-fence-array.c @@ -144,3 +144,29 @@ struct dma_fence_array *dma_fence_array_create(int num_fences, return array; } EXPORT_SYMBOL(dma_fence_array_create); + +/** + * dma_fence_match_context - Check if all fences are from the given context + * @fence: [in] fence or fence array + * @context: [in] fence context to check all fences against + * + * Checks the provided fence or, for a fence array, all fences in the array + * against the given context. Returns false if any fence is from a different + * context. + */ +bool dma_fence_match_context(struct dma_fence *fence, u64 context) +{ + struct dma_fence_array *array = to_dma_fence_array(fence); + unsigned i; + + if (!dma_fence_is_array(fence)) + return fence->context == context; + + for (i = 0; i < array->num_fences; i++) { + if (array->fences[i]->context != context) + return false; + } + + return true; +} +EXPORT_SYMBOL(dma_fence_match_context); diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h index 5900945f962d..332a5420243c 100644 --- a/include/linux/dma-fence-array.h +++ b/include/linux/dma-fence-array.h @@ -83,4 +83,6 @@ struct dma_fence_array *dma_fence_array_create(int num_fences, u64 context, unsigned seqno, bool signal_on_any); +bool dma_fence_match_context(struct dma_fence *fence, u64 context); + #endif /* __LINUX_DMA_FENCE_ARRAY_H */ -- cgit v1.2.3 From b59f65fa076a8eac2ff3a8ab7f8e1705b9fa86cb Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 16 Mar 2017 18:26:53 +0300 Subject: mm/gup: Implement the dev_pagemap() logic in the generic get_user_pages_fast() function This is a preparation patch for the transition of x86 to the generic GUP_fast() implementation. Prepare generic GUP_fast() to handle dev_pagemap(). At the moment, it's only implemented on x86. On non-x86, the new code will be compiled out. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Aneesh Kumar K . V Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dan Williams Cc: Dann Frazier Cc: Dave Hansen Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Steve Capper Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170316152655.37789-6-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/mm.h | 4 +++ mm/gup.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 89 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5f01c88f0800..e197d3ca3e8a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -430,6 +430,10 @@ static inline int pud_devmap(pud_t pud) { return 0; } +static inline int pgd_devmap(pgd_t pgd) +{ + return 0; +} #endif /* diff --git a/mm/gup.c b/mm/gup.c index 2b6cd3573457..e3d1e80424f4 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1200,12 +1200,23 @@ static inline pte_t gup_get_pte(pte_t *ptep) } #endif +static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) +{ + while ((*nr) - nr_start) { + struct page *page = pages[--(*nr)]; + + ClearPageReferenced(page); + put_page(page); + } +} + #ifdef __HAVE_ARCH_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { + struct dev_pagemap *pgmap = NULL; + int nr_start = *nr, ret = 0; pte_t *ptep, *ptem; - int ret = 0; ptem = ptep = pte_offset_map(&pmd, addr); do { @@ -1222,7 +1233,13 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, if (!pte_access_permitted(pte, write)) goto pte_unmap; - if (pte_special(pte)) + if (pte_devmap(pte)) { + pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); + if (unlikely(!pgmap)) { + undo_dev_pagemap(nr, nr_start, pages); + goto pte_unmap; + } + } else if (pte_special(pte)) goto pte_unmap; VM_BUG_ON(!pfn_valid(pte_pfn(pte))); @@ -1239,6 +1256,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, VM_BUG_ON_PAGE(compound_head(page) != head, page); + put_dev_pagemap(pgmap); SetPageReferenced(page); pages[*nr] = page; (*nr)++; @@ -1269,6 +1287,64 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, } #endif /* __HAVE_ARCH_PTE_SPECIAL */ +#ifdef __HAVE_ARCH_PTE_DEVMAP +static int __gup_device_huge(unsigned long pfn, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + int nr_start = *nr; + struct dev_pagemap *pgmap = NULL; + + do { + struct page *page = pfn_to_page(pfn); + + pgmap = get_dev_pagemap(pfn, pgmap); + if (unlikely(!pgmap)) { + undo_dev_pagemap(nr, nr_start, pages); + return 0; + } + SetPageReferenced(page); + pages[*nr] = page; + get_page(page); + put_dev_pagemap(pgmap); + (*nr)++; + pfn++; + } while (addr += PAGE_SIZE, addr != end); + return 1; +} + +static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + unsigned long fault_pfn; + + fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + return __gup_device_huge(fault_pfn, addr, end, pages, nr); +} + +static int __gup_device_huge_pud(pud_t pud, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + unsigned long fault_pfn; + + fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + return __gup_device_huge(fault_pfn, addr, end, pages, nr); +} +#else +static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + BUILD_BUG(); + return 0; +} + +static int __gup_device_huge_pud(pud_t pud, unsigned long addr, + unsigned 
long end, struct page **pages, int *nr) +{ + BUILD_BUG(); + return 0; +} +#endif + static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -1278,6 +1354,9 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, if (!pmd_access_permitted(orig, write)) return 0; + if (pmd_devmap(orig)) + return __gup_device_huge_pmd(orig, addr, end, pages, nr); + refs = 0; head = pmd_page(orig); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); @@ -1314,6 +1393,9 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, if (!pud_access_permitted(orig, write)) return 0; + if (pud_devmap(orig)) + return __gup_device_huge_pud(orig, addr, end, pages, nr); + refs = 0; head = pud_page(orig); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); @@ -1351,6 +1433,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, if (!pgd_access_permitted(orig, write)) return 0; + BUILD_BUG_ON(pgd_devmap(orig)); refs = 0; head = pgd_page(orig); page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); -- cgit v1.2.3 From 3843832fc8cadc2d48ba4ea4cd350a696906ac42 Mon Sep 17 00:00:00 2001 From: Peter De Schrijver Date: Tue, 28 Feb 2017 17:19:24 +0200 Subject: clk: tegra: Handle UTMIPLL IDDQ Export UTMIPLL IDDQ functions. These will be needed when powergating the XUSB partition. Signed-off-by: BH Hsieh Signed-off-by: Peter De Schrijver Signed-off-by: Thierry Reding --- drivers/clk/tegra/clk-tegra210.c | 26 ++++++++++++++++++++++++++ include/linux/clk/tegra.h | 2 ++ 2 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/tegra/clk-tegra210.c b/drivers/clk/tegra/clk-tegra210.c index 9a2512a6b419..f89d2a912273 100644 --- a/drivers/clk/tegra/clk-tegra210.c +++ b/drivers/clk/tegra/clk-tegra210.c @@ -2313,6 +2313,32 @@ static const char * const aclk_parents[] = { "clk_m" }; +void tegra210_put_utmipll_in_iddq(void) +{ + u32 reg; + + reg = readl_relaxed(clk_base + UTMIPLL_HW_PWRDN_CFG0); + + if (reg & UTMIPLL_HW_PWRDN_CFG0_UTMIPLL_LOCK) { + pr_err("trying to assert IDDQ while UTMIPLL is locked\n"); + return; + } + + reg |= UTMIPLL_HW_PWRDN_CFG0_IDDQ_OVERRIDE; + writel_relaxed(reg, clk_base + UTMIPLL_HW_PWRDN_CFG0); +} +EXPORT_SYMBOL_GPL(tegra210_put_utmipll_in_iddq); + +void tegra210_put_utmipll_out_iddq(void) +{ + u32 reg; + + reg = readl_relaxed(clk_base + UTMIPLL_HW_PWRDN_CFG0); + reg &= ~UTMIPLL_HW_PWRDN_CFG0_IDDQ_OVERRIDE; + writel_relaxed(reg, clk_base + UTMIPLL_HW_PWRDN_CFG0); +} +EXPORT_SYMBOL_GPL(tegra210_put_utmipll_out_iddq); + static __init void tegra210_periph_clk_init(void __iomem *clk_base, void __iomem *pmc_base) { diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h index 7007a5f48080..e17d32831e28 100644 --- a/include/linux/clk/tegra.h +++ b/include/linux/clk/tegra.h @@ -125,5 +125,7 @@ extern void tegra210_xusb_pll_hw_control_enable(void); extern void tegra210_xusb_pll_hw_sequence_start(void); extern void tegra210_sata_pll_hw_control_enable(void); extern void tegra210_sata_pll_hw_sequence_start(void); +extern void tegra210_put_utmipll_in_iddq(void); +extern void tegra210_put_utmipll_out_iddq(void); #endif /* __LINUX_CLK_TEGRA_H_ */ -- cgit v1.2.3 From 59af78d78db8bde6a63e09772aa44192f772fa96 Mon Sep 17 00:00:00 2001 From: Peter De Schrijver Date: Wed, 15 Mar 2017 17:42:05 +0200 Subject: clk: tegra: Add SATA seq input control This will be used by the powergating driver to ensure proper sequencer state when the SATA domain is powergated. 
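A rough sketch of the call pattern expected from such a powergating driver (assumed usage, not taken from this series):

	#include <linux/clk/tegra.h>

	/*
	 * Take the sequencer inputs under software control and force the
	 * parked values before the SATA partition is powered down.
	 */
	tegra210_set_sata_pll_seq_sw(true);

	/* ... powergate and later unpowergate the SATA partition ... */

	/* Then hand the inputs back to the hardware sequencer. */
	tegra210_set_sata_pll_seq_sw(false);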
Signed-off-by: Peter De Schrijver Signed-off-by: Thierry Reding --- drivers/clk/tegra/clk-tegra210.c | 25 +++++++++++++++++++++++++ include/linux/clk/tegra.h | 1 + 2 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/tegra/clk-tegra210.c b/drivers/clk/tegra/clk-tegra210.c index 6f29125ec439..f3e51e640d4d 100644 --- a/drivers/clk/tegra/clk-tegra210.c +++ b/drivers/clk/tegra/clk-tegra210.c @@ -181,6 +181,11 @@ #define SATA_PLL_CFG0 0x490 #define SATA_PLL_CFG0_PADPLL_RESET_SWCTL BIT(0) #define SATA_PLL_CFG0_PADPLL_USE_LOCKDET BIT(2) +#define SATA_PLL_CFG0_SATA_SEQ_IN_SWCTL BIT(4) +#define SATA_PLL_CFG0_SATA_SEQ_RESET_INPUT_VALUE BIT(5) +#define SATA_PLL_CFG0_SATA_SEQ_LANE_PD_INPUT_VALUE BIT(6) +#define SATA_PLL_CFG0_SATA_SEQ_PADPLL_PD_INPUT_VALUE BIT(7) + #define SATA_PLL_CFG0_PADPLL_SLEEP_IDDQ BIT(13) #define SATA_PLL_CFG0_SEQ_ENABLE BIT(24) @@ -483,6 +488,26 @@ void tegra210_sata_pll_hw_sequence_start(void) } EXPORT_SYMBOL_GPL(tegra210_sata_pll_hw_sequence_start); +void tegra210_set_sata_pll_seq_sw(bool state) +{ + u32 val; + + val = readl_relaxed(clk_base + SATA_PLL_CFG0); + if (state) { + val |= SATA_PLL_CFG0_SATA_SEQ_IN_SWCTL; + val |= SATA_PLL_CFG0_SATA_SEQ_RESET_INPUT_VALUE; + val |= SATA_PLL_CFG0_SATA_SEQ_LANE_PD_INPUT_VALUE; + val |= SATA_PLL_CFG0_SATA_SEQ_PADPLL_PD_INPUT_VALUE; + } else { + val &= ~SATA_PLL_CFG0_SATA_SEQ_IN_SWCTL; + val &= ~SATA_PLL_CFG0_SATA_SEQ_RESET_INPUT_VALUE; + val &= ~SATA_PLL_CFG0_SATA_SEQ_LANE_PD_INPUT_VALUE; + val &= ~SATA_PLL_CFG0_SATA_SEQ_PADPLL_PD_INPUT_VALUE; + } + writel_relaxed(val, clk_base + SATA_PLL_CFG0); +} +EXPORT_SYMBOL_GPL(tegra210_set_sata_pll_seq_sw); + static inline void _pll_misc_chk_default(void __iomem *base, struct tegra_clk_pll_params *params, u8 misc_num, u32 default_val, u32 mask) diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h index e17d32831e28..d23c9cf26993 100644 --- a/include/linux/clk/tegra.h +++ b/include/linux/clk/tegra.h @@ -125,6 +125,7 @@ extern void tegra210_xusb_pll_hw_control_enable(void); extern void tegra210_xusb_pll_hw_sequence_start(void); extern void tegra210_sata_pll_hw_control_enable(void); extern void tegra210_sata_pll_hw_sequence_start(void); +extern void tegra210_set_sata_pll_seq_sw(bool state); extern void tegra210_put_utmipll_in_iddq(void); extern void tegra210_put_utmipll_out_iddq(void); -- cgit v1.2.3 From 79170fda313ed5be2394f87aa2a00d597f8ed4a1 Mon Sep 17 00:00:00 2001 From: Kyle Huey Date: Mon, 20 Mar 2017 01:16:24 -0700 Subject: x86/syscalls/32: Wire up arch_prctl on x86-32 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hook up arch_prctl to call do_arch_prctl() on x86-32, and in 32 bit compat mode on x86-64. This allows to have arch_prctls that are not specific to 64 bits. On UML, simply stub out this syscall. Signed-off-by: Kyle Huey Cc: Grzegorz Andrejczuk Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Peter Zijlstra Cc: Dave Hansen Cc: Andi Kleen Cc: linux-kselftest@vger.kernel.org Cc: Nadav Amit Cc: Robert O'Callahan Cc: Richard Weinberger Cc: "Rafael J. 
Wysocki" Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Len Brown Cc: Shuah Khan Cc: user-mode-linux-devel@lists.sourceforge.net Cc: Jeff Dike Cc: Alexander Viro Cc: user-mode-linux-user@lists.sourceforge.net Cc: David Matlack Cc: Boris Ostrovsky Cc: Dmitry Safonov Cc: linux-fsdevel@vger.kernel.org Cc: Paolo Bonzini Link: http://lkml.kernel.org/r/20170320081628.18952-7-khuey@kylehuey.com Signed-off-by: Thomas Gleixner --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/kernel/process_32.c | 7 +++++++ arch/x86/kernel/process_64.c | 7 +++++++ arch/x86/um/Makefile | 2 +- arch/x86/um/syscalls_32.c | 7 +++++++ include/linux/compat.h | 2 ++ 6 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 arch/x86/um/syscalls_32.c (limited to 'include/linux') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 9ba050fe47f3..0af59fa789ea 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -390,3 +390,4 @@ 381 i386 pkey_alloc sys_pkey_alloc 382 i386 pkey_free sys_pkey_free 383 i386 statx sys_statx +384 i386 arch_prctl sys_arch_prctl compat_sys_arch_prctl diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4c818f8bc135..ff40e74c9181 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -56,6 +57,7 @@ #include #include #include +#include void __show_regs(struct pt_regs *regs, int all) { @@ -304,3 +306,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } + +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + return do_arch_prctl_common(current, option, arg2); +} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index d81b0a60a45c..ea1a6180bf39 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -635,6 +635,13 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) return ret; } +#ifdef CONFIG_IA32_EMULATION +COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + return do_arch_prctl_common(current, option, arg2); +} +#endif + unsigned long KSTK_ESP(struct task_struct *task) { return task_pt_regs(task)->sp; diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index e7e7055a8658..69f0827d5f53 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -16,7 +16,7 @@ obj-y = bug.o bugs_$(BITS).o delay.o fault.o ldt.o \ ifeq ($(CONFIG_X86_32),y) -obj-y += checksum_32.o +obj-y += checksum_32.o syscalls_32.o obj-$(CONFIG_ELF_CORE) += elfcore.o subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c new file mode 100644 index 000000000000..627d68836b16 --- /dev/null +++ b/arch/x86/um/syscalls_32.c @@ -0,0 +1,7 @@ +#include +#include + +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) +{ + return -EINVAL; +} diff --git a/include/linux/compat.h b/include/linux/compat.h index aef47be2a5c1..af9dbc44fd92 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -723,6 +723,8 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32, int, const char __user *); +asmlinkage long compat_sys_arch_prctl(int option, unsigned long arg2); + /* * For most but not all architectures, "am I in a compat syscall?" and * "am I a compat task?" are the same question. 
For architectures on which -- cgit v1.2.3 From e9ea1e7f53b852147cbd568b0568c7ad97ec21a3 Mon Sep 17 00:00:00 2001 From: Kyle Huey Date: Mon, 20 Mar 2017 01:16:26 -0700 Subject: x86/arch_prctl: Add ARCH_[GET|SET]_CPUID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Intel supports faulting on the CPUID instruction beginning with Ivy Bridge. When enabled, the processor will fault on attempts to execute the CPUID instruction with CPL>0. Exposing this feature to userspace will allow a ptracer to trap and emulate the CPUID instruction. When supported, this feature is controlled by toggling bit 0 of MSR_MISC_FEATURES_ENABLES. It is documented in detail in Section 2.3.2 of https://bugzilla.kernel.org/attachment.cgi?id=243991 Implement a new pair of arch_prctls, available on both x86-32 and x86-64. ARCH_GET_CPUID: Returns the current CPUID state, either 0 if CPUID faulting is enabled (and thus the CPUID instruction is not available) or 1 if CPUID faulting is not enabled. ARCH_SET_CPUID: Set the CPUID state to the second argument. If cpuid_enabled is 0 CPUID faulting will be activated, otherwise it will be deactivated. Returns ENODEV if CPUID faulting is not supported on this system. The state of the CPUID faulting flag is propagated across forks, but reset upon exec. Signed-off-by: Kyle Huey Cc: Grzegorz Andrejczuk Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Peter Zijlstra Cc: Dave Hansen Cc: Andi Kleen Cc: linux-kselftest@vger.kernel.org Cc: Nadav Amit Cc: Robert O'Callahan Cc: Richard Weinberger Cc: "Rafael J. Wysocki" Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Len Brown Cc: Shuah Khan Cc: user-mode-linux-devel@lists.sourceforge.net Cc: Jeff Dike Cc: Alexander Viro Cc: user-mode-linux-user@lists.sourceforge.net Cc: David Matlack Cc: Boris Ostrovsky Cc: Dmitry Safonov Cc: linux-fsdevel@vger.kernel.org Cc: Paolo Bonzini Link: http://lkml.kernel.org/r/20170320081628.18952-9-khuey@kylehuey.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/msr-index.h | 2 + arch/x86/include/asm/processor.h | 2 + arch/x86/include/asm/thread_info.h | 6 ++- arch/x86/include/uapi/asm/prctl.h | 11 ++++-- arch/x86/kernel/cpu/intel.c | 18 +++++---- arch/x86/kernel/process.c | 78 ++++++++++++++++++++++++++++++++++++++ fs/exec.c | 1 + include/linux/thread_info.h | 4 ++ 8 files changed, 109 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b1f75daca34b..673f9ac50f6d 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -558,6 +558,8 @@ /* MISC_FEATURES_ENABLES non-architectural features */ #define MSR_MISC_FEATURES_ENABLES 0x00000140 +#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT 0 +#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT BIT_ULL(MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT) #define MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT 1 #define MSR_IA32_TSC_DEADLINE 0x000006E0 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f385eca5407a..a80c1b3997ed 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -884,6 +884,8 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); +DECLARE_PER_CPU(u64, msr_misc_features_shadow); + /* Register/unregister a process' MPX related resource */ #define MPX_ENABLE_MANAGEMENT() mpx_enable_management() #define MPX_DISABLE_MANAGEMENT() 
mpx_disable_management() diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index ad6f5eb07a95..9fc44b95f7cb 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -87,6 +87,7 @@ struct thread_info { #define TIF_SECCOMP 8 /* secure computing */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ +#define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* IA32 compatibility process */ #define TIF_NOHZ 19 /* in adaptive nohz mode */ @@ -110,6 +111,7 @@ struct thread_info { #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) +#define _TIF_NOCPUID (1 << TIF_NOCPUID) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_NOHZ (1 << TIF_NOHZ) @@ -138,7 +140,7 @@ struct thread_info { /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ - (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP) + (_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP) #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) @@ -239,6 +241,8 @@ static inline int arch_within_stack_frames(const void * const stack, extern void arch_task_cache_init(void); extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); extern void arch_release_task_struct(struct task_struct *tsk); +extern void arch_setup_new_exec(void); +#define arch_setup_new_exec arch_setup_new_exec #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_THREAD_INFO_H */ diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index 835aa51c7f6e..c45765517092 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -1,10 +1,13 @@ #ifndef _ASM_X86_PRCTL_H #define _ASM_X86_PRCTL_H -#define ARCH_SET_GS 0x1001 -#define ARCH_SET_FS 0x1002 -#define ARCH_GET_FS 0x1003 -#define ARCH_GET_GS 0x1004 +#define ARCH_SET_GS 0x1001 +#define ARCH_SET_FS 0x1002 +#define ARCH_GET_FS 0x1003 +#define ARCH_GET_GS 0x1004 + +#define ARCH_GET_CPUID 0x1011 +#define ARCH_SET_CPUID 0x1012 #define ARCH_MAP_VDSO_X32 0x2001 #define ARCH_MAP_VDSO_32 0x2002 diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index a07f8295c9ed..dfa90a3a5145 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -90,16 +90,12 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) return; } - if (ring3mwait_disabled) { - msr_clear_bit(MSR_MISC_FEATURES_ENABLES, - MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT); + if (ring3mwait_disabled) return; - } - - msr_set_bit(MSR_MISC_FEATURES_ENABLES, - MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT); set_cpu_cap(c, X86_FEATURE_RING3MWAIT); + this_cpu_or(msr_misc_features_shadow, + 1UL << MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT); if (c == &boot_cpu_data) ELF_HWCAP2 |= HWCAP2_RING3MWAIT; @@ -505,9 +501,15 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr)) return; - /* Check features and update capabilities */ + /* Clear all MISC features */ + this_cpu_write(msr_misc_features_shadow, 0); + + /* Check features and update capabilities and shadow control bits */ init_cpuid_fault(c); probe_xeon_phi_r3mwait(c); + + msr = this_cpu_read(msr_misc_features_shadow); + wrmsrl(MSR_MISC_FEATURES_ENABLES, 
msr); } static void init_intel(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b12e95eceb83..0bb88428cbf2 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -37,6 +37,7 @@ #include #include #include +#include /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -172,6 +173,73 @@ int set_tsc_mode(unsigned int val) return 0; } +DEFINE_PER_CPU(u64, msr_misc_features_shadow); + +static void set_cpuid_faulting(bool on) +{ + u64 msrval; + + msrval = this_cpu_read(msr_misc_features_shadow); + msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; + msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT); + this_cpu_write(msr_misc_features_shadow, msrval); + wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval); +} + +static void disable_cpuid(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOCPUID)) { + /* + * Must flip the CPU state synchronously with + * TIF_NOCPUID in the current running context. + */ + set_cpuid_faulting(true); + } + preempt_enable(); +} + +static void enable_cpuid(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOCPUID)) { + /* + * Must flip the CPU state synchronously with + * TIF_NOCPUID in the current running context. + */ + set_cpuid_faulting(false); + } + preempt_enable(); +} + +static int get_cpuid_mode(void) +{ + return !test_thread_flag(TIF_NOCPUID); +} + +static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) +{ + if (!static_cpu_has(X86_FEATURE_CPUID_FAULT)) + return -ENODEV; + + if (cpuid_enabled) + enable_cpuid(); + else + disable_cpuid(); + + return 0; +} + +/* + * Called immediately after a successful exec. + */ +void arch_setup_new_exec(void) +{ + /* If cpuid was previously disabled for this task, re-enable it. 
*/ + if (test_thread_flag(TIF_NOCPUID)) + enable_cpuid(); +} + static inline void switch_to_bitmap(struct tss_struct *tss, struct thread_struct *prev, struct thread_struct *next, @@ -225,6 +293,9 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, if ((tifp ^ tifn) & _TIF_NOTSC) cr4_toggle_bits(X86_CR4_TSD); + + if ((tifp ^ tifn) & _TIF_NOCPUID) + set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); } /* @@ -549,5 +620,12 @@ out: long do_arch_prctl_common(struct task_struct *task, int option, unsigned long cpuid_enabled) { + switch (option) { + case ARCH_GET_CPUID: + return get_cpuid_mode(); + case ARCH_SET_CPUID: + return set_cpuid_mode(task, cpuid_enabled); + } + return -EINVAL; } diff --git a/fs/exec.c b/fs/exec.c index 65145a3df065..72934df68471 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1320,6 +1320,7 @@ void setup_new_exec(struct linux_binprm * bprm) else set_dumpable(current->mm, suid_dumpable); + arch_setup_new_exec(); perf_event_exec(); __set_task_comm(current, kbasename(bprm->filename), true); diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 58373875e8ee..55125d674338 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -101,6 +101,10 @@ static inline void check_object_size(const void *ptr, unsigned long n, { } #endif /* CONFIG_HARDENED_USERCOPY */ +#ifndef arch_setup_new_exec +static inline void arch_setup_new_exec(void) { } +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */ -- cgit v1.2.3 From 233ed09d7fdacf592ee91e6c97ce5f4364fbe7c0 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 17 Mar 2017 12:48:08 -0600 Subject: chardev: add helper function to register char devs with a struct device Credit for this patch is shared with Dan Williams [1]. I've taken things one step further to make the helper function more useful and clean up calling code. There's a common pattern in the kernel whereby a struct cdev is placed in a structure alongside a struct device which manages the life-cycle of both. In the naive approach, the reference counting is broken and the struct device can free everything before the chardev code is entirely released. Many developers have solved this problem by linking the internal kobjs in this fashion: cdev.kobj.parent = &parent_dev.kobj; The cdev code explicitly gets and puts a reference to its kobj parent. So this seems like it was intended to be used this way. Dmitry Torokhov first put this in place in 2012 with this commit: 2f0157f char_dev: pin parent kobject and the first instance of the fix was then done in the input subsystem in the following commit: 4a215aa Input: fix use-after-free introduced with dynamic minor changes Subsequently over the years, however, this issue seems to have tripped up multiple developers independently. For example, see these commits: 0d5b7da iio: Prevent race between IIO chardev opening and IIO device (by Lars-Peter Clausen in 2013) ba0ef85 tpm: Fix initialization of the cdev (by Jason Gunthorpe in 2015) 5b28dde [media] media: fix use-after-free in cdev_put() when app exits after driver unbind (by Shuah Khan in 2016) This technique is used in a similar way in at least 15 places within the kernel, and probably should have been applied in at least 5 more. The kobj line also looks very suspect in that one would not expect drivers to have to mess with kobject internals in this way. Even highly experienced kernel developers can be surprised by this code, as seen in [2].
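Schematically, the fragile pattern being described looks like the following (a sketch; struct foo_device, foo_add() and foo_fops are hypothetical names, and error unwinding is trimmed):

struct foo_device {
	struct device dev;	/* manages the lifetime of both members */
	struct cdev cdev;
};

static int foo_add(struct foo_device *foo)
{
	int rc;

	cdev_init(&foo->cdev, &foo_fops);	/* foo_fops: hypothetical */
	/* the easy-to-forget line that pins the parent kobject */
	foo->cdev.kobj.parent = &foo->dev.kobj;

	rc = cdev_add(&foo->cdev, foo->dev.devt, 1);
	if (rc)
		return rc;

	return device_add(&foo->dev);
}

The helper introduced below collapses the parent pinning and the two registration calls into a single cdev_device_add(&foo->cdev, &foo->dev).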
To help alleviate this situation, and hopefully prevent future wasted effort on this problem, this patch introduces a helper function to register a char device along with its parent struct device. This creates a more regular API for tying a char device to its parent without the developer having to set members in the underlying kobject. This patch introduces cdev_device_add and cdev_device_del, which replace a common pattern including setting the kobj parent, calling cdev_add and then calling device_add. It also introduces cdev_set_parent for the few cases that set the kobject parent without using device_add. [1] https://lkml.org/lkml/2017/2/13/700 [2] https://lkml.org/lkml/2017/2/10/370 Signed-off-by: Logan Gunthorpe Signed-off-by: Dan Williams Reviewed-by: Hans Verkuil Reviewed-by: Alexandre Belloni Signed-off-by: Greg Kroah-Hartman --- fs/char_dev.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/cdev.h | 5 +++ 2 files changed, 91 insertions(+) (limited to 'include/linux') diff --git a/fs/char_dev.c b/fs/char_dev.c index 44a240c4bb65..fb8507f521b2 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -471,6 +471,85 @@ int cdev_add(struct cdev *p, dev_t dev, unsigned count) return 0; } +/** + * cdev_set_parent() - set the parent kobject for a char device + * @p: the cdev structure + * @kobj: the kobject to take a reference to + * + * cdev_set_parent() sets a parent kobject which will be referenced + * appropriately so the parent is not freed before the cdev. This + * should be called before cdev_add. + */ +void cdev_set_parent(struct cdev *p, struct kobject *kobj) +{ + WARN_ON(!kobj->state_initialized); + p->kobj.parent = kobj; +} + +/** + * cdev_device_add() - add a char device and its corresponding + * struct device, linking them + * @dev: the device structure + * @cdev: the cdev structure + * + * cdev_device_add() adds the char device represented by @cdev to the system, + * just as cdev_add does. It then adds @dev to the system using device_add. The dev_t for the char device will be taken from the struct device which + * needs to be initialized first. This helper function correctly takes a + * reference to the parent device so the parent will not get released until + * all references to the cdev are released. + * + * This helper uses dev->devt for the device number. If it is not set + * it will not add the cdev and it will be equivalent to device_add. + * + * This function should be used whenever the struct cdev and the + * struct device are members of the same structure whose lifetime is + * managed by the struct device. + * + * NOTE: Callers must assume that userspace was able to open the cdev and + * can call cdev fops callbacks at any time, even if this function fails. + */ +int cdev_device_add(struct cdev *cdev, struct device *dev) +{ + int rc = 0; + + if (dev->devt) { + cdev_set_parent(cdev, &dev->kobj); + + rc = cdev_add(cdev, dev->devt, 1); + if (rc) + return rc; + } + + rc = device_add(dev); + if (rc) + cdev_del(cdev); + + return rc; +} + +/** + * cdev_device_del() - inverse of cdev_device_add + * @dev: the device structure + * @cdev: the cdev structure + * + * cdev_device_del() is a helper function to call cdev_del and device_del. + * It should be used whenever cdev_device_add is used. + * + * If dev->devt is not set it will not remove the cdev and will be equivalent + * to device_del.
+ * + * NOTE: This guarantees that associated sysfs callbacks are not running + * or runnable, however any cdevs already open will remain and their fops + * will still be callable even after this function returns. + */ +void cdev_device_del(struct cdev *cdev, struct device *dev) +{ + device_del(dev); + if (dev->devt) + cdev_del(cdev); +} + static void cdev_unmap(dev_t dev, unsigned count) { kobj_unmap(cdev_map, dev, count); @@ -482,6 +561,10 @@ static void cdev_unmap(dev_t dev, unsigned count) * * cdev_del() removes @p from the system, possibly freeing the structure * itself. + * + * NOTE: This guarantees that the cdev device will no longer be able to be + * opened, however any cdevs already open will remain and their fops will + * still be callable even after cdev_del returns. */ void cdev_del(struct cdev *p) { @@ -570,5 +653,8 @@ EXPORT_SYMBOL(cdev_init); EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); +EXPORT_SYMBOL(cdev_set_parent); +EXPORT_SYMBOL(cdev_device_add); +EXPORT_SYMBOL(cdev_device_del); EXPORT_SYMBOL(__register_chrdev); EXPORT_SYMBOL(__unregister_chrdev); diff --git a/include/linux/cdev.h b/include/linux/cdev.h index f8763615a5f2..408bc09ce497 100644 --- a/include/linux/cdev.h +++ b/include/linux/cdev.h @@ -4,6 +4,7 @@ #include #include #include +#include struct file_operations; struct inode; @@ -26,6 +27,10 @@ void cdev_put(struct cdev *p); int cdev_add(struct cdev *, dev_t, unsigned); +void cdev_set_parent(struct cdev *p, struct kobject *kobj); +int cdev_device_add(struct cdev *cdev, struct device *dev); +void cdev_device_del(struct cdev *cdev, struct device *dev); + void cdev_del(struct cdev *); void cd_forget(struct inode *); -- cgit v1.2.3 From 50dd1bd1e2395a78e5adfad96487092e215483e0 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 13 Mar 2017 16:54:00 +0530 Subject: drm/edid: check for HF-VSDB block This patch implements a small function that checks whether a given CEA data block is an HDMI-Forum vendor-specific data block (HF-VSDB). V2: Rebase. V3: Added R-B from Jose.
V4: Rebase V5: Rebase V6: Rebase V7: Rebase V8: Rebase V9: Rebase V10: Rebase Signed-off-by: Thierry Reding Signed-off-by: Shashank Sharma Reviewed-by: Jose Abreu Signed-off-by: Jani Nikula Link: http://patchwork.freedesktop.org/patch/msgid/1489404244-16608-3-git-send-email-shashank.sharma@intel.com --- drivers/gpu/drm/drm_edid.c | 15 +++++++++++++++ include/linux/hdmi.h | 1 + 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index 171d7a02ace0..92e9551f3c38 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -3248,6 +3248,21 @@ static bool cea_db_is_hdmi_vsdb(const u8 *db) return hdmi_id == HDMI_IEEE_OUI; } +static bool cea_db_is_hdmi_forum_vsdb(const u8 *db) +{ + unsigned int oui; + + if (cea_db_tag(db) != VENDOR_BLOCK) + return false; + + if (cea_db_payload_len(db) < 7) + return false; + + oui = db[3] << 16 | db[2] << 8 | db[1]; + + return oui == HDMI_FORUM_IEEE_OUI; +} + #define for_each_cea_db(cea, i, start, end) \ for ((i) = (start); (i) < (end) && (i) + cea_db_payload_len(&(cea)[(i)]) < (end); (i) += cea_db_payload_len(&(cea)[(i)]) + 1) diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h index edbb4fc674ed..d271ff23984f 100644 --- a/include/linux/hdmi.h +++ b/include/linux/hdmi.h @@ -35,6 +35,7 @@ enum hdmi_infoframe_type { }; #define HDMI_IEEE_OUI 0x000c03 +#define HDMI_FORUM_IEEE_OUI 0xc45dd8 #define HDMI_INFOFRAME_HEADER_SIZE 4 #define HDMI_AVI_INFOFRAME_SIZE 13 #define HDMI_SPD_INFOFRAME_SIZE 25 -- cgit v1.2.3 From aeb9dd1de98c1a5f2007ea5d2a154c1244caf8a0 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 21 Mar 2017 16:01:30 +0800 Subject: usb/early: Add driver for xhci debug capability XHCI debug capability (DbC) is an optional but standalone functionality provided by an xHCI host controller. Software learns this capability by walking through the extended capability list of the host. XHCI specification describes DbC in section 7.6. This patch introduces the code to probe and initialize the debug capability hardware during early boot. With hardware initialized, the debug target (system on which this code is running) will present a debug device through the debug port (normally the first USB3 port). The debug device is fully compliant with the USB framework and provides the equivalent of a very high performance (USB3) full-duplex serial link between the debug host and target. The DbC functionality is independent of the xHCI host. There isn't any precondition from the xHCI host side for the DbC to work. One use for this feature is kernel debugging, for example when your machine crashes very early before the regular console code is initialized. Other uses include simpler, lockless logging instead of a full-blown printk console driver and klogd. Signed-off-by: Lu Baolu Acked-by: Thomas Gleixner Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Mathias Nyman Cc: Peter Zijlstra Cc: linux-usb@vger.kernel.org Link: http://lkml.kernel.org/r/1490083293-3792-3-git-send-email-baolu.lu@linux.intel.com [ Small fix to the Kconfig help text. 
] Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 27 +- drivers/usb/Makefile | 2 +- drivers/usb/early/Makefile | 1 + drivers/usb/early/xhci-dbc.c | 1014 +++++++++++++++++++++++++++++++++++++++++ drivers/usb/early/xhci-dbc.h | 211 +++++++++ include/linux/usb/xhci-dbgp.h | 29 ++ 6 files changed, 1281 insertions(+), 3 deletions(-) create mode 100644 drivers/usb/early/xhci-dbc.c create mode 100644 drivers/usb/early/xhci-dbc.h create mode 100644 include/linux/usb/xhci-dbgp.h (limited to 'include/linux') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 63c1d13aaf9f..fcb7604172ce 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -5,6 +5,9 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" +config EARLY_PRINTK_USB + bool + config X86_VERBOSE_BOOTUP bool "Enable verbose x86 bootup info messages" default y @@ -23,19 +26,20 @@ config EARLY_PRINTK This is useful for kernel debugging when your machine crashes very early before the console code is initialized. For normal operation it is not recommended because it looks ugly and doesn't cooperate - with klogd/syslogd or the X server. You should normally N here, + with klogd/syslogd or the X server. You should normally say N here, unless you want to debug such a crash. config EARLY_PRINTK_DBGP bool "Early printk via EHCI debug port" depends on EARLY_PRINTK && PCI + select EARLY_PRINTK_USB ---help--- Write kernel log output directly into the EHCI debug port. This is useful for kernel debugging when your machine crashes very early before the console code is initialized. For normal operation it is not recommended because it looks ugly and doesn't cooperate - with klogd/syslogd or the X server. You should normally N here, + with klogd/syslogd or the X server. You should normally say N here, unless you want to debug such a crash. You need usb debug device. config EARLY_PRINTK_EFI @@ -48,6 +52,25 @@ config EARLY_PRINTK_EFI This is useful for kernel debugging when your machine crashes very early before the console code is initialized. +config EARLY_PRINTK_USB_XDBC + bool "Early printk via the xHCI debug port" + depends on EARLY_PRINTK && PCI + select EARLY_PRINTK_USB + ---help--- + Write kernel log output directly into the xHCI debug port. + + One use for this feature is kernel debugging, for example when your + machine crashes very early before the regular console code is + initialized. Other uses include simpler, lockless logging instead of + a full-blown printk console driver + klogd. + + For normal production environments this is normally not recommended, + because it doesn't feed events into klogd/syslogd and doesn't try to + print anything on the screen. + + You should normally say N here, unless you want to debug early + crashes or need a very simple printk logging facility. 
+ config X86_PTDUMP_CORE def_bool n diff --git a/drivers/usb/Makefile b/drivers/usb/Makefile index 7791af6c102c..53d1356bbd06 100644 --- a/drivers/usb/Makefile +++ b/drivers/usb/Makefile @@ -49,7 +49,7 @@ obj-$(CONFIG_USB_MICROTEK) += image/ obj-$(CONFIG_USB_SERIAL) += serial/ obj-$(CONFIG_USB) += misc/ -obj-$(CONFIG_EARLY_PRINTK_DBGP) += early/ +obj-$(CONFIG_EARLY_PRINTK_USB) += early/ obj-$(CONFIG_USB_ATM) += atm/ obj-$(CONFIG_USB_SPEEDTOUCH) += atm/ diff --git a/drivers/usb/early/Makefile b/drivers/usb/early/Makefile index 24bbe519c737..fcde2286da1c 100644 --- a/drivers/usb/early/Makefile +++ b/drivers/usb/early/Makefile @@ -3,3 +3,4 @@ # obj-$(CONFIG_EARLY_PRINTK_DBGP) += ehci-dbgp.o +obj-$(CONFIG_EARLY_PRINTK_USB_XDBC) += xhci-dbc.o diff --git a/drivers/usb/early/xhci-dbc.c b/drivers/usb/early/xhci-dbc.c new file mode 100644 index 000000000000..1268818e2263 --- /dev/null +++ b/drivers/usb/early/xhci-dbc.c @@ -0,0 +1,1014 @@ +/** + * xhci-dbc.c - xHCI debug capability early driver + * + * Copyright (C) 2016 Intel Corporation + * + * Author: Lu Baolu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../host/xhci.h" +#include "xhci-dbc.h" + +static struct xdbc_state xdbc; +static bool early_console_keep; + +#define XDBC_TRACE +#ifdef XDBC_TRACE +#define xdbc_trace trace_printk +#else +static inline void xdbc_trace(const char *fmt, ...) { } +#endif /* XDBC_TRACE */ + +static void __iomem * __init xdbc_map_pci_mmio(u32 bus, u32 dev, u32 func) +{ + u64 val64, sz64, mask64; + void __iomem *base; + u32 val, sz; + u8 byte; + + val = read_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0); + write_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0, ~0); + sz = read_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0); + write_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0, val); + + if (val == 0xffffffff || sz == 0xffffffff) { + pr_notice("invalid mmio bar\n"); + return NULL; + } + + val64 = val & PCI_BASE_ADDRESS_MEM_MASK; + sz64 = sz & PCI_BASE_ADDRESS_MEM_MASK; + mask64 = PCI_BASE_ADDRESS_MEM_MASK; + + if ((val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) { + val = read_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0 + 4); + write_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0 + 4, ~0); + sz = read_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0 + 4); + write_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0 + 4, val); + + val64 |= (u64)val << 32; + sz64 |= (u64)sz << 32; + mask64 |= ~0ULL << 32; + } + + sz64 &= mask64; + + if (!sz64) { + pr_notice("invalid mmio address\n"); + return NULL; + } + + sz64 = 1ULL << __ffs64(sz64); + + /* Check if the mem space is enabled: */ + byte = read_pci_config_byte(bus, dev, func, PCI_COMMAND); + if (!(byte & PCI_COMMAND_MEMORY)) { + byte |= PCI_COMMAND_MEMORY; + write_pci_config_byte(bus, dev, func, PCI_COMMAND, byte); + } + + xdbc.xhci_start = val64; + xdbc.xhci_length = sz64; + base = early_ioremap(val64, sz64); + + return base; +} + +static void * __init xdbc_get_page(dma_addr_t *dma_addr) +{ + void *virt; + + virt = alloc_bootmem_pages_nopanic(PAGE_SIZE); + if (!virt) + return NULL; + + if (dma_addr) + *dma_addr = (dma_addr_t)__pa(virt); + + return virt; +} + +static u32 __init xdbc_find_dbgp(int xdbc_num, 
u32 *b, u32 *d, u32 *f) +{ + u32 bus, dev, func, class; + + for (bus = 0; bus < XDBC_PCI_MAX_BUSES; bus++) { + for (dev = 0; dev < XDBC_PCI_MAX_DEVICES; dev++) { + for (func = 0; func < XDBC_PCI_MAX_FUNCTION; func++) { + + class = read_pci_config(bus, dev, func, PCI_CLASS_REVISION); + if ((class >> 8) != PCI_CLASS_SERIAL_USB_XHCI) + continue; + + if (xdbc_num-- != 0) + continue; + + *b = bus; + *d = dev; + *f = func; + + return 0; + } + } + } + + return -1; +} + +static int handshake(void __iomem *ptr, u32 mask, u32 done, int wait, int delay) +{ + u32 result; + + do { + result = readl(ptr); + result &= mask; + if (result == done) + return 0; + udelay(delay); + wait -= delay; + } while (wait > 0); + + return -ETIMEDOUT; +} + +static void __init xdbc_bios_handoff(void) +{ + int offset, timeout; + u32 val; + + offset = xhci_find_next_ext_cap(xdbc.xhci_base, 0, XHCI_EXT_CAPS_LEGACY); + val = readl(xdbc.xhci_base + offset); + + if (val & XHCI_HC_BIOS_OWNED) { + writel(val | XHCI_HC_OS_OWNED, xdbc.xhci_base + offset); + timeout = handshake(xdbc.xhci_base + offset, XHCI_HC_BIOS_OWNED, 0, 5000, 10); + + if (timeout) { + pr_notice("failed to hand over xHCI control from BIOS\n"); + writel(val & ~XHCI_HC_BIOS_OWNED, xdbc.xhci_base + offset); + } + } + + /* Disable BIOS SMIs and clear all SMI events: */ + val = readl(xdbc.xhci_base + offset + XHCI_LEGACY_CONTROL_OFFSET); + val &= XHCI_LEGACY_DISABLE_SMI; + val |= XHCI_LEGACY_SMI_EVENTS; + writel(val, xdbc.xhci_base + offset + XHCI_LEGACY_CONTROL_OFFSET); +} + +static int __init +xdbc_alloc_ring(struct xdbc_segment *seg, struct xdbc_ring *ring) +{ + seg->trbs = xdbc_get_page(&seg->dma); + if (!seg->trbs) + return -ENOMEM; + + ring->segment = seg; + + return 0; +} + +static void __init xdbc_free_ring(struct xdbc_ring *ring) +{ + struct xdbc_segment *seg = ring->segment; + + if (!seg) + return; + + free_bootmem(seg->dma, PAGE_SIZE); + ring->segment = NULL; +} + +static void xdbc_reset_ring(struct xdbc_ring *ring) +{ + struct xdbc_segment *seg = ring->segment; + struct xdbc_trb *link_trb; + + memset(seg->trbs, 0, PAGE_SIZE); + + ring->enqueue = seg->trbs; + ring->dequeue = seg->trbs; + ring->cycle_state = 1; + + if (ring != &xdbc.evt_ring) { + link_trb = &seg->trbs[XDBC_TRBS_PER_SEGMENT - 1]; + link_trb->field[0] = cpu_to_le32(lower_32_bits(seg->dma)); + link_trb->field[1] = cpu_to_le32(upper_32_bits(seg->dma)); + link_trb->field[3] = cpu_to_le32(TRB_TYPE(TRB_LINK)) | cpu_to_le32(LINK_TOGGLE); + } +} + +static inline void xdbc_put_utf16(u16 *s, const char *c, size_t size) +{ + int i; + + for (i = 0; i < size; i++) + s[i] = cpu_to_le16(c[i]); +} + +static void xdbc_mem_init(void) +{ + struct xdbc_ep_context *ep_in, *ep_out; + struct usb_string_descriptor *s_desc; + struct xdbc_erst_entry *entry; + struct xdbc_strings *strings; + struct xdbc_context *ctx; + unsigned int max_burst; + u32 string_length; + int index = 0; + u32 dev_info; + + xdbc_reset_ring(&xdbc.evt_ring); + xdbc_reset_ring(&xdbc.in_ring); + xdbc_reset_ring(&xdbc.out_ring); + memset(xdbc.table_base, 0, PAGE_SIZE); + memset(xdbc.out_buf, 0, PAGE_SIZE); + + /* Initialize event ring segment table: */ + xdbc.erst_size = 16; + xdbc.erst_base = xdbc.table_base + index * XDBC_TABLE_ENTRY_SIZE; + xdbc.erst_dma = xdbc.table_dma + index * XDBC_TABLE_ENTRY_SIZE; + + index += XDBC_ERST_ENTRY_NUM; + entry = (struct xdbc_erst_entry *)xdbc.erst_base; + + entry->seg_addr = cpu_to_le64(xdbc.evt_seg.dma); + entry->seg_size = cpu_to_le32(XDBC_TRBS_PER_SEGMENT); + entry->__reserved_0 = 0; + + /* Initialize ERST 
registers: */ + writel(1, &xdbc.xdbc_reg->ersts); + xdbc_write64(xdbc.erst_dma, &xdbc.xdbc_reg->erstba); + xdbc_write64(xdbc.evt_seg.dma, &xdbc.xdbc_reg->erdp); + + /* Debug capability contexts: */ + xdbc.dbcc_size = 64 * 3; + xdbc.dbcc_base = xdbc.table_base + index * XDBC_TABLE_ENTRY_SIZE; + xdbc.dbcc_dma = xdbc.table_dma + index * XDBC_TABLE_ENTRY_SIZE; + + index += XDBC_DBCC_ENTRY_NUM; + + /* Populate the strings: */ + xdbc.string_size = sizeof(struct xdbc_strings); + xdbc.string_base = xdbc.table_base + index * XDBC_TABLE_ENTRY_SIZE; + xdbc.string_dma = xdbc.table_dma + index * XDBC_TABLE_ENTRY_SIZE; + strings = (struct xdbc_strings *)xdbc.string_base; + + index += XDBC_STRING_ENTRY_NUM; + + /* Serial string: */ + s_desc = (struct usb_string_descriptor *)strings->serial; + s_desc->bLength = (strlen(XDBC_STRING_SERIAL) + 1) * 2; + s_desc->bDescriptorType = USB_DT_STRING; + + xdbc_put_utf16(s_desc->wData, XDBC_STRING_SERIAL, strlen(XDBC_STRING_SERIAL)); + string_length = s_desc->bLength; + string_length <<= 8; + + /* Product string: */ + s_desc = (struct usb_string_descriptor *)strings->product; + s_desc->bLength = (strlen(XDBC_STRING_PRODUCT) + 1) * 2; + s_desc->bDescriptorType = USB_DT_STRING; + + xdbc_put_utf16(s_desc->wData, XDBC_STRING_PRODUCT, strlen(XDBC_STRING_PRODUCT)); + string_length += s_desc->bLength; + string_length <<= 8; + + /* Manufacturer string: */ + s_desc = (struct usb_string_descriptor *)strings->manufacturer; + s_desc->bLength = (strlen(XDBC_STRING_MANUFACTURER) + 1) * 2; + s_desc->bDescriptorType = USB_DT_STRING; + + xdbc_put_utf16(s_desc->wData, XDBC_STRING_MANUFACTURER, strlen(XDBC_STRING_MANUFACTURER)); + string_length += s_desc->bLength; + string_length <<= 8; + + /* String0: */ + strings->string0[0] = 4; + strings->string0[1] = USB_DT_STRING; + strings->string0[2] = 0x09; + strings->string0[3] = 0x04; + + string_length += 4; + + /* Populate info Context: */ + ctx = (struct xdbc_context *)xdbc.dbcc_base; + + ctx->info.string0 = cpu_to_le64(xdbc.string_dma); + ctx->info.manufacturer = cpu_to_le64(xdbc.string_dma + XDBC_MAX_STRING_LENGTH); + ctx->info.product = cpu_to_le64(xdbc.string_dma + XDBC_MAX_STRING_LENGTH * 2); + ctx->info.serial = cpu_to_le64(xdbc.string_dma + XDBC_MAX_STRING_LENGTH * 3); + ctx->info.length = cpu_to_le32(string_length); + + /* Populate bulk out endpoint context: */ + max_burst = DEBUG_MAX_BURST(readl(&xdbc.xdbc_reg->control)); + ep_out = (struct xdbc_ep_context *)&ctx->out; + + ep_out->ep_info1 = 0; + ep_out->ep_info2 = cpu_to_le32(EP_TYPE(BULK_OUT_EP) | MAX_PACKET(1024) | MAX_BURST(max_burst)); + ep_out->deq = cpu_to_le64(xdbc.out_seg.dma | xdbc.out_ring.cycle_state); + + /* Populate bulk in endpoint context: */ + ep_in = (struct xdbc_ep_context *)&ctx->in; + + ep_in->ep_info1 = 0; + ep_in->ep_info2 = cpu_to_le32(EP_TYPE(BULK_OUT_EP) | MAX_PACKET(1024) | MAX_BURST(max_burst)); + ep_in->deq = cpu_to_le64(xdbc.in_seg.dma | xdbc.in_ring.cycle_state); + + /* Set DbC context and info registers: */ + xdbc_write64(xdbc.dbcc_dma, &xdbc.xdbc_reg->dccp); + + dev_info = cpu_to_le32((XDBC_VENDOR_ID << 16) | XDBC_PROTOCOL); + writel(dev_info, &xdbc.xdbc_reg->devinfo1); + + dev_info = cpu_to_le32((XDBC_DEVICE_REV << 16) | XDBC_PRODUCT_ID); + writel(dev_info, &xdbc.xdbc_reg->devinfo2); + + xdbc.in_buf = xdbc.out_buf + XDBC_MAX_PACKET; + xdbc.in_dma = xdbc.out_dma + XDBC_MAX_PACKET; +} + +static void xdbc_do_reset_debug_port(u32 id, u32 count) +{ + void __iomem *ops_reg; + void __iomem *portsc; + u32 val, cap_length; + int i; + + cap_length =
readl(xdbc.xhci_base) & 0xff; + ops_reg = xdbc.xhci_base + cap_length; + + id--; + for (i = id; i < (id + count); i++) { + portsc = ops_reg + 0x400 + i * 0x10; + val = readl(portsc); + if (!(val & PORT_CONNECT)) + writel(val | PORT_RESET, portsc); + } +} + +static void xdbc_reset_debug_port(void) +{ + u32 val, port_offset, port_count; + int offset = 0; + + do { + offset = xhci_find_next_ext_cap(xdbc.xhci_base, offset, XHCI_EXT_CAPS_PROTOCOL); + if (!offset) + break; + + val = readl(xdbc.xhci_base + offset); + if (XHCI_EXT_PORT_MAJOR(val) != 0x3) + continue; + + val = readl(xdbc.xhci_base + offset + 8); + port_offset = XHCI_EXT_PORT_OFF(val); + port_count = XHCI_EXT_PORT_COUNT(val); + + xdbc_do_reset_debug_port(port_offset, port_count); + } while (1); +} + +static void +xdbc_queue_trb(struct xdbc_ring *ring, u32 field1, u32 field2, u32 field3, u32 field4) +{ + struct xdbc_trb *trb, *link_trb; + + trb = ring->enqueue; + trb->field[0] = cpu_to_le32(field1); + trb->field[1] = cpu_to_le32(field2); + trb->field[2] = cpu_to_le32(field3); + trb->field[3] = cpu_to_le32(field4); + + ++(ring->enqueue); + if (ring->enqueue >= &ring->segment->trbs[TRBS_PER_SEGMENT - 1]) { + link_trb = ring->enqueue; + if (ring->cycle_state) + link_trb->field[3] |= cpu_to_le32(TRB_CYCLE); + else + link_trb->field[3] &= cpu_to_le32(~TRB_CYCLE); + + ring->enqueue = ring->segment->trbs; + ring->cycle_state ^= 1; + } +} + +static void xdbc_ring_doorbell(int target) +{ + writel(DOOR_BELL_TARGET(target), &xdbc.xdbc_reg->doorbell); +} + +static int xdbc_start(void) +{ + u32 ctrl, status; + int ret; + + ctrl = readl(&xdbc.xdbc_reg->control); + writel(ctrl | CTRL_DBC_ENABLE | CTRL_PORT_ENABLE, &xdbc.xdbc_reg->control); + ret = handshake(&xdbc.xdbc_reg->control, CTRL_DBC_ENABLE, CTRL_DBC_ENABLE, 100000, 100); + if (ret) { + xdbc_trace("failed to initialize hardware\n"); + return ret; + } + + /* Reset port to avoid bus hang: */ + if (xdbc.vendor == PCI_VENDOR_ID_INTEL) + xdbc_reset_debug_port(); + + /* Wait for port connection: */ + ret = handshake(&xdbc.xdbc_reg->portsc, PORTSC_CONN_STATUS, PORTSC_CONN_STATUS, 5000000, 100); + if (ret) { + xdbc_trace("waiting for connection timed out\n"); + return ret; + } + + /* Wait for debug device to be configured: */ + ret = handshake(&xdbc.xdbc_reg->control, CTRL_DBC_RUN, CTRL_DBC_RUN, 5000000, 100); + if (ret) { + xdbc_trace("waiting for device configuration timed out\n"); + return ret; + } + + /* Check port number: */ + status = readl(&xdbc.xdbc_reg->status); + if (!DCST_DEBUG_PORT(status)) { + xdbc_trace("invalid root hub port number\n"); + return -ENODEV; + } + + xdbc.port_number = DCST_DEBUG_PORT(status); + + xdbc_trace("DbC is running now, control 0x%08x port ID %d\n", + readl(&xdbc.xdbc_reg->control), xdbc.port_number); + + return 0; +} + +static int xdbc_bulk_transfer(void *data, int size, bool read) +{ + struct xdbc_ring *ring; + struct xdbc_trb *trb; + u32 length, control; + u32 cycle; + u64 addr; + + if (size > XDBC_MAX_PACKET) { + xdbc_trace("bad parameter, size %d\n", size); + return -EINVAL; + } + + if (!(xdbc.flags & XDBC_FLAGS_INITIALIZED) || + !(xdbc.flags & XDBC_FLAGS_CONFIGURED) || + (!read && (xdbc.flags & XDBC_FLAGS_OUT_STALL)) || + (read && (xdbc.flags & XDBC_FLAGS_IN_STALL))) { + + xdbc_trace("connection not ready, flags %08x\n", xdbc.flags); + return -EIO; + } + + ring = (read ? 
&xdbc.in_ring : &xdbc.out_ring); + trb = ring->enqueue; + cycle = ring->cycle_state; + length = TRB_LEN(size); + control = TRB_TYPE(TRB_NORMAL) | TRB_IOC; + + if (cycle) + control &= cpu_to_le32(~TRB_CYCLE); + else + control |= cpu_to_le32(TRB_CYCLE); + + if (read) { + memset(xdbc.in_buf, 0, XDBC_MAX_PACKET); + addr = xdbc.in_dma; + xdbc.flags |= XDBC_FLAGS_IN_PROCESS; + } else { + memset(xdbc.out_buf, 0, XDBC_MAX_PACKET); + memcpy(xdbc.out_buf, data, size); + addr = xdbc.out_dma; + xdbc.flags |= XDBC_FLAGS_OUT_PROCESS; + } + + xdbc_queue_trb(ring, lower_32_bits(addr), upper_32_bits(addr), length, control); + + /* + * Add a barrier between writes of trb fields and flipping + * the cycle bit: + */ + wmb(); + if (cycle) + trb->field[3] |= cpu_to_le32(cycle); + else + trb->field[3] &= cpu_to_le32(~TRB_CYCLE); + + xdbc_ring_doorbell(read ? IN_EP_DOORBELL : OUT_EP_DOORBELL); + + return size; +} + +static int xdbc_handle_external_reset(void) +{ + int ret = 0; + + xdbc.flags = 0; + writel(0, &xdbc.xdbc_reg->control); + ret = handshake(&xdbc.xdbc_reg->control, CTRL_DBC_ENABLE, 0, 100000, 10); + if (ret) + goto reset_out; + + xdbc_mem_init(); + + mmiowb(); + + ret = xdbc_start(); + if (ret < 0) + goto reset_out; + + xdbc_trace("dbc recovered\n"); + + xdbc.flags |= XDBC_FLAGS_INITIALIZED | XDBC_FLAGS_CONFIGURED; + + xdbc_bulk_transfer(NULL, XDBC_MAX_PACKET, true); + + return 0; + +reset_out: + xdbc_trace("failed to recover from external reset\n"); + return ret; +} + +static int __init xdbc_early_setup(void) +{ + int ret; + + writel(0, &xdbc.xdbc_reg->control); + ret = handshake(&xdbc.xdbc_reg->control, CTRL_DBC_ENABLE, 0, 100000, 100); + if (ret) + return ret; + + /* Allocate the table page: */ + xdbc.table_base = xdbc_get_page(&xdbc.table_dma); + if (!xdbc.table_base) + return -ENOMEM; + + /* Get and store the transfer buffer: */ + xdbc.out_buf = xdbc_get_page(&xdbc.out_dma); + if (!xdbc.out_buf) + return -ENOMEM; + + /* Allocate the event ring: */ + ret = xdbc_alloc_ring(&xdbc.evt_seg, &xdbc.evt_ring); + if (ret < 0) + return ret; + + /* Allocate IN/OUT endpoint transfer rings: */ + ret = xdbc_alloc_ring(&xdbc.in_seg, &xdbc.in_ring); + if (ret < 0) + return ret; + + ret = xdbc_alloc_ring(&xdbc.out_seg, &xdbc.out_ring); + if (ret < 0) + return ret; + + xdbc_mem_init(); + + mmiowb(); + + ret = xdbc_start(); + if (ret < 0) { + writel(0, &xdbc.xdbc_reg->control); + return ret; + } + + xdbc.flags |= XDBC_FLAGS_INITIALIZED | XDBC_FLAGS_CONFIGURED; + + xdbc_bulk_transfer(NULL, XDBC_MAX_PACKET, true); + + return 0; +} + +int __init early_xdbc_parse_parameter(char *s) +{ + unsigned long dbgp_num = 0; + u32 bus, dev, func, offset; + int ret; + + if (!early_pci_allowed()) + return -EPERM; + + if (strstr(s, "keep")) + early_console_keep = true; + + if (xdbc.xdbc_reg) + return 0; + + if (*s && kstrtoul(s, 0, &dbgp_num)) + dbgp_num = 0; + + pr_notice("dbgp_num: %lu\n", dbgp_num); + + /* Locate the host controller: */ + ret = xdbc_find_dbgp(dbgp_num, &bus, &dev, &func); + if (ret) { + pr_notice("failed to locate xhci host\n"); + return -ENODEV; + } + + xdbc.vendor = read_pci_config_16(bus, dev, func, PCI_VENDOR_ID); + xdbc.device = read_pci_config_16(bus, dev, func, PCI_DEVICE_ID); + xdbc.bus = bus; + xdbc.dev = dev; + xdbc.func = func; + + /* Map the IO memory: */ + xdbc.xhci_base = xdbc_map_pci_mmio(bus, dev, func); + if (!xdbc.xhci_base) + return -EINVAL; + + /* Locate DbC registers: */ + offset = xhci_find_next_ext_cap(xdbc.xhci_base, 0, XHCI_EXT_CAPS_DEBUG); + if (!offset) { + pr_notice("xhci host doesn't 
support debug capability\n"); + early_iounmap(xdbc.xhci_base, xdbc.xhci_length); + xdbc.xhci_base = NULL; + xdbc.xhci_length = 0; + + return -ENODEV; + } + xdbc.xdbc_reg = (struct xdbc_regs __iomem *)(xdbc.xhci_base + offset); + + return 0; +} + +int __init early_xdbc_setup_hardware(void) +{ + int ret; + + if (!xdbc.xdbc_reg) + return -ENODEV; + + xdbc_bios_handoff(); + + raw_spin_lock_init(&xdbc.lock); + + ret = xdbc_early_setup(); + if (ret) { + pr_notice("failed to setup the connection to host\n"); + + xdbc_free_ring(&xdbc.evt_ring); + xdbc_free_ring(&xdbc.out_ring); + xdbc_free_ring(&xdbc.in_ring); + + if (xdbc.table_dma) + free_bootmem(xdbc.table_dma, PAGE_SIZE); + + if (xdbc.out_dma) + free_bootmem(xdbc.out_dma, PAGE_SIZE); + + xdbc.table_base = NULL; + xdbc.out_buf = NULL; + } + + return ret; +} + +static void xdbc_handle_port_status(struct xdbc_trb *evt_trb) +{ + u32 port_reg; + + port_reg = readl(&xdbc.xdbc_reg->portsc); + if (port_reg & PORTSC_CONN_CHANGE) { + xdbc_trace("connect status change event\n"); + + /* Check whether cable unplugged: */ + if (!(port_reg & PORTSC_CONN_STATUS)) { + xdbc.flags = 0; + xdbc_trace("cable unplugged\n"); + } + } + + if (port_reg & PORTSC_RESET_CHANGE) + xdbc_trace("port reset change event\n"); + + if (port_reg & PORTSC_LINK_CHANGE) + xdbc_trace("port link status change event\n"); + + if (port_reg & PORTSC_CONFIG_CHANGE) + xdbc_trace("config error change\n"); + + /* Write back the value to clear RW1C bits: */ + writel(port_reg, &xdbc.xdbc_reg->portsc); +} + +static void xdbc_handle_tx_event(struct xdbc_trb *evt_trb) +{ + size_t remain_length; + u32 comp_code; + int ep_id; + + comp_code = GET_COMP_CODE(le32_to_cpu(evt_trb->field[2])); + remain_length = EVENT_TRB_LEN(le32_to_cpu(evt_trb->field[2])); + ep_id = TRB_TO_EP_ID(le32_to_cpu(evt_trb->field[3])); + + switch (comp_code) { + case COMP_SUCCESS: + remain_length = 0; + case COMP_SHORT_PACKET: + break; + case COMP_TRB_ERROR: + case COMP_BABBLE_DETECTED_ERROR: + case COMP_USB_TRANSACTION_ERROR: + case COMP_STALL_ERROR: + default: + if (ep_id == XDBC_EPID_OUT) + xdbc.flags |= XDBC_FLAGS_OUT_STALL; + if (ep_id == XDBC_EPID_IN) + xdbc.flags |= XDBC_FLAGS_IN_STALL; + + xdbc_trace("endpoint %d stalled\n", ep_id); + break; + } + + if (ep_id == XDBC_EPID_IN) { + xdbc.flags &= ~XDBC_FLAGS_IN_PROCESS; + xdbc_bulk_transfer(NULL, XDBC_MAX_PACKET, true); + } else if (ep_id == XDBC_EPID_OUT) { + xdbc.flags &= ~XDBC_FLAGS_OUT_PROCESS; + } else { + xdbc_trace("invalid endpoint id %d\n", ep_id); + } +} + +static void xdbc_handle_events(void) +{ + struct xdbc_trb *evt_trb; + bool update_erdp = false; + u32 reg; + u8 cmd; + + cmd = read_pci_config_byte(xdbc.bus, xdbc.dev, xdbc.func, PCI_COMMAND); + if (!(cmd & PCI_COMMAND_MASTER)) { + cmd |= PCI_COMMAND_MASTER | PCI_COMMAND_MEMORY; + write_pci_config_byte(xdbc.bus, xdbc.dev, xdbc.func, PCI_COMMAND, cmd); + } + + if (!(xdbc.flags & XDBC_FLAGS_INITIALIZED)) + return; + + /* Handle external reset events: */ + reg = readl(&xdbc.xdbc_reg->control); + if (!(reg & CTRL_DBC_ENABLE)) { + if (xdbc_handle_external_reset()) { + xdbc_trace("failed to recover connection\n"); + return; + } + } + + /* Handle configure-exit event: */ + reg = readl(&xdbc.xdbc_reg->control); + if (reg & CTRL_DBC_RUN_CHANGE) { + writel(reg, &xdbc.xdbc_reg->control); + if (reg & CTRL_DBC_RUN) + xdbc.flags |= XDBC_FLAGS_CONFIGURED; + else + xdbc.flags &= ~XDBC_FLAGS_CONFIGURED; + } + + /* Handle endpoint stall event: */ + reg = readl(&xdbc.xdbc_reg->control); + if (reg & CTRL_HALT_IN_TR) { + xdbc.flags 
|= XDBC_FLAGS_IN_STALL; + } else { + xdbc.flags &= ~XDBC_FLAGS_IN_STALL; + if (!(xdbc.flags & XDBC_FLAGS_IN_PROCESS)) + xdbc_bulk_transfer(NULL, XDBC_MAX_PACKET, true); + } + + if (reg & CTRL_HALT_OUT_TR) + xdbc.flags |= XDBC_FLAGS_OUT_STALL; + else + xdbc.flags &= ~XDBC_FLAGS_OUT_STALL; + + /* Handle the events in the event ring: */ + evt_trb = xdbc.evt_ring.dequeue; + while ((le32_to_cpu(evt_trb->field[3]) & TRB_CYCLE) == xdbc.evt_ring.cycle_state) { + /* + * Add a barrier between reading the cycle flag and any + * reads of the event's flags/data below: + */ + rmb(); + + switch ((le32_to_cpu(evt_trb->field[3]) & TRB_TYPE_BITMASK)) { + case TRB_TYPE(TRB_PORT_STATUS): + xdbc_handle_port_status(evt_trb); + break; + case TRB_TYPE(TRB_TRANSFER): + xdbc_handle_tx_event(evt_trb); + break; + default: + break; + } + + ++(xdbc.evt_ring.dequeue); + if (xdbc.evt_ring.dequeue == &xdbc.evt_seg.trbs[TRBS_PER_SEGMENT]) { + xdbc.evt_ring.dequeue = xdbc.evt_seg.trbs; + xdbc.evt_ring.cycle_state ^= 1; + } + + evt_trb = xdbc.evt_ring.dequeue; + update_erdp = true; + } + + /* Update event ring dequeue pointer: */ + if (update_erdp) + xdbc_write64(__pa(xdbc.evt_ring.dequeue), &xdbc.xdbc_reg->erdp); +} + +static int xdbc_bulk_write(const char *bytes, int size) +{ + int ret, timeout = 0; + unsigned long flags; + +retry: + if (in_nmi()) { + if (!raw_spin_trylock_irqsave(&xdbc.lock, flags)) + return -EAGAIN; + } else { + raw_spin_lock_irqsave(&xdbc.lock, flags); + } + + xdbc_handle_events(); + + /* Check completion of the previous request: */ + if ((xdbc.flags & XDBC_FLAGS_OUT_PROCESS) && (timeout < 2000000)) { + raw_spin_unlock_irqrestore(&xdbc.lock, flags); + udelay(100); + timeout += 100; + goto retry; + } + + if (xdbc.flags & XDBC_FLAGS_OUT_PROCESS) { + raw_spin_unlock_irqrestore(&xdbc.lock, flags); + xdbc_trace("previous transfer not completed yet\n"); + + return -ETIMEDOUT; + } + + ret = xdbc_bulk_transfer((void *)bytes, size, false); + raw_spin_unlock_irqrestore(&xdbc.lock, flags); + + return ret; +} + +static void early_xdbc_write(struct console *con, const char *str, u32 n) +{ + static char buf[XDBC_MAX_PACKET]; + int chunk, ret; + int use_cr = 0; + + if (!xdbc.xdbc_reg) + return; + memset(buf, 0, XDBC_MAX_PACKET); + while (n > 0) { + for (chunk = 0; chunk < XDBC_MAX_PACKET && n > 0; str++, chunk++, n--) { + + if (!use_cr && *str == '\n') { + use_cr = 1; + buf[chunk] = '\r'; + str--; + n++; + continue; + } + + if (use_cr) + use_cr = 0; + buf[chunk] = *str; + } + + if (chunk > 0) { + ret = xdbc_bulk_write(buf, chunk); + if (ret < 0) + xdbc_trace("missed message {%s}\n", buf); + } + } +} + +static struct console early_xdbc_console = { + .name = "earlyxdbc", + .write = early_xdbc_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +void __init early_xdbc_register_console(void) +{ + if (early_console) + return; + + early_console = &early_xdbc_console; + if (early_console_keep) + early_console->flags &= ~CON_BOOT; + else + early_console->flags |= CON_BOOT; + register_console(early_console); +} + +static void xdbc_unregister_console(void) +{ + if (early_xdbc_console.flags & CON_ENABLED) + unregister_console(&early_xdbc_console); +} + +static int xdbc_scrub_function(void *ptr) +{ + unsigned long flags; + + while (true) { + raw_spin_lock_irqsave(&xdbc.lock, flags); + xdbc_handle_events(); + + if (!(xdbc.flags & XDBC_FLAGS_INITIALIZED)) { + raw_spin_unlock_irqrestore(&xdbc.lock, flags); + break; + } + + raw_spin_unlock_irqrestore(&xdbc.lock, flags); + schedule_timeout_interruptible(1); + } + + 
xdbc_unregister_console(); + writel(0, &xdbc.xdbc_reg->control); + xdbc_trace("dbc scrub function exits\n"); + + return 0; +} + +static int __init xdbc_init(void) +{ + unsigned long flags; + void __iomem *base; + int ret = 0; + u32 offset; + + if (!(xdbc.flags & XDBC_FLAGS_INITIALIZED)) + return 0; + + /* + * It's time to shut down the DbC, so that the debug + * port can be reused by the host controller: + */ + if (early_xdbc_console.index == -1 || + (early_xdbc_console.flags & CON_BOOT)) { + xdbc_trace("hardware not used anymore\n"); + goto free_and_quit; + } + + base = ioremap_nocache(xdbc.xhci_start, xdbc.xhci_length); + if (!base) { + xdbc_trace("failed to remap the io address\n"); + ret = -ENOMEM; + goto free_and_quit; + } + + raw_spin_lock_irqsave(&xdbc.lock, flags); + early_iounmap(xdbc.xhci_base, xdbc.xhci_length); + xdbc.xhci_base = base; + offset = xhci_find_next_ext_cap(xdbc.xhci_base, 0, XHCI_EXT_CAPS_DEBUG); + xdbc.xdbc_reg = (struct xdbc_regs __iomem *)(xdbc.xhci_base + offset); + raw_spin_unlock_irqrestore(&xdbc.lock, flags); + + kthread_run(xdbc_scrub_function, NULL, "%s", "xdbc"); + + return 0; + +free_and_quit: + xdbc_free_ring(&xdbc.evt_ring); + xdbc_free_ring(&xdbc.out_ring); + xdbc_free_ring(&xdbc.in_ring); + free_bootmem(xdbc.table_dma, PAGE_SIZE); + free_bootmem(xdbc.out_dma, PAGE_SIZE); + writel(0, &xdbc.xdbc_reg->control); + early_iounmap(xdbc.xhci_base, xdbc.xhci_length); + + return ret; +} +subsys_initcall(xdbc_init); diff --git a/drivers/usb/early/xhci-dbc.h b/drivers/usb/early/xhci-dbc.h new file mode 100644 index 000000000000..2df0f6e613fe --- /dev/null +++ b/drivers/usb/early/xhci-dbc.h @@ -0,0 +1,211 @@ +/* + * xhci-dbc.h - xHCI debug capability early driver + * + * Copyright (C) 2016 Intel Corporation + * + * Author: Lu Baolu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef __LINUX_XHCI_DBC_H +#define __LINUX_XHCI_DBC_H + +#include +#include + +/* + * xHCI Debug Capability Register interfaces: + */ +struct xdbc_regs { + __le32 capability; + __le32 doorbell; + __le32 ersts; /* Event Ring Segment Table Size*/ + __le32 __reserved_0; /* 0c~0f reserved bits */ + __le64 erstba; /* Event Ring Segment Table Base Address */ + __le64 erdp; /* Event Ring Dequeue Pointer */ + __le32 control; + __le32 status; + __le32 portsc; /* Port status and control */ + __le32 __reserved_1; /* 2b~28 reserved bits */ + __le64 dccp; /* Debug Capability Context Pointer */ + __le32 devinfo1; /* Device Descriptor Info Register 1 */ + __le32 devinfo2; /* Device Descriptor Info Register 2 */ +}; + +#define DEBUG_MAX_BURST(p) (((p) >> 16) & 0xff) + +#define CTRL_DBC_RUN BIT(0) +#define CTRL_PORT_ENABLE BIT(1) +#define CTRL_HALT_OUT_TR BIT(2) +#define CTRL_HALT_IN_TR BIT(3) +#define CTRL_DBC_RUN_CHANGE BIT(4) +#define CTRL_DBC_ENABLE BIT(31) + +#define DCST_DEBUG_PORT(p) (((p) >> 24) & 0xff) + +#define PORTSC_CONN_STATUS BIT(0) +#define PORTSC_CONN_CHANGE BIT(17) +#define PORTSC_RESET_CHANGE BIT(21) +#define PORTSC_LINK_CHANGE BIT(22) +#define PORTSC_CONFIG_CHANGE BIT(23) + +/* + * xHCI Debug Capability data structures: + */ +struct xdbc_trb { + __le32 field[4]; +}; + +struct xdbc_erst_entry { + __le64 seg_addr; + __le32 seg_size; + __le32 __reserved_0; +}; + +struct xdbc_info_context { + __le64 string0; + __le64 manufacturer; + __le64 product; + __le64 serial; + __le32 length; + __le32 __reserved_0[7]; +}; + +struct xdbc_ep_context { + __le32 ep_info1; + __le32 ep_info2; + __le64 deq; + __le32 tx_info; + __le32 __reserved_0[11]; +}; + +struct xdbc_context { + struct xdbc_info_context info; + struct xdbc_ep_context out; + struct xdbc_ep_context in; +}; + +#define XDBC_INFO_CONTEXT_SIZE 48 +#define XDBC_MAX_STRING_LENGTH 64 +#define XDBC_STRING_MANUFACTURER "Linux" +#define XDBC_STRING_PRODUCT "Remote GDB" +#define XDBC_STRING_SERIAL "0001" + +struct xdbc_strings { + char string0[XDBC_MAX_STRING_LENGTH]; + char manufacturer[XDBC_MAX_STRING_LENGTH]; + char product[XDBC_MAX_STRING_LENGTH]; + char serial[XDBC_MAX_STRING_LENGTH]; +}; + +#define XDBC_PROTOCOL 1 /* GNU Remote Debug Command Set */ +#define XDBC_VENDOR_ID 0x1d6b /* Linux Foundation 0x1d6b */ +#define XDBC_PRODUCT_ID 0x0004 /* __le16 idProduct; device 0004 */ +#define XDBC_DEVICE_REV 0x0010 /* 0.10 */ + +/* + * xHCI Debug Capability software state structures: + */ +struct xdbc_segment { + struct xdbc_trb *trbs; + dma_addr_t dma; +}; + +#define XDBC_TRBS_PER_SEGMENT 256 + +struct xdbc_ring { + struct xdbc_segment *segment; + struct xdbc_trb *enqueue; + struct xdbc_trb *dequeue; + u32 cycle_state; +}; + +#define XDBC_EPID_OUT 2 +#define XDBC_EPID_IN 3 + +struct xdbc_state { + u16 vendor; + u16 device; + u32 bus; + u32 dev; + u32 func; + void __iomem *xhci_base; + u64 xhci_start; + size_t xhci_length; + int port_number; + + /* DbC register base */ + struct xdbc_regs __iomem *xdbc_reg; + + /* DbC table page */ + dma_addr_t table_dma; + void *table_base; + + /* event ring segment table */ + dma_addr_t erst_dma; + size_t erst_size; + void *erst_base; + + /* event ring segments */ + struct xdbc_ring evt_ring; + struct xdbc_segment evt_seg; + + /* debug capability contexts */ + dma_addr_t dbcc_dma; + size_t dbcc_size; + void *dbcc_base; + + /* descriptor strings */ + dma_addr_t string_dma; + size_t string_size; + void *string_base; + + /* bulk OUT endpoint */ + struct xdbc_ring out_ring; + struct xdbc_segment out_seg; + void 
*out_buf; + dma_addr_t out_dma; + + /* bulk IN endpoint */ + struct xdbc_ring in_ring; + struct xdbc_segment in_seg; + void *in_buf; + dma_addr_t in_dma; + + u32 flags; + + /* spinlock for early_xdbc_write() reentrancy */ + raw_spinlock_t lock; +}; + +#define XDBC_PCI_MAX_BUSES 256 +#define XDBC_PCI_MAX_DEVICES 32 +#define XDBC_PCI_MAX_FUNCTION 8 + +#define XDBC_TABLE_ENTRY_SIZE 64 +#define XDBC_ERST_ENTRY_NUM 1 +#define XDBC_DBCC_ENTRY_NUM 3 +#define XDBC_STRING_ENTRY_NUM 4 + +/* Bits definitions for xdbc_state.flags: */ +#define XDBC_FLAGS_INITIALIZED BIT(0) +#define XDBC_FLAGS_IN_STALL BIT(1) +#define XDBC_FLAGS_OUT_STALL BIT(2) +#define XDBC_FLAGS_IN_PROCESS BIT(3) +#define XDBC_FLAGS_OUT_PROCESS BIT(4) +#define XDBC_FLAGS_CONFIGURED BIT(5) + +#define XDBC_MAX_PACKET 1024 + +/* Door bell target: */ +#define OUT_EP_DOORBELL 0 +#define IN_EP_DOORBELL 1 +#define DOOR_BELL_TARGET(p) (((p) & 0xff) << 8) + +#define xdbc_read64(regs) xhci_read_64(NULL, (regs)) +#define xdbc_write64(val, regs) xhci_write_64(NULL, (val), (regs)) + +#endif /* __LINUX_XHCI_DBC_H */ diff --git a/include/linux/usb/xhci-dbgp.h b/include/linux/usb/xhci-dbgp.h new file mode 100644 index 000000000000..80c1cca1f529 --- /dev/null +++ b/include/linux/usb/xhci-dbgp.h @@ -0,0 +1,29 @@ +/* + * Standalone xHCI debug capability driver + * + * Copyright (C) 2016 Intel Corporation + * + * Author: Lu Baolu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __LINUX_XHCI_DBGP_H +#define __LINUX_XHCI_DBGP_H + +#ifdef CONFIG_EARLY_PRINTK_USB_XDBC +int __init early_xdbc_parse_parameter(char *s); +int __init early_xdbc_setup_hardware(void); +void __init early_xdbc_register_console(void); +#else +static inline int __init early_xdbc_setup_hardware(void) +{ + return -ENODEV; +} +static inline void __init early_xdbc_register_console(void) +{ +} +#endif /* CONFIG_EARLY_PRINTK_USB_XDBC */ +#endif /* __LINUX_XHCI_DBGP_H */ -- cgit v1.2.3 From 9143059fafd4eebed2d43ffb5455178d4010e60a Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 8 Mar 2017 15:11:14 +0100 Subject: HID: remove initial reading of reports at connect It looks like a bunch of devices do not like to be polled for their reports at init time. When you look into the details, it seems that for those that require the quirk HID_QUIRK_NO_INIT_REPORTS, the driver fails to retrieve part of the features/inputs while other, more generic ones work. IMO, it should be acceptable to remove the need for the quirk in the general case. In the small number of cases where we actually need to read the current values, the driver in charge (hid-mt or wacom) already retrieves the features manually. There are 2 cases where we might need to retrieve the reports at init: 1. hiddev devices with a specific user-space tool 2. a device that would require the driver to fetch a specific feature/input at plug For case 2, I have seen this a few times on hid-multitouch. It is solved in hid-multitouch directly by fetching the feature. I hope it won't be too common and this can be solved on a per-case basis (crossing fingers). For case 1, we moved the implementation of HID_QUIRK_NO_INIT_REPORTS into hiddev. When somebody starts calling ioctls that need an initial update, the hiddev device will fetch the initial state of the reports to mimic the current behavior.
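Concretely, the lazy initialization reduces to a guard in the hiddev ioctl path; a simplified sketch of the hiddev.c hunk further down (surrounding locking and error handling omitted):

	/* In hiddev_ioctl(), simplified: defer usbhid_init_reports() until
	 * the first ioctl that actually needs up-to-date report state. */
	case HIDIOCGUSAGES:
	case HIDIOCSUSAGES:
	case HIDIOCGCOLLECTIONINDEX:
		if (!hiddev->initialized) {
			usbhid_init_reports(hid);
			hiddev->initialized = true;
		}
		r = hiddev_ioctl_usage(hiddev, cmd, user_arg);
		break;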
This lazy initialization adds a small amount of time to the first HIDIOCGUSAGE(S) call, but that should be acceptable in most cases. To keep the currently known broken devices working, we have to keep HID_QUIRK_NO_INIT_REPORTS around, but its scope is now limited to hiddev. Note that I don't think hidraw would be affected, and I checked that the FF drivers that need to interact with the report fields all use output reports, which are not initialized by usbhid_init_reports(). NO_INIT_INPUT_REPORTS is then replaced by HID_QUIRK_NO_INIT_REPORTS: there is no point keeping it for just one device. Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/i2c-hid/i2c-hid.c | 63 ----------------------------------------- drivers/hid/usbhid/hid-core.c | 11 ++----- drivers/hid/usbhid/hid-quirks.c | 2 +- drivers/hid/usbhid/hiddev.c | 13 +++++++++ include/linux/hid.h | 2 +- 5 files changed, 18 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/i2c-hid/i2c-hid.c b/drivers/hid/i2c-hid/i2c-hid.c index ea3c3546cef7..042b90d451ee 100644 --- a/drivers/hid/i2c-hid/i2c-hid.c +++ b/drivers/hid/i2c-hid/i2c-hid.c @@ -508,66 +508,6 @@ static int i2c_hid_get_report_length(struct hid_report *report) report->device->report_enum[report->type].numbered + 2; } -static void i2c_hid_init_report(struct hid_report *report, u8 *buffer, - size_t bufsize) -{ - struct hid_device *hid = report->device; - struct i2c_client *client = hid->driver_data; - struct i2c_hid *ihid = i2c_get_clientdata(client); - unsigned int size, ret_size; - - size = i2c_hid_get_report_length(report); - if (i2c_hid_get_report(client, - report->type == HID_FEATURE_REPORT ? 0x03 : 0x01, - report->id, buffer, size)) - return; - - i2c_hid_dbg(ihid, "report (len=%d): %*ph\n", size, size, buffer); - - ret_size = buffer[0] | (buffer[1] << 8); - - if (ret_size != size) { - dev_err(&client->dev, "error in %s size:%d / ret_size:%d\n", - __func__, size, ret_size); - return; - } - - /* hid->driver_lock is held as we are in probe function, - * we just need to setup the input fields, so using - * hid_report_raw_event is safe. */ - hid_report_raw_event(hid, report->type, buffer + 2, size - 2, 1); -} - -/* - * Initialize all reports - */ -static void i2c_hid_init_reports(struct hid_device *hid) -{ - struct hid_report *report; - struct i2c_client *client = hid->driver_data; - struct i2c_hid *ihid = i2c_get_clientdata(client); - u8 *inbuf = kzalloc(ihid->bufsize, GFP_KERNEL); - - if (!inbuf) { - dev_err(&client->dev, "can not retrieve initial reports\n"); - return; - } - - /* - * The device must be powered on while we fetch initial reports - * from it.
- */ - pm_runtime_get_sync(&client->dev); - - list_for_each_entry(report, - &hid->report_enum[HID_FEATURE_REPORT].report_list, list) - i2c_hid_init_report(report, inbuf, ihid->bufsize); - - pm_runtime_put(&client->dev); - - kfree(inbuf); -} - /* * Traverse the supplied list of reports and find the longest */ @@ -789,9 +729,6 @@ static int i2c_hid_start(struct hid_device *hid) return ret; } - if (!(hid->quirks & HID_QUIRK_NO_INIT_REPORTS)) - i2c_hid_init_reports(hid); - return 0; } diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index 961bc6fdd2d9..00e72c6ffc76 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -753,11 +753,9 @@ void usbhid_init_reports(struct hid_device *hid) struct hid_report_enum *report_enum; int err, ret; - if (!(hid->quirks & HID_QUIRK_NO_INIT_INPUT_REPORTS)) { - report_enum = &hid->report_enum[HID_INPUT_REPORT]; - list_for_each_entry(report, &report_enum->report_list, list) - usbhid_submit_report(hid, report, USB_DIR_IN); - } + report_enum = &hid->report_enum[HID_INPUT_REPORT]; + list_for_each_entry(report, &report_enum->report_list, list) + usbhid_submit_report(hid, report, USB_DIR_IN); report_enum = &hid->report_enum[HID_FEATURE_REPORT]; list_for_each_entry(report, &report_enum->report_list, list) @@ -1120,9 +1118,6 @@ static int usbhid_start(struct hid_device *hid) usbhid->urbctrl->transfer_dma = usbhid->ctrlbuf_dma; usbhid->urbctrl->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; - if (!(hid->quirks & HID_QUIRK_NO_INIT_REPORTS)) - usbhid_init_reports(hid); - set_bit(HID_STARTED, &usbhid->iofl); if (hid->quirks & HID_QUIRK_ALWAYS_POLL) { diff --git a/drivers/hid/usbhid/hid-quirks.c b/drivers/hid/usbhid/hid-quirks.c index d6847a664446..ec4fdba39722 100644 --- a/drivers/hid/usbhid/hid-quirks.c +++ b/drivers/hid/usbhid/hid-quirks.c @@ -158,7 +158,7 @@ static const struct hid_blacklist { { USB_VENDOR_ID_SYNAPTICS, USB_DEVICE_ID_SYNAPTICS_HD, HID_QUIRK_NO_INIT_REPORTS }, { USB_VENDOR_ID_SYNAPTICS, USB_DEVICE_ID_SYNAPTICS_QUAD_HD, HID_QUIRK_NO_INIT_REPORTS }, { USB_VENDOR_ID_SYNAPTICS, USB_DEVICE_ID_SYNAPTICS_TP_V103, HID_QUIRK_NO_INIT_REPORTS }, - { USB_VENDOR_ID_HOLTEK_ALT, USB_DEVICE_ID_HOLTEK_ALT_KEYBOARD_A096, HID_QUIRK_NO_INIT_INPUT_REPORTS }, + { USB_VENDOR_ID_HOLTEK_ALT, USB_DEVICE_ID_HOLTEK_ALT_KEYBOARD_A096, HID_QUIRK_NO_INIT_REPORTS }, { USB_VENDOR_ID_MULTIPLE_1781, USB_DEVICE_ID_RAPHNET_4NES4SNES_OLD, HID_QUIRK_MULTI_INPUT }, { USB_VENDOR_ID_DRACAL_RAPHNET, USB_DEVICE_ID_RAPHNET_2NES2SNES, HID_QUIRK_MULTI_INPUT }, { USB_VENDOR_ID_DRACAL_RAPHNET, USB_DEVICE_ID_RAPHNET_4NES4SNES, HID_QUIRK_MULTI_INPUT }, diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c index 700145b15088..667171829f65 100644 --- a/drivers/hid/usbhid/hiddev.c +++ b/drivers/hid/usbhid/hiddev.c @@ -54,6 +54,7 @@ struct hiddev { struct hid_device *hid; struct list_head list; spinlock_t list_lock; + bool initialized; }; struct hiddev_list { @@ -689,6 +690,7 @@ static long hiddev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case HIDIOCINITREPORT: usbhid_init_reports(hid); + hiddev->initialized = true; r = 0; break; @@ -790,6 +792,10 @@ static long hiddev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case HIDIOCGUSAGES: case HIDIOCSUSAGES: case HIDIOCGCOLLECTIONINDEX: + if (!hiddev->initialized) { + usbhid_init_reports(hid); + hiddev->initialized = true; + } r = hiddev_ioctl_usage(hiddev, cmd, user_arg); break; @@ -910,6 +916,13 @@ int hiddev_connect(struct hid_device *hid, unsigned int force) 
kfree(hiddev); return -1; } + + /* + * If HID_QUIRK_NO_INIT_REPORTS is set, make sure we don't initialize + * the reports. + */ + hiddev->initialized = hid->quirks & HID_QUIRK_NO_INIT_REPORTS; + return 0; } diff --git a/include/linux/hid.h b/include/linux/hid.h index 28f38e2b8f30..b2e472c3e595 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -322,7 +322,7 @@ struct hid_item { #define HID_QUIRK_MULTI_INPUT 0x00000040 #define HID_QUIRK_HIDINPUT_FORCE 0x00000080 #define HID_QUIRK_NO_EMPTY_INPUT 0x00000100 -#define HID_QUIRK_NO_INIT_INPUT_REPORTS 0x00000200 +/* 0x00000200 reserved for backward compatibility, was NO_INIT_INPUT_REPORTS */ #define HID_QUIRK_ALWAYS_POLL 0x00000400 #define HID_QUIRK_SKIP_OUTPUT_REPORTS 0x00010000 #define HID_QUIRK_SKIP_OUTPUT_REPORT_ID 0x00020000 -- cgit v1.2.3 From 733aca90300b76575b8a465dc49cbed7a991fd8b Mon Sep 17 00:00:00 2001 From: Jaejoong Kim Date: Fri, 3 Mar 2017 17:54:01 +0900 Subject: HID: hiddev: reallocate hiddev's minor number We need to store the minor number for each driver. In the case of hidraw, the minor number is stored in struct hidraw, but hiddev's minor is located in struct hid_device. The hid-core driver prints a kernel message announcing which driver is loaded when a HID device is connected, but hiddev's minor number is always zero. To properly display hiddev's minor number, we need to store the minor number assigned by the usb core and do some refactoring work (move the structure definition from hiddev.c to hiddev.h) so that hid-core can access hiddev. [jkosina@suse.cz: rebase on top of newer codebase] Reviewed-by: Benjamin Tissoires Signed-off-by: Jaejoong Kim Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 2 +- drivers/hid/usbhid/hiddev.c | 13 ++----------- include/linux/hid.h | 1 - include/linux/hiddev.h | 12 ++++++++++++ 4 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index e9e87d337446..1a0b91057484 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1695,7 +1695,7 @@ int hid_connect(struct hid_device *hdev, unsigned int connect_mask) len += sprintf(buf + len, "input"); if (hdev->claimed & HID_CLAIMED_HIDDEV) len += sprintf(buf + len, "%shiddev%d", len ? "," : "", - hdev->minor); + ((struct hiddev *)hdev->hiddev)->minor); if (hdev->claimed & HID_CLAIMED_HIDRAW) len += sprintf(buf + len, "%shidraw%d", len ?
"," : "", ((struct hidraw *)hdev->hidraw)->minor); diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c index 667171829f65..a8baaf60e28a 100644 --- a/drivers/hid/usbhid/hiddev.c +++ b/drivers/hid/usbhid/hiddev.c @@ -46,17 +46,6 @@ #endif #define HIDDEV_BUFFER_SIZE 2048 -struct hiddev { - int exist; - int open; - struct mutex existancelock; - wait_queue_head_t wait; - struct hid_device *hid; - struct list_head list; - spinlock_t list_lock; - bool initialized; -}; - struct hiddev_list { struct hiddev_usage_ref buffer[HIDDEV_BUFFER_SIZE]; int head; @@ -923,6 +912,8 @@ int hiddev_connect(struct hid_device *hid, unsigned int force) */ hiddev->initialized = hid->quirks & HID_QUIRK_NO_INIT_REPORTS; + hiddev->minor = usbhid->intf->minor; + return 0; } diff --git a/include/linux/hid.h b/include/linux/hid.h index b2e472c3e595..24de54835e52 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -541,7 +541,6 @@ struct hid_device { /* device report descriptor */ struct list_head inputs; /* The list of inputs */ void *hiddev; /* The hiddev structure */ void *hidraw; - int minor; /* Hiddev minor number */ int open; /* is the device open by anyone? */ char name[128]; /* Device name */ diff --git a/include/linux/hiddev.h b/include/linux/hiddev.h index a5dd8148660b..921622222957 100644 --- a/include/linux/hiddev.h +++ b/include/linux/hiddev.h @@ -32,6 +32,18 @@ * In-kernel definitions. */ +struct hiddev { + int minor; + int exist; + int open; + struct mutex existancelock; + wait_queue_head_t wait; + struct hid_device *hid; + struct list_head list; + spinlock_t list_lock; + bool initialized; +}; + struct hid_device; struct hid_usage; struct hid_field; -- cgit v1.2.3 From 4875253fddd7b6d322f028ad023d44b6efb7f73b Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 21 Mar 2017 08:56:07 -0700 Subject: blk-stat: move BLK_RQ_STAT_BATCH definition to blk-stat.c This is an implementation detail that no-one outside of blk-stat.c uses. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-stat.c | 2 ++ include/linux/blk_types.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/block/blk-stat.c b/block/blk-stat.c index f80582be5344..4681c488c262 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -9,6 +9,8 @@ #include "blk-stat.h" #include "blk-mq.h" +#define BLK_RQ_STAT_BATCH 64 + static void blk_stat_flush_batch(struct blk_rq_stat *stat) { const s32 nr_batch = READ_ONCE(stat->nr_batch); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d703acb55d0f..e213c5e7500b 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -287,8 +287,6 @@ struct blk_issue_stat { u64 time; }; -#define BLK_RQ_STAT_BATCH 64 - struct blk_rq_stat { s64 mean; u64 min; -- cgit v1.2.3 From 34dbad5d26e2f4b88e60f0e9ad03f99480802812 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 21 Mar 2017 08:56:08 -0700 Subject: blk-stat: convert to callback-based statistics reporting Currently, statistics are gathered in ~0.13s windows, and users grab the statistics whenever they need them. This is not ideal for both in-tree users: 1. Writeback throttling wants its own dynamically sized window of statistics. Since the blk-stats statistics are reset after every window and the wbt windows don't line up with the blk-stats windows, wbt doesn't see every I/O. 2. Polling currently grabs the statistics on every I/O. Again, depending on how the window lines up, we may miss some I/Os. 
It's also unnecessary overhead to get the statistics on every I/O; the hybrid polling heuristic would be just as happy with the statistics from the previous full window. This reworks the blk-stats infrastructure to be callback-based: users register a callback that they want called at a given time with all of the statistics from the window during which the callback was active. Users can dynamically bucketize the statistics. wbt and polling both currently use read vs. write, but polling can be extended to further subdivide based on request size. The callbacks are kept on an RCU list, and each callback has percpu stats buffers. There will only be a few users, so the overhead on the I/O completion side is low. The stats flushing is also simplified considerably: since the timer function is responsible for clearing the statistics, we don't have to worry about stale statistics. wbt is a trivial conversion. After the conversion, the windowing problem mentioned above is fixed. For polling, we register an extra callback that caches the previous window's statistics in the struct request_queue for the hybrid polling heuristic to use. Since we no longer have a single stats buffer for the request queue, this also removes the sysfs and debugfs stats entries. To replace those, we add a debugfs entry for the poll statistics. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-core.c | 6 +- block/blk-mq-debugfs.c | 99 +++++++-------- block/blk-mq.c | 76 +++++++---- block/blk-mq.h | 1 - block/blk-stat.c | 311 ++++++++++++++++++++++------------------------ block/blk-stat.h | 181 +++++++++++++++++++++++++-- block/blk-sysfs.c | 31 +---- block/blk-wbt.c | 51 +++----- block/blk-wbt.h | 2 +- include/linux/blk_types.h | 1 - include/linux/blkdev.h | 10 +- 11 files changed, 449 insertions(+), 320 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index e8a9bc0d4bbb..78d04ddededc 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -852,6 +852,10 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio); int blk_init_allocated_queue(struct request_queue *q) { + q->stats = blk_alloc_queue_stats(); + if (!q->stats) + return -ENOMEM; + q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size); if (!q->fq) return -ENOMEM; @@ -2698,7 +2702,7 @@ void blk_finish_request(struct request *req, int error) struct request_queue *q = req->q; if (req->rq_flags & RQF_STATS) - blk_stat_add(&q->rq_stats[rq_data_dir(req)], req); + blk_stat_add(req); if (req->rq_flags & RQF_QUEUED) blk_queue_end_tag(q, req); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 48c88723944a..4b3f962a9c7a 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -43,6 +43,42 @@ static int blk_mq_debugfs_seq_open(struct inode *inode, struct file *file, return ret; } +static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) +{ + if (stat->nr_samples) { + seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu", + stat->nr_samples, stat->mean, stat->min, stat->max); + } else { + seq_puts(m, "samples=0"); + } +} + +static int queue_poll_stat_show(struct seq_file *m, void *v) +{ + struct request_queue *q = m->private; + + seq_puts(m, "read: "); + print_stat(m, &q->poll_stat[READ]); + seq_puts(m, "\n"); + + seq_puts(m, "write: "); + print_stat(m, &q->poll_stat[WRITE]); + seq_puts(m, "\n"); + return 0; +} + +static int queue_poll_stat_open(struct inode *inode, struct file *file) +{ + return single_open(file, queue_poll_stat_show, inode->i_private); 
+} + +static const struct file_operations queue_poll_stat_fops = { + .open = queue_poll_stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int hctx_state_show(struct seq_file *m, void *v) { struct blk_mq_hw_ctx *hctx = m->private; @@ -322,60 +358,6 @@ static const struct file_operations hctx_io_poll_fops = { .release = single_release, }; -static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) -{ - seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu", - stat->nr_samples, stat->mean, stat->min, stat->max); -} - -static int hctx_stats_show(struct seq_file *m, void *v) -{ - struct blk_mq_hw_ctx *hctx = m->private; - struct blk_rq_stat stat[2]; - - blk_stat_init(&stat[READ]); - blk_stat_init(&stat[WRITE]); - - blk_hctx_stat_get(hctx, stat); - - seq_puts(m, "read: "); - print_stat(m, &stat[READ]); - seq_puts(m, "\n"); - - seq_puts(m, "write: "); - print_stat(m, &stat[WRITE]); - seq_puts(m, "\n"); - return 0; -} - -static int hctx_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, hctx_stats_show, inode->i_private); -} - -static ssize_t hctx_stats_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct seq_file *m = file->private_data; - struct blk_mq_hw_ctx *hctx = m->private; - struct blk_mq_ctx *ctx; - int i; - - hctx_for_each_ctx(hctx, ctx, i) { - blk_stat_init(&ctx->stat[READ]); - blk_stat_init(&ctx->stat[WRITE]); - } - return count; -} - -static const struct file_operations hctx_stats_fops = { - .open = hctx_stats_open, - .read = seq_read, - .write = hctx_stats_write, - .llseek = seq_lseek, - .release = single_release, -}; - static int hctx_dispatched_show(struct seq_file *m, void *v) { struct blk_mq_hw_ctx *hctx = m->private; @@ -636,6 +618,11 @@ static const struct file_operations ctx_completed_fops = { .release = single_release, }; +static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { + {"poll_stat", 0400, &queue_poll_stat_fops}, + {}, +}; + static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"state", 0400, &hctx_state_fops}, {"flags", 0400, &hctx_flags_fops}, @@ -646,7 +633,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"sched_tags", 0400, &hctx_sched_tags_fops}, {"sched_tags_bitmap", 0400, &hctx_sched_tags_bitmap_fops}, {"io_poll", 0600, &hctx_io_poll_fops}, - {"stats", 0600, &hctx_stats_fops}, {"dispatched", 0600, &hctx_dispatched_fops}, {"queued", 0600, &hctx_queued_fops}, {"run", 0600, &hctx_run_fops}, @@ -753,6 +739,9 @@ int blk_mq_debugfs_register_hctxs(struct request_queue *q) if (!q->mq_debugfs_dir) goto err; + if (!debugfs_create_files(q->mq_debugfs_dir, q, blk_mq_debugfs_queue_attrs)) + goto err; + queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_debugfs_register_hctx(q, hctx)) goto err; diff --git a/block/blk-mq.c b/block/blk-mq.c index 559e5363bb2c..5ff66f203cd0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -39,6 +39,9 @@ static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); +static void blk_mq_poll_stats_start(struct request_queue *q); +static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); + /* * Check if any of the ctx's have pending work in this hardware queue */ @@ -432,15 +435,8 @@ static void blk_mq_ipi_complete_request(struct request *rq) static void blk_mq_stat_add(struct request *rq) { if (rq->rq_flags & RQF_STATS) { - /* - * We could rq->mq_ctx here, but there's less of a risk - * of races if we have the completion event add the stats - * to 
the local software queue. - */ - struct blk_mq_ctx *ctx; - - ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id()); - blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq); + blk_mq_poll_stats_start(rq->q); + blk_stat_add(rq); } } @@ -2040,8 +2036,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, spin_lock_init(&__ctx->lock); INIT_LIST_HEAD(&__ctx->rq_list); __ctx->queue = q; - blk_stat_init(&__ctx->stat[READ]); - blk_stat_init(&__ctx->stat[WRITE]); /* If the cpu isn't online, the cpu is mapped to first hctx */ if (!cpu_online(i)) @@ -2339,6 +2333,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, /* mark the queue as mq asap */ q->mq_ops = set->ops; + q->stats = blk_alloc_queue_stats(); + if (!q->stats) + goto err_exit; + + q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn, + blk_stat_rq_ddir, 2, q); + if (!q->poll_cb) + goto err_exit; + q->queue_ctx = alloc_percpu(struct blk_mq_ctx); if (!q->queue_ctx) goto err_exit; @@ -2740,27 +2743,52 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); +/* Enable polling stats and return whether they were already enabled. */ +static bool blk_poll_stats_enable(struct request_queue *q) +{ + if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || + test_and_set_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) + return true; + blk_stat_add_callback(q, q->poll_cb); + return false; +} + +static void blk_mq_poll_stats_start(struct request_queue *q) +{ + /* + * We don't arm the callback if polling stats are not enabled or the + * callback is already active. + */ + if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || + blk_stat_is_active(q->poll_cb)) + return; + + blk_stat_activate_msecs(q->poll_cb, 100); +} + +static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb) +{ + struct request_queue *q = cb->data; + + if (cb->stat[READ].nr_samples) + q->poll_stat[READ] = cb->stat[READ]; + if (cb->stat[WRITE].nr_samples) + q->poll_stat[WRITE] = cb->stat[WRITE]; +} + static unsigned long blk_mq_poll_nsecs(struct request_queue *q, struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct blk_rq_stat stat[2]; unsigned long ret = 0; /* * If stats collection isn't on, don't sleep but turn it on for * future users */ - if (!blk_stat_enable(q)) + if (!blk_poll_stats_enable(q)) return 0; - /* - * We don't have to do this once per IO, should optimize this - * to just use the current window of stats until it changes - */ - memset(&stat, 0, sizeof(stat)); - blk_hctx_stat_get(hctx, stat); - /* * As an optimistic guess, use half of the mean service time * for this type of request. We can (and should) make this smarter. @@ -2769,10 +2797,10 @@ static unsigned long blk_mq_poll_nsecs(struct request_queue *q, * important on devices where the completion latencies are longer * than ~10 usec. 
*/ - if (req_op(rq) == REQ_OP_READ && stat[READ].nr_samples) - ret = (stat[READ].mean + 1) / 2; - else if (req_op(rq) == REQ_OP_WRITE && stat[WRITE].nr_samples) - ret = (stat[WRITE].mean + 1) / 2; + if (req_op(rq) == REQ_OP_READ && q->poll_stat[READ].nr_samples) + ret = (q->poll_stat[READ].mean + 1) / 2; + else if (req_op(rq) == REQ_OP_WRITE && q->poll_stat[WRITE].nr_samples) + ret = (q->poll_stat[WRITE].mean + 1) / 2; return ret; } diff --git a/block/blk-mq.h b/block/blk-mq.h index b79f9a7d8cf6..8d49c06fc520 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -20,7 +20,6 @@ struct blk_mq_ctx { /* incremented at completion time */ unsigned long ____cacheline_aligned_in_smp rq_completed[2]; - struct blk_rq_stat stat[2]; struct request_queue *queue; struct kobject kobj; diff --git a/block/blk-stat.c b/block/blk-stat.c index 4681c488c262..0d8721a60db9 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -4,6 +4,7 @@ * Copyright (C) 2016 Jens Axboe */ #include +#include #include #include "blk-stat.h" @@ -11,6 +12,24 @@ #define BLK_RQ_STAT_BATCH 64 +struct blk_queue_stats { + struct list_head callbacks; + spinlock_t lock; +}; + +unsigned int blk_stat_rq_ddir(const struct request *rq) +{ + return rq_data_dir(rq); +} +EXPORT_SYMBOL_GPL(blk_stat_rq_ddir); + +static void blk_stat_init(struct blk_rq_stat *stat) +{ + stat->min = -1ULL; + stat->max = stat->nr_samples = stat->mean = 0; + stat->batch = stat->nr_batch = 0; +} + static void blk_stat_flush_batch(struct blk_rq_stat *stat) { const s32 nr_batch = READ_ONCE(stat->nr_batch); @@ -50,207 +69,171 @@ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) dst->nr_samples += src->nr_samples; } -static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +static void __blk_stat_add(struct blk_rq_stat *stat, u64 value) { - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; - uint64_t latest = 0; - int i, j, nr; - - blk_stat_init(&dst[READ]); - blk_stat_init(&dst[WRITE]); - - nr = 0; - do { - uint64_t newest = 0; - - queue_for_each_hw_ctx(q, hctx, i) { - hctx_for_each_ctx(hctx, ctx, j) { - blk_stat_flush_batch(&ctx->stat[READ]); - blk_stat_flush_batch(&ctx->stat[WRITE]); - - if (!ctx->stat[READ].nr_samples && - !ctx->stat[WRITE].nr_samples) - continue; - if (ctx->stat[READ].time > newest) - newest = ctx->stat[READ].time; - if (ctx->stat[WRITE].time > newest) - newest = ctx->stat[WRITE].time; - } - } + stat->min = min(stat->min, value); + stat->max = max(stat->max, value); - /* - * No samples - */ - if (!newest) - break; - - if (newest > latest) - latest = newest; - - queue_for_each_hw_ctx(q, hctx, i) { - hctx_for_each_ctx(hctx, ctx, j) { - if (ctx->stat[READ].time == newest) { - blk_stat_sum(&dst[READ], - &ctx->stat[READ]); - nr++; - } - if (ctx->stat[WRITE].time == newest) { - blk_stat_sum(&dst[WRITE], - &ctx->stat[WRITE]); - nr++; - } - } - } - /* - * If we race on finding an entry, just loop back again. - * Should be very rare. 
- */ - } while (!nr); + if (stat->batch + value < stat->batch || + stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) + blk_stat_flush_batch(stat); - dst[READ].time = dst[WRITE].time = latest; + stat->batch += value; + stat->nr_batch++; } -void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +void blk_stat_add(struct request *rq) { - if (q->mq_ops) - blk_mq_stat_get(q, dst); - else { - blk_stat_flush_batch(&q->rq_stats[READ]); - blk_stat_flush_batch(&q->rq_stats[WRITE]); - memcpy(&dst[READ], &q->rq_stats[READ], - sizeof(struct blk_rq_stat)); - memcpy(&dst[WRITE], &q->rq_stats[WRITE], - sizeof(struct blk_rq_stat)); + struct request_queue *q = rq->q; + struct blk_stat_callback *cb; + struct blk_rq_stat *stat; + int bucket; + s64 now, value; + + now = __blk_stat_time(ktime_to_ns(ktime_get())); + if (now < blk_stat_time(&rq->issue_stat)) + return; + + value = now - blk_stat_time(&rq->issue_stat); + + rcu_read_lock(); + list_for_each_entry_rcu(cb, &q->stats->callbacks, list) { + if (blk_stat_is_active(cb)) { + bucket = cb->bucket_fn(rq); + stat = &this_cpu_ptr(cb->cpu_stat)[bucket]; + __blk_stat_add(stat, value); + } } + rcu_read_unlock(); } -void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst) +static void blk_stat_timer_fn(unsigned long data) { - struct blk_mq_ctx *ctx; - unsigned int i, nr; + struct blk_stat_callback *cb = (void *)data; + unsigned int bucket; + int cpu; - nr = 0; - do { - uint64_t newest = 0; + for (bucket = 0; bucket < cb->buckets; bucket++) + blk_stat_init(&cb->stat[bucket]); - hctx_for_each_ctx(hctx, ctx, i) { - blk_stat_flush_batch(&ctx->stat[READ]); - blk_stat_flush_batch(&ctx->stat[WRITE]); + for_each_online_cpu(cpu) { + struct blk_rq_stat *cpu_stat; - if (!ctx->stat[READ].nr_samples && - !ctx->stat[WRITE].nr_samples) - continue; - - if (ctx->stat[READ].time > newest) - newest = ctx->stat[READ].time; - if (ctx->stat[WRITE].time > newest) - newest = ctx->stat[WRITE].time; + cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); + for (bucket = 0; bucket < cb->buckets; bucket++) { + blk_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]); + blk_stat_init(&cpu_stat[bucket]); } + } - if (!newest) - break; - - hctx_for_each_ctx(hctx, ctx, i) { - if (ctx->stat[READ].time == newest) { - blk_stat_sum(&dst[READ], &ctx->stat[READ]); - nr++; - } - if (ctx->stat[WRITE].time == newest) { - blk_stat_sum(&dst[WRITE], &ctx->stat[WRITE]); - nr++; - } - } - /* - * If we race on finding an entry, just loop back again. 
- * Should be very rare, as the window is only updated - * occasionally - */ - } while (!nr); + cb->timer_fn(cb); } -static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now) +struct blk_stat_callback * +blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *), + unsigned int (*bucket_fn)(const struct request *), + unsigned int buckets, void *data) { - stat->min = -1ULL; - stat->max = stat->nr_samples = stat->mean = 0; - stat->batch = stat->nr_batch = 0; - stat->time = time_now & BLK_STAT_NSEC_MASK; -} + struct blk_stat_callback *cb; -void blk_stat_init(struct blk_rq_stat *stat) -{ - __blk_stat_init(stat, ktime_to_ns(ktime_get())); -} + cb = kmalloc(sizeof(*cb), GFP_KERNEL); + if (!cb) + return NULL; -static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now) -{ - return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK); -} + cb->stat = kmalloc_array(buckets, sizeof(struct blk_rq_stat), + GFP_KERNEL); + if (!cb->stat) { + kfree(cb); + return NULL; + } + cb->cpu_stat = __alloc_percpu(buckets * sizeof(struct blk_rq_stat), + __alignof__(struct blk_rq_stat)); + if (!cb->cpu_stat) { + kfree(cb->stat); + kfree(cb); + return NULL; + } -bool blk_stat_is_current(struct blk_rq_stat *stat) -{ - return __blk_stat_is_current(stat, ktime_to_ns(ktime_get())); + cb->timer_fn = timer_fn; + cb->bucket_fn = bucket_fn; + cb->data = data; + cb->buckets = buckets; + setup_timer(&cb->timer, blk_stat_timer_fn, (unsigned long)cb); + + return cb; } +EXPORT_SYMBOL_GPL(blk_stat_alloc_callback); -void blk_stat_add(struct blk_rq_stat *stat, struct request *rq) +void blk_stat_add_callback(struct request_queue *q, + struct blk_stat_callback *cb) { - s64 now, value; + unsigned int bucket; + int cpu; - now = __blk_stat_time(ktime_to_ns(ktime_get())); - if (now < blk_stat_time(&rq->issue_stat)) - return; + for_each_possible_cpu(cpu) { + struct blk_rq_stat *cpu_stat; - if (!__blk_stat_is_current(stat, now)) - __blk_stat_init(stat, now); + cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); + for (bucket = 0; bucket < cb->buckets; bucket++) + blk_stat_init(&cpu_stat[bucket]); + } - value = now - blk_stat_time(&rq->issue_stat); - if (value > stat->max) - stat->max = value; - if (value < stat->min) - stat->min = value; + spin_lock(&q->stats->lock); + list_add_tail_rcu(&cb->list, &q->stats->callbacks); + set_bit(QUEUE_FLAG_STATS, &q->queue_flags); + spin_unlock(&q->stats->lock); +} +EXPORT_SYMBOL_GPL(blk_stat_add_callback); - if (stat->batch + value < stat->batch || - stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) - blk_stat_flush_batch(stat); +void blk_stat_remove_callback(struct request_queue *q, + struct blk_stat_callback *cb) +{ + spin_lock(&q->stats->lock); + list_del_rcu(&cb->list); + if (list_empty(&q->stats->callbacks)) + clear_bit(QUEUE_FLAG_STATS, &q->queue_flags); + spin_unlock(&q->stats->lock); - stat->batch += value; - stat->nr_batch++; + del_timer_sync(&cb->timer); } +EXPORT_SYMBOL_GPL(blk_stat_remove_callback); -void blk_stat_clear(struct request_queue *q) +static void blk_stat_free_callback_rcu(struct rcu_head *head) { - if (q->mq_ops) { - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; - int i, j; - - queue_for_each_hw_ctx(q, hctx, i) { - hctx_for_each_ctx(hctx, ctx, j) { - blk_stat_init(&ctx->stat[READ]); - blk_stat_init(&ctx->stat[WRITE]); - } - } - } else { - blk_stat_init(&q->rq_stats[READ]); - blk_stat_init(&q->rq_stats[WRITE]); - } + struct blk_stat_callback *cb; + + cb = container_of(head, struct blk_stat_callback, rcu); + free_percpu(cb->cpu_stat); + kfree(cb->stat); 
+ kfree(cb); } -void blk_stat_set_issue_time(struct blk_issue_stat *stat) +void blk_stat_free_callback(struct blk_stat_callback *cb) { - stat->time = (stat->time & BLK_STAT_MASK) | - (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK); + call_rcu(&cb->rcu, blk_stat_free_callback_rcu); } +EXPORT_SYMBOL_GPL(blk_stat_free_callback); -/* - * Enable stat tracking, return whether it was enabled - */ -bool blk_stat_enable(struct request_queue *q) +struct blk_queue_stats *blk_alloc_queue_stats(void) { - if (!test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { - set_bit(QUEUE_FLAG_STATS, &q->queue_flags); - return false; - } + struct blk_queue_stats *stats; + + stats = kmalloc(sizeof(*stats), GFP_KERNEL); + if (!stats) + return NULL; + + INIT_LIST_HEAD(&stats->callbacks); + spin_lock_init(&stats->lock); + + return stats; +} + +void blk_free_queue_stats(struct blk_queue_stats *stats) +{ + if (!stats) + return; + + WARN_ON(!list_empty(&stats->callbacks)); - return true; + kfree(stats); } diff --git a/block/blk-stat.h b/block/blk-stat.h index 34384328b46b..6ad5b8c59a79 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -1,11 +1,11 @@ #ifndef BLK_STAT_H #define BLK_STAT_H -/* - * ~0.13s window as a power-of-2 (2^27 nsecs) - */ -#define BLK_STAT_NSEC 134217728ULL -#define BLK_STAT_NSEC_MASK ~(BLK_STAT_NSEC - 1) +#include +#include +#include +#include +#include /* * Upper 3 bits can be used elsewhere @@ -15,14 +15,69 @@ #define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SHIFT) - 1) #define BLK_STAT_MASK ~BLK_STAT_TIME_MASK -void blk_stat_add(struct blk_rq_stat *, struct request *); -void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *); -void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *); -void blk_stat_clear(struct request_queue *); -void blk_stat_init(struct blk_rq_stat *); -bool blk_stat_is_current(struct blk_rq_stat *); -void blk_stat_set_issue_time(struct blk_issue_stat *); -bool blk_stat_enable(struct request_queue *); +/** + * struct blk_stat_callback - Block statistics callback. + * + * A &struct blk_stat_callback is associated with a &struct request_queue. While + * @timer is active, that queue's request completion latencies are sorted into + * buckets by @bucket_fn and added to a per-cpu buffer, @cpu_stat. When the + * timer fires, @cpu_stat is flushed to @stat and @timer_fn is invoked. + */ +struct blk_stat_callback { + /* + * @list: RCU list of callbacks for a &struct request_queue. + */ + struct list_head list; + + /** + * @timer: Timer for the next callback invocation. + */ + struct timer_list timer; + + /** + * @cpu_stat: Per-cpu statistics buckets. + */ + struct blk_rq_stat __percpu *cpu_stat; + + /** + * @bucket_fn: Given a request, returns which statistics bucket it + * should be accounted under. + */ + unsigned int (*bucket_fn)(const struct request *); + + /** + * @buckets: Number of statistics buckets. + */ + unsigned int buckets; + + /** + * @stat: Array of statistics buckets. + */ + struct blk_rq_stat *stat; + + /** + * @fn: Callback function. + */ + void (*timer_fn)(struct blk_stat_callback *); + + /** + * @data: Private pointer for the user. 
+ */ + void *data; + + struct rcu_head rcu; +}; + +struct blk_queue_stats *blk_alloc_queue_stats(void); +void blk_free_queue_stats(struct blk_queue_stats *); + +void blk_stat_add(struct request *); + +static inline void blk_stat_set_issue_time(struct blk_issue_stat *stat) +{ + stat->time = ((stat->time & BLK_STAT_MASK) | + (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK)); +} static inline u64 __blk_stat_time(u64 time) { @@ -34,4 +89,104 @@ static inline u64 blk_stat_time(struct blk_issue_stat *stat) return __blk_stat_time(stat->time); } +/* + * blk_stat_rq_ddir() - Bucket callback function for the request data direction. + * @rq: Request. + * + * This is the same as rq_data_dir() but as a function so it can be used as + * @bucket_fn for blk_stat_alloc_callback(). + * + * Return: Data direction of the request, either READ or WRITE. + */ +unsigned int blk_stat_rq_ddir(const struct request *rq); + +/** + * blk_stat_alloc_callback() - Allocate a block statistics callback. + * @timer_fn: Timer callback function. + * @bucket_fn: Bucket callback function. + * @buckets: Number of statistics buckets. + * @data: Value for the @data field of the &struct blk_stat_callback. + * + * See &struct blk_stat_callback for details on the callback functions. + * + * Return: &struct blk_stat_callback on success or NULL on ENOMEM. + */ +struct blk_stat_callback * +blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *), + unsigned int (*bucket_fn)(const struct request *), + unsigned int buckets, void *data); + +/** + * blk_stat_add_callback() - Add a block statistics callback to be run on a + * request queue. + * @q: The request queue. + * @cb: The callback. + * + * Note that a single &struct blk_stat_callback can only be added to a single + * &struct request_queue. + */ +void blk_stat_add_callback(struct request_queue *q, + struct blk_stat_callback *cb); + +/** + * blk_stat_remove_callback() - Remove a block statistics callback from a + * request queue. + * @q: The request queue. + * @cb: The callback. + * + * When this returns, the callback is not running on any CPUs and will not be + * called again unless readded. + */ +void blk_stat_remove_callback(struct request_queue *q, + struct blk_stat_callback *cb); + +/** + * blk_stat_free_callback() - Free a block statistics callback. + * @cb: The callback. + * + * @cb may be NULL, in which case this does nothing. If it is not NULL, @cb must + * not be associated with a request queue. I.e., if it was previously added with + * blk_stat_add_callback(), it must also have been removed since then with + * blk_stat_remove_callback(). + */ +void blk_stat_free_callback(struct blk_stat_callback *cb); + +/** + * blk_stat_is_active() - Check if a block statistics callback is currently + * gathering statistics. + * @cb: The callback. + */ +static inline bool blk_stat_is_active(struct blk_stat_callback *cb) +{ + return timer_pending(&cb->timer); +} + +/** + * blk_stat_activate_nsecs() - Gather block statistics during a time window in + * nanoseconds. + * @cb: The callback. + * @nsecs: Number of nanoseconds to gather statistics for. + * + * The timer callback will be called when the window expires. + */ +static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb, + u64 nsecs) +{ + mod_timer(&cb->timer, jiffies + nsecs_to_jiffies(nsecs)); +} + +/** + * blk_stat_activate_msecs() - Gather block statistics during a time window in + * milliseconds. + * @cb: The callback. + * @msecs: Number of milliseconds to gather statistics for. 
+ * + * The timer callback will be called when the window expires. + */ +static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb, + unsigned int msecs) +{ + mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs)); +} + #endif diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index fdb45fd0db0b..fa831cb2fc30 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -503,26 +503,6 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page) return queue_var_show(blk_queue_dax(q), page); } -static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) -{ - return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", - pre, (long long) stat->nr_samples, - (long long) stat->mean, (long long) stat->min, - (long long) stat->max); -} - -static ssize_t queue_stats_show(struct request_queue *q, char *page) -{ - struct blk_rq_stat stat[2]; - ssize_t ret; - - blk_queue_stat_get(q, stat); - - ret = print_stat(page, &stat[READ], "read :"); - ret += print_stat(page + ret, &stat[WRITE], "write:"); - return ret; -} - static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -691,11 +671,6 @@ static struct queue_sysfs_entry queue_dax_entry = { .show = queue_dax_show, }; -static struct queue_sysfs_entry queue_stats_entry = { - .attr = {.name = "stats", .mode = S_IRUGO }, - .show = queue_stats_show, -}; - static struct queue_sysfs_entry queue_wb_lat_entry = { .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR }, .show = queue_wb_lat_show, @@ -733,7 +708,6 @@ static struct attribute *default_attrs[] = { &queue_poll_entry.attr, &queue_wc_entry.attr, &queue_dax_entry.attr, - &queue_stats_entry.attr, &queue_wb_lat_entry.attr, &queue_poll_delay_entry.attr, NULL, @@ -811,6 +785,9 @@ static void blk_release_queue(struct kobject *kobj) container_of(kobj, struct request_queue, kobj); wbt_exit(q); + if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) + blk_stat_remove_callback(q, q->poll_cb); + blk_stat_free_callback(q->poll_cb); bdi_put(q->backing_dev_info); blkcg_exit_queue(q); @@ -819,6 +796,8 @@ static void blk_release_queue(struct kobject *kobj) elevator_exit(q->elevator); } + blk_free_queue_stats(q->stats); + blk_exit_rl(&q->root_rl); if (q->queue_tags) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index aafe5b551224..ffa80e11cf14 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -277,7 +277,7 @@ enum { LAT_EXCEEDED, }; -static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) +static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { struct backing_dev_info *bdi = rwb->queue->backing_dev_info; u64 thislat; @@ -308,8 +308,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) * waited or still has writes in flights, consider us doing * just writes as well. 
*/ - if ((stat[WRITE].nr_samples && blk_stat_is_current(stat)) || - wb_recent_wait(rwb) || wbt_inflight(rwb)) + if (stat[WRITE].nr_samples || wb_recent_wait(rwb) || + wbt_inflight(rwb)) return LAT_UNKNOWN_WRITES; return LAT_UNKNOWN; } @@ -329,14 +329,6 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) return LAT_OK; } -static int latency_exceeded(struct rq_wb *rwb) -{ - struct blk_rq_stat stat[2]; - - blk_queue_stat_get(rwb->queue, stat); - return __latency_exceeded(rwb, stat); -} - static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { struct backing_dev_info *bdi = rwb->queue->backing_dev_info; @@ -355,7 +347,6 @@ static void scale_up(struct rq_wb *rwb) rwb->scale_step--; rwb->unknown_cnt = 0; - blk_stat_clear(rwb->queue); rwb->scaled_max = calc_wb_limits(rwb); @@ -385,15 +376,12 @@ static void scale_down(struct rq_wb *rwb, bool hard_throttle) rwb->scaled_max = false; rwb->unknown_cnt = 0; - blk_stat_clear(rwb->queue); calc_wb_limits(rwb); rwb_trace_step(rwb, "step down"); } static void rwb_arm_timer(struct rq_wb *rwb) { - unsigned long expires; - if (rwb->scale_step > 0) { /* * We should speed this up, using some variant of a fast @@ -411,17 +399,16 @@ static void rwb_arm_timer(struct rq_wb *rwb) rwb->cur_win_nsec = rwb->win_nsec; } - expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec); - mod_timer(&rwb->window_timer, expires); + blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec); } -static void wb_timer_fn(unsigned long data) +static void wb_timer_fn(struct blk_stat_callback *cb) { - struct rq_wb *rwb = (struct rq_wb *) data; + struct rq_wb *rwb = cb->data; unsigned int inflight = wbt_inflight(rwb); int status; - status = latency_exceeded(rwb); + status = latency_exceeded(rwb, cb->stat); trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step, inflight); @@ -614,7 +601,7 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) __wbt_wait(rwb, bio->bi_opf, lock); - if (!timer_pending(&rwb->window_timer)) + if (!blk_stat_is_active(rwb->cb)) rwb_arm_timer(rwb); if (current_is_kswapd()) @@ -675,7 +662,7 @@ void wbt_disable_default(struct request_queue *q) struct rq_wb *rwb = q->rq_wb; if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) { - del_timer_sync(&rwb->window_timer); + blk_stat_remove_callback(q, rwb->cb); rwb->win_nsec = rwb->min_lat_nsec = 0; wbt_update_limits(rwb); } @@ -699,24 +686,23 @@ int wbt_init(struct request_queue *q) struct rq_wb *rwb; int i; - /* - * For now, we depend on the stats window being larger than - * our monitoring window. Ensure that this isn't inadvertently - * violated. - */ - BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC); BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS); rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); if (!rwb) return -ENOMEM; + rwb->cb = blk_stat_alloc_callback(wb_timer_fn, blk_stat_rq_ddir, 2, rwb); + if (!rwb->cb) { + kfree(rwb); + return -ENOMEM; + } + for (i = 0; i < WBT_NUM_RWQ; i++) { atomic_set(&rwb->rq_wait[i].inflight, 0); init_waitqueue_head(&rwb->rq_wait[i].wait); } - setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb); rwb->wc = 1; rwb->queue_depth = RWB_DEF_DEPTH; rwb->last_comp = rwb->last_issue = jiffies; @@ -726,10 +712,10 @@ int wbt_init(struct request_queue *q) wbt_update_limits(rwb); /* - * Assign rwb, and turn on stats tracking for this queue + * Assign rwb and add the stats callback. 
*/ q->rq_wb = rwb; - blk_stat_enable(q); + blk_stat_add_callback(q, rwb->cb); rwb->min_lat_nsec = wbt_default_latency_nsec(q); @@ -744,7 +730,8 @@ void wbt_exit(struct request_queue *q) struct rq_wb *rwb = q->rq_wb; if (rwb) { - del_timer_sync(&rwb->window_timer); + blk_stat_remove_callback(q, rwb->cb); + blk_stat_free_callback(rwb->cb); q->rq_wb = NULL; kfree(rwb); } diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 65f1de519f67..591ff2f4b2ee 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -81,7 +81,7 @@ struct rq_wb { u64 win_nsec; /* default window size */ u64 cur_win_nsec; /* current window size */ - struct timer_list window_timer; + struct blk_stat_callback *cb; s64 sync_issue; void *sync_cookie; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index e213c5e7500b..270119a501fb 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -294,7 +294,6 @@ struct blk_rq_stat { s32 nr_samples; s32 nr_batch; u64 batch; - s64 time; }; #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5a7da607ca04..1a7dc42a8918 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -40,6 +40,8 @@ struct blkcg_gq; struct blk_flush_queue; struct pr_ops; struct rq_wb; +struct blk_queue_stats; +struct blk_stat_callback; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ @@ -388,6 +390,7 @@ struct request_queue { int nr_rqs[2]; /* # allocated [a]sync rqs */ int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ + struct blk_queue_stats *stats; struct rq_wb *rq_wb; /* @@ -505,8 +508,6 @@ struct request_queue { unsigned int nr_sorted; unsigned int in_flight[2]; - struct blk_rq_stat rq_stats[2]; - /* * Number of active block driver functions for which blk_drain_queue() * must wait. Must be incremented around functions that unlock the @@ -516,6 +517,10 @@ struct request_queue { unsigned int rq_timeout; int poll_nsec; + + struct blk_stat_callback *poll_cb; + struct blk_rq_stat poll_stat[2]; + struct timer_list timeout; struct work_struct timeout_work; struct list_head timeout_list; @@ -611,6 +616,7 @@ struct request_queue { #define QUEUE_FLAG_DAX 26 /* device supports DAX */ #define QUEUE_FLAG_STATS 27 /* track rq completion times */ #define QUEUE_FLAG_RESTART 28 /* queue needs restart at completion */ +#define QUEUE_FLAG_POLL_STATS 29 /* collecting stats for hybrid polling */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- cgit v1.2.3 From 8bae3551e93de4e8a5b959c495b06de9264be0d5 Mon Sep 17 00:00:00 2001 From: Philippe Reynes Date: Thu, 16 Mar 2017 23:18:47 +0100 Subject: net: usb: usbnet: add new api ethtool_{get|set}_link_ksettings The ethtool api {get|set}_settings is deprecated. We add the new api {get|set}_link_ksettings to this driver. As I don't have the hardware, I'd be very pleased if someone could test this patch. Signed-off-by: Philippe Reynes Acked-by: Oliver Neukum Signed-off-by: David S.
Miller --- drivers/net/usb/usbnet.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/usb/usbnet.h | 4 ++++ 2 files changed, 40 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 3de65ea6531a..1b40b189435a 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -980,6 +980,40 @@ int usbnet_set_settings (struct net_device *net, struct ethtool_cmd *cmd) } EXPORT_SYMBOL_GPL(usbnet_set_settings); +int usbnet_get_link_ksettings(struct net_device *net, + struct ethtool_link_ksettings *cmd) +{ + struct usbnet *dev = netdev_priv(net); + + if (!dev->mii.mdio_read) + return -EOPNOTSUPP; + + return mii_ethtool_get_link_ksettings(&dev->mii, cmd); +} +EXPORT_SYMBOL_GPL(usbnet_get_link_ksettings); + +int usbnet_set_link_ksettings(struct net_device *net, + const struct ethtool_link_ksettings *cmd) +{ + struct usbnet *dev = netdev_priv(net); + int retval; + + if (!dev->mii.mdio_write) + return -EOPNOTSUPP; + + retval = mii_ethtool_set_link_ksettings(&dev->mii, cmd); + + /* link speed/duplex might have changed */ + if (dev->driver_info->link_reset) + dev->driver_info->link_reset(dev); + + /* hard_mtu or rx_urb_size may change in link_reset() */ + usbnet_update_max_qlen(dev); + + return retval; +} +EXPORT_SYMBOL_GPL(usbnet_set_link_ksettings); + u32 usbnet_get_link (struct net_device *net) { struct usbnet *dev = netdev_priv(net); @@ -1046,6 +1080,8 @@ static const struct ethtool_ops usbnet_ethtool_ops = { .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .get_ts_info = ethtool_op_get_ts_info, + .get_link_ksettings = usbnet_get_link_ksettings, + .set_link_ksettings = usbnet_set_link_ksettings, }; /*-------------------------------------------------------------------------*/ diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 6e0ce8c7b8cb..5bd80078b7fe 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -265,6 +265,10 @@ extern int usbnet_get_settings(struct net_device *net, struct ethtool_cmd *cmd); extern int usbnet_set_settings(struct net_device *net, struct ethtool_cmd *cmd); +extern int usbnet_get_link_ksettings(struct net_device *net, + struct ethtool_link_ksettings *cmd); +extern int usbnet_set_link_ksettings(struct net_device *net, + const struct ethtool_link_ksettings *cmd); extern u32 usbnet_get_link(struct net_device *net); extern u32 usbnet_get_msglevel(struct net_device *); extern void usbnet_set_msglevel(struct net_device *, u32); -- cgit v1.2.3 From bde87ad64c4cee7a1d5b41d217b440e21050813e Mon Sep 17 00:00:00 2001 From: Philippe Reynes Date: Thu, 16 Mar 2017 23:18:57 +0100 Subject: net: usb: usb: remove old api ethtool_{get|set}_settings The function usbnet_{get|set}_settings is no longer used, so we remove it. Signed-off-by: Philippe Reynes Acked-by: Oliver Neukum Signed-off-by: David S. Miller --- drivers/net/usb/usbnet.c | 35 ----------------------------------- include/linux/usb/usbnet.h | 4 ---- 2 files changed, 39 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 1b40b189435a..13d4ec5f6f34 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -947,39 +947,6 @@ EXPORT_SYMBOL_GPL(usbnet_open); * they'll probably want to use this base set. 
*/ -int usbnet_get_settings (struct net_device *net, struct ethtool_cmd *cmd) -{ - struct usbnet *dev = netdev_priv(net); - - if (!dev->mii.mdio_read) - return -EOPNOTSUPP; - - return mii_ethtool_gset(&dev->mii, cmd); -} -EXPORT_SYMBOL_GPL(usbnet_get_settings); - -int usbnet_set_settings (struct net_device *net, struct ethtool_cmd *cmd) -{ - struct usbnet *dev = netdev_priv(net); - int retval; - - if (!dev->mii.mdio_write) - return -EOPNOTSUPP; - - retval = mii_ethtool_sset(&dev->mii, cmd); - - /* link speed/duplex might have changed */ - if (dev->driver_info->link_reset) - dev->driver_info->link_reset(dev); - - /* hard_mtu or rx_urb_size may change in link_reset() */ - usbnet_update_max_qlen(dev); - - return retval; - -} -EXPORT_SYMBOL_GPL(usbnet_set_settings); - int usbnet_get_link_ksettings(struct net_device *net, struct ethtool_link_ksettings *cmd) { @@ -1072,8 +1039,6 @@ EXPORT_SYMBOL_GPL(usbnet_set_msglevel); /* drivers may override default ethtool_ops in their bind() routine */ static const struct ethtool_ops usbnet_ethtool_ops = { - .get_settings = usbnet_get_settings, - .set_settings = usbnet_set_settings, .get_link = usbnet_get_link, .nway_reset = usbnet_nway_reset, .get_drvinfo = usbnet_get_drvinfo, diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 5bd80078b7fe..e2b56917450f 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -261,10 +261,6 @@ extern void usbnet_pause_rx(struct usbnet *); extern void usbnet_resume_rx(struct usbnet *); extern void usbnet_purge_paused_rxq(struct usbnet *); -extern int usbnet_get_settings(struct net_device *net, - struct ethtool_cmd *cmd); -extern int usbnet_set_settings(struct net_device *net, - struct ethtool_cmd *cmd); extern int usbnet_get_link_ksettings(struct net_device *net, struct ethtool_link_ksettings *cmd); extern int usbnet_set_link_ksettings(struct net_device *net, -- cgit v1.2.3 From a8f5102af2a7740a4b3200a27beddf27f23f921a Mon Sep 17 00:00:00 2001 From: Joao Pinto Date: Fri, 17 Mar 2017 16:11:06 +0000 Subject: net: stmmac: TX and RX queue priority configuration This patch adds the configuration of RX and TX queues' priority. Signed-off-by: Joao Pinto Signed-off-by: David S. 
Miller --- Documentation/devicetree/bindings/net/stmmac.txt | 5 +++ drivers/net/ethernet/stmicro/stmmac/common.h | 4 ++ drivers/net/ethernet/stmicro/stmmac/dwmac4.h | 13 ++++++ drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 37 +++++++++++++++++ drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 48 ++++++++++++++++++++++ drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c | 4 ++ .../net/ethernet/stmicro/stmmac/stmmac_platform.c | 16 ++++++++ include/linux/stmmac.h | 4 ++ 8 files changed, 131 insertions(+) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt index a7b0e41cb264..d11bd09f4ba6 100644 --- a/Documentation/devicetree/bindings/net/stmmac.txt +++ b/Documentation/devicetree/bindings/net/stmmac.txt @@ -83,6 +83,7 @@ Optional properties: - snps,dcb-algorithm: Queue to be enabled as DCB - snps,avb-algorithm: Queue to be enabled as AVB - snps,map-to-dma-channel: Channel to map + - snps,priority: RX queue priority (Range: 0x0 to 0xF) - Multiple TX Queues parameters: below the list of all the parameters to configure the multiple TX queues: - snps,tx-queues-to-use: number of TX queues to be used in the driver @@ -101,6 +102,7 @@ Optional properties: - snps,idle_slope: unlock on WoL - snps,high_credit: max write outstanding req. limit - snps,low_credit: max read outstanding req. limit + - snps,priority: TX queue priority (Range: 0x0 to 0xF) Examples: stmmac_axi_setup: stmmac-axi-config { @@ -115,6 +117,7 @@ Examples: queue0 { snps,dcb-algorithm; snps,map-to-dma-channel = <0x0>; + snps,priority = <0x0>; }; }; @@ -124,6 +127,7 @@ Examples: queue0 { snps,weight = <0x10>; snps,dcb-algorithm; + snps,priority = <0x0>; }; queue1 { @@ -132,6 +136,7 @@ Examples: snps,idle_slope = <0x1000>; snps,high_credit = <0x3E800>; snps,low_credit = <0xFFC18000>; + snps,priority = <0x1>; }; }; diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index badc4414d67b..e0b31e759c0e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -469,6 +469,10 @@ struct stmmac_ops { int (*rx_ipc)(struct mac_device_info *hw); /* Enable RX Queues */ void (*rx_queue_enable)(struct mac_device_info *hw, u8 mode, u32 queue); + /* RX Queues Priority */ + void (*rx_queue_prio)(struct mac_device_info *hw, u32 prio, u32 queue); + /* TX Queues Priority */ + void (*tx_queue_prio)(struct mac_device_info *hw, u32 prio, u32 queue); /* Program RX Algorithms */ void (*prog_mtl_rx_algorithms)(struct mac_device_info *hw, u32 rx_alg); /* Program TX Algorithms */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h index 54bcdb4d10db..a6c382d22ebf 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h @@ -22,7 +22,12 @@ #define GMAC_HASH_TAB_32_63 0x00000014 #define GMAC_RX_FLOW_CTRL 0x00000090 #define GMAC_QX_TX_FLOW_CTRL(x) (0x70 + x * 4) +#define GMAC_TXQ_PRTY_MAP0 0x98 +#define GMAC_TXQ_PRTY_MAP1 0x9C #define GMAC_RXQ_CTRL0 0x000000a0 +#define GMAC_RXQ_CTRL1 0x000000a4 +#define GMAC_RXQ_CTRL2 0x000000a8 +#define GMAC_RXQ_CTRL3 0x000000ac #define GMAC_INT_STATUS 0x000000b0 #define GMAC_INT_EN 0x000000b4 #define GMAC_1US_TIC_COUNTER 0x000000dc @@ -54,6 +59,14 @@ /* MAC Flow Control RX */ #define GMAC_RX_FLOW_CTRL_RFE BIT(0) +/* RX Queues Priorities */ +#define GMAC_RXQCTRL_PSRQX_MASK(x) GENMASK(7 + ((x) * 8), 0 + ((x) * 8)) +#define 
GMAC_RXQCTRL_PSRQX_SHIFT(x) ((x) * 8) + +/* TX Queues Priorities */ +#define GMAC_TXQCTRL_PSTQX_MASK(x) GENMASK(7 + ((x) * 8), 0 + ((x) * 8)) +#define GMAC_TXQCTRL_PSTQX_SHIFT(x) ((x) * 8) + /* MAC Flow Control TX */ #define GMAC_TX_FLOW_CTRL_TFE BIT(1) #define GMAC_TX_FLOW_CTRL_PT_SHIFT 16 diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index 10599dbc232f..342f62abb9ca 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -74,6 +74,41 @@ static void dwmac4_rx_queue_enable(struct mac_device_info *hw, writel(value, ioaddr + GMAC_RXQ_CTRL0); } +static void dwmac4_rx_queue_priority(struct mac_device_info *hw, + u32 prio, u32 queue) +{ + void __iomem *ioaddr = hw->pcsr; + u32 base_register; + u32 value; + + base_register = (queue < 4) ? GMAC_RXQ_CTRL2 : GMAC_RXQ_CTRL3; + + value = readl(ioaddr + base_register); + + value &= ~GMAC_RXQCTRL_PSRQX_MASK(queue); + value |= (prio << GMAC_RXQCTRL_PSRQX_SHIFT(queue)) & + GMAC_RXQCTRL_PSRQX_MASK(queue); + writel(value, ioaddr + base_register); +} + +static void dwmac4_tx_queue_priority(struct mac_device_info *hw, + u32 prio, u32 queue) +{ + void __iomem *ioaddr = hw->pcsr; + u32 base_register; + u32 value; + + base_register = (queue < 4) ? GMAC_TXQ_PRTY_MAP0 : GMAC_TXQ_PRTY_MAP1; + + value = readl(ioaddr + base_register); + + value &= ~GMAC_TXQCTRL_PSTQX_MASK(queue); + value |= (prio << GMAC_TXQCTRL_PSTQX_SHIFT(queue)) & + GMAC_TXQCTRL_PSTQX_MASK(queue); + + writel(value, ioaddr + base_register); +} + static void dwmac4_prog_mtl_rx_algorithms(struct mac_device_info *hw, u32 rx_alg) { @@ -603,6 +638,8 @@ static const struct stmmac_ops dwmac4_ops = { .core_init = dwmac4_core_init, .rx_ipc = dwmac4_rx_ipc_enable, .rx_queue_enable = dwmac4_rx_queue_enable, + .rx_queue_prio = dwmac4_rx_queue_priority, + .tx_queue_prio = dwmac4_tx_queue_priority, .prog_mtl_rx_algorithms = dwmac4_prog_mtl_rx_algorithms, .prog_mtl_tx_algorithms = dwmac4_prog_mtl_tx_algorithms, .set_mtl_tx_queue_weight = dwmac4_set_mtl_tx_queue_weight, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index a389dfbe630c..0f2c0d762e33 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2292,6 +2292,46 @@ static void stmmac_rx_queue_dma_chan_map(struct stmmac_priv *priv) } } +/** + * stmmac_mac_config_rx_queues_prio - Configure RX Queue priority + * @priv: driver private structure + * Description: It is used for configuring the RX Queue Priority + */ +static void stmmac_mac_config_rx_queues_prio(struct stmmac_priv *priv) +{ + u32 rx_queues_count = priv->plat->rx_queues_to_use; + u32 queue; + u32 prio; + + for (queue = 0; queue < rx_queues_count; queue++) { + if (!priv->plat->rx_queues_cfg[queue].use_prio) + continue; + + prio = priv->plat->rx_queues_cfg[queue].prio; + priv->hw->mac->rx_queue_prio(priv->hw, prio, queue); + } +} + +/** + * stmmac_mac_config_tx_queues_prio - Configure TX Queue priority + * @priv: driver private structure + * Description: It is used for configuring the TX Queue Priority + */ +static void stmmac_mac_config_tx_queues_prio(struct stmmac_priv *priv) +{ + u32 tx_queues_count = priv->plat->tx_queues_to_use; + u32 queue; + u32 prio; + + for (queue = 0; queue < tx_queues_count; queue++) { + if (!priv->plat->tx_queues_cfg[queue].use_prio) + continue; + + prio = priv->plat->tx_queues_cfg[queue].prio; + 
priv->hw->mac->tx_queue_prio(priv->hw, prio, queue); + } +} + /** * stmmac_mtl_configuration - Configure MTL * @priv: driver private structure @@ -2329,6 +2369,14 @@ static void stmmac_mtl_configuration(struct stmmac_priv *priv) /* Set the HW DMA mode and the COE */ stmmac_dma_operation_mode(priv); + + /* Set RX priorities */ + if (rx_queues_count > 1 && priv->hw->mac->rx_queue_prio) + stmmac_mac_config_rx_queues_prio(priv); + + /* Set TX priorities */ + if (tx_queues_count > 1 && priv->hw->mac->tx_queue_prio) + stmmac_mac_config_tx_queues_prio(priv); } /** diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c index cea472a7c335..ffe4fac22d3d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c @@ -92,6 +92,10 @@ static void stmmac_default_data(struct plat_stmmacenet_data *plat) /* Set default number of RX and TX queues to use */ plat->tx_queues_to_use = 1; plat->rx_queues_to_use = 1; + + /* Disable Priority config by default */ + plat->tx_queues_cfg[0].use_prio = false; + plat->rx_queues_cfg[0].use_prio = false; } static int quark_default_data(struct plat_stmmacenet_data *plat, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 37f550ae76a5..77b0468dd79f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -182,6 +182,14 @@ static void stmmac_mtl_setup(struct platform_device *pdev, plat->rx_queues_cfg[queue].chan = queue; /* TODO: Dynamic mapping to be included in the future */ + if (of_property_read_u32(q_node, "snps,priority", + &plat->rx_queues_cfg[queue].prio)) { + plat->rx_queues_cfg[queue].prio = 0; + plat->rx_queues_cfg[queue].use_prio = false; + } else { + plat->rx_queues_cfg[queue].use_prio = true; + } + queue++; } @@ -235,6 +243,14 @@ static void stmmac_mtl_setup(struct platform_device *pdev, plat->tx_queues_cfg[queue].mode_to_use = MTL_QUEUE_DCB; } + if (of_property_read_u32(q_node, "snps,priority", + &plat->tx_queues_cfg[queue].prio)) { + plat->tx_queues_cfg[queue].prio = 0; + plat->tx_queues_cfg[queue].use_prio = false; + } else { + plat->tx_queues_cfg[queue].use_prio = true; + } + queue++; } diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index be47b859e954..b7d5e7ae9591 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -127,6 +127,8 @@ struct stmmac_axi { struct stmmac_rxq_cfg { u8 mode_to_use; u8 chan; + bool use_prio; + u32 prio; }; struct stmmac_txq_cfg { @@ -137,6 +139,8 @@ struct stmmac_txq_cfg { u32 idle_slope; u32 high_credit; u32 low_credit; + bool use_prio; + u32 prio; }; struct plat_stmmacenet_data { -- cgit v1.2.3 From abe80fdc6ee664b2f8515f91b45e852b65dbb1a1 Mon Sep 17 00:00:00 2001 From: Joao Pinto Date: Fri, 17 Mar 2017 16:11:07 +0000 Subject: net: stmmac: RX queue routing configuration This patch adds the configuration of RX queues' routing. Signed-off-by: Joao Pinto Signed-off-by: David S. 
Miller --- Documentation/devicetree/bindings/net/stmmac.txt | 6 ++++ drivers/net/ethernet/stmicro/stmmac/common.h | 17 +++++++++++ drivers/net/ethernet/stmicro/stmmac/dwmac4.h | 16 ++++++++++ drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 34 ++++++++++++++++++++++ drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 25 ++++++++++++++++ drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c | 3 ++ .../net/ethernet/stmicro/stmmac/stmmac_platform.c | 14 +++++++++ include/linux/stmmac.h | 1 + 8 files changed, 116 insertions(+) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt index d11bd09f4ba6..784d98862b52 100644 --- a/Documentation/devicetree/bindings/net/stmmac.txt +++ b/Documentation/devicetree/bindings/net/stmmac.txt @@ -83,6 +83,12 @@ Optional properties: - snps,dcb-algorithm: Queue to be enabled as DCB - snps,avb-algorithm: Queue to be enabled as AVB - snps,map-to-dma-channel: Channel to map + - Specifiy specific packet routing: + - snps,route-avcp: AV Untagged Control packets + - snps,route-ptp: PTP Packets + - snps,route-dcbcp: DCB Control Packets + - snps,route-up: Untagged Packets + - snps,route-multi-broad: Multicast & Broadcast Packets - snps,priority: RX queue priority (Range: 0x0 to 0xF) - Multiple TX Queues parameters: below the list of all the parameters to configure the multiple TX queues: diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index e0b31e759c0e..572cf8b61707 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -246,6 +246,15 @@ struct stmmac_extra_stats { #define STMMAC_TX_MAX_FRAMES 256 #define STMMAC_TX_FRAMES 64 +/* Packets types */ +enum packets_types { + PACKET_AVCPQ = 0x1, /* AV Untagged Control packets */ + PACKET_PTPQ = 0x2, /* PTP Packets */ + PACKET_DCBCPQ = 0x3, /* DCB Control Packets */ + PACKET_UPQ = 0x4, /* Untagged Packets */ + PACKET_MCBCQ = 0x5, /* Multicast & Broadcast Packets */ +}; + /* Rx IPC status */ enum rx_frame_status { good_frame = 0x0, @@ -473,6 +482,9 @@ struct stmmac_ops { void (*rx_queue_prio)(struct mac_device_info *hw, u32 prio, u32 queue); /* TX Queues Priority */ void (*tx_queue_prio)(struct mac_device_info *hw, u32 prio, u32 queue); + /* RX Queues Routing */ + void (*rx_queue_routing)(struct mac_device_info *hw, u8 packet, + u32 queue); /* Program RX Algorithms */ void (*prog_mtl_rx_algorithms)(struct mac_device_info *hw, u32 rx_alg); /* Program TX Algorithms */ @@ -581,6 +593,11 @@ struct mac_device_info { unsigned int ps; }; +struct stmmac_rx_routing { + u32 reg_mask; + u32 reg_shift; +}; + struct mac_device_info *dwmac1000_setup(void __iomem *ioaddr, int mcbins, int perfect_uc_entries, int *synopsys_id); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h index a6c382d22ebf..d74cedf2a397 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h @@ -44,6 +44,22 @@ #define GMAC_ADDR_HIGH(reg) (0x300 + reg * 8) #define GMAC_ADDR_LOW(reg) (0x304 + reg * 8) +/* RX Queues Routing */ +#define GMAC_RXQCTRL_AVCPQ_MASK GENMASK(2, 0) +#define GMAC_RXQCTRL_AVCPQ_SHIFT 0 +#define GMAC_RXQCTRL_PTPQ_MASK GENMASK(6, 4) +#define GMAC_RXQCTRL_PTPQ_SHIFT 4 +#define GMAC_RXQCTRL_DCBCPQ_MASK GENMASK(10, 8) +#define GMAC_RXQCTRL_DCBCPQ_SHIFT 8 +#define GMAC_RXQCTRL_UPQ_MASK GENMASK(14, 12) +#define GMAC_RXQCTRL_UPQ_SHIFT 12 +#define 
GMAC_RXQCTRL_MCBCQ_MASK GENMASK(18, 16) +#define GMAC_RXQCTRL_MCBCQ_SHIFT 16 +#define GMAC_RXQCTRL_MCBCQEN BIT(20) +#define GMAC_RXQCTRL_MCBCQEN_SHIFT 20 +#define GMAC_RXQCTRL_TACPQE BIT(21) +#define GMAC_RXQCTRL_TACPQE_SHIFT 21 + /* MAC Packet Filtering */ #define GMAC_PACKET_FILTER_PR BIT(0) #define GMAC_PACKET_FILTER_HMC BIT(2) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index 342f62abb9ca..40ce20218402 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -109,6 +109,39 @@ static void dwmac4_tx_queue_priority(struct mac_device_info *hw, writel(value, ioaddr + base_register); } +static void dwmac4_tx_queue_routing(struct mac_device_info *hw, + u8 packet, u32 queue) +{ + void __iomem *ioaddr = hw->pcsr; + u32 value; + + const struct stmmac_rx_routing route_possibilities[] = { + { GMAC_RXQCTRL_AVCPQ_MASK, GMAC_RXQCTRL_AVCPQ_SHIFT }, + { GMAC_RXQCTRL_PTPQ_MASK, GMAC_RXQCTRL_PTPQ_SHIFT }, + { GMAC_RXQCTRL_DCBCPQ_MASK, GMAC_RXQCTRL_DCBCPQ_SHIFT }, + { GMAC_RXQCTRL_UPQ_MASK, GMAC_RXQCTRL_UPQ_SHIFT }, + { GMAC_RXQCTRL_MCBCQ_MASK, GMAC_RXQCTRL_MCBCQ_SHIFT }, + }; + + value = readl(ioaddr + GMAC_RXQ_CTRL1); + + /* routing configuration */ + value &= ~route_possibilities[packet - 1].reg_mask; + value |= (queue << route_possibilities[packet-1].reg_shift) & + route_possibilities[packet - 1].reg_mask; + + /* some packets require extra ops */ + if (packet == PACKET_AVCPQ) { + value &= ~GMAC_RXQCTRL_TACPQE; + value |= 0x1 << GMAC_RXQCTRL_TACPQE_SHIFT; + } else if (packet == PACKET_MCBCQ) { + value &= ~GMAC_RXQCTRL_MCBCQEN; + value |= 0x1 << GMAC_RXQCTRL_MCBCQEN_SHIFT; + } + + writel(value, ioaddr + GMAC_RXQ_CTRL1); +} + static void dwmac4_prog_mtl_rx_algorithms(struct mac_device_info *hw, u32 rx_alg) { @@ -640,6 +673,7 @@ static const struct stmmac_ops dwmac4_ops = { .rx_queue_enable = dwmac4_rx_queue_enable, .rx_queue_prio = dwmac4_rx_queue_priority, .tx_queue_prio = dwmac4_tx_queue_priority, + .rx_queue_routing = dwmac4_tx_queue_routing, .prog_mtl_rx_algorithms = dwmac4_prog_mtl_rx_algorithms, .prog_mtl_tx_algorithms = dwmac4_prog_mtl_tx_algorithms, .set_mtl_tx_queue_weight = dwmac4_set_mtl_tx_queue_weight, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 0f2c0d762e33..531bf1dc35cd 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2332,6 +2332,27 @@ static void stmmac_mac_config_tx_queues_prio(struct stmmac_priv *priv) } } +/** + * stmmac_mac_config_rx_queues_routing - Configure RX Queue Routing + * @priv: driver private structure + * Description: It is used for configuring the RX queue routing + */ +static void stmmac_mac_config_rx_queues_routing(struct stmmac_priv *priv) +{ + u32 rx_queues_count = priv->plat->rx_queues_to_use; + u32 queue; + u8 packet; + + for (queue = 0; queue < rx_queues_count; queue++) { + /* no specific packet type routing specified for the queue */ + if (priv->plat->rx_queues_cfg[queue].pkt_route == 0x0) + continue; + + packet = priv->plat->rx_queues_cfg[queue].pkt_route; + priv->hw->mac->rx_queue_prio(priv->hw, packet, queue); + } +} + /** * stmmac_mtl_configuration - Configure MTL * @priv: driver private structure @@ -2377,6 +2398,10 @@ static void stmmac_mtl_configuration(struct stmmac_priv *priv) /* Set TX priorities */ if (tx_queues_count > 1 && priv->hw->mac->tx_queue_prio) 
stmmac_mac_config_tx_queues_prio(priv); + + /* Set RX routing */ + if (rx_queues_count > 1 && priv->hw->mac->rx_queue_routing) + stmmac_mac_config_rx_queues_routing(priv); } /** diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c index ffe4fac22d3d..a224d7bf1c1b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c @@ -96,6 +96,9 @@ static void stmmac_default_data(struct plat_stmmacenet_data *plat) /* Disable Priority config by default */ plat->tx_queues_cfg[0].use_prio = false; plat->rx_queues_cfg[0].use_prio = false; + + /* Disable RX queues routing by default */ + plat->rx_queues_cfg[0].pkt_route = 0x0; } static int quark_default_data(struct plat_stmmacenet_data *plat, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 77b0468dd79f..f5c8b1bca002 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -190,6 +190,20 @@ static void stmmac_mtl_setup(struct platform_device *pdev, plat->rx_queues_cfg[queue].use_prio = true; } + /* RX queue specific packet type routing */ + if (of_property_read_bool(q_node, "snps,route-avcp")) + plat->rx_queues_cfg[queue].pkt_route = PACKET_AVCPQ; + else if (of_property_read_bool(q_node, "snps,route-ptp")) + plat->rx_queues_cfg[queue].pkt_route = PACKET_PTPQ; + else if (of_property_read_bool(q_node, "snps,route-dcbcp")) + plat->rx_queues_cfg[queue].pkt_route = PACKET_DCBCPQ; + else if (of_property_read_bool(q_node, "snps,route-up")) + plat->rx_queues_cfg[queue].pkt_route = PACKET_UPQ; + else if (of_property_read_bool(q_node, "snps,route-multi-broad")) + plat->rx_queues_cfg[queue].pkt_route = PACKET_MCBCQ; + else + plat->rx_queues_cfg[queue].pkt_route = 0x0; + queue++; } diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index b7d5e7ae9591..cd98ee232ad1 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -127,6 +127,7 @@ struct stmmac_axi { struct stmmac_rxq_cfg { u8 mode_to_use; u8 chan; + u8 pkt_route; bool use_prio; u32 prio; }; -- cgit v1.2.3 From f9fe1c12d126f9887441fa5bb165046f30ddd4b5 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sat, 18 Mar 2017 00:36:15 +0100 Subject: rhashtable: Add rhashtable_lookup_get_insert_fast Add rhashtable_lookup_get_insert_fast for fixed keys, similar to rhashtable_lookup_get_insert_key for explicit keys. Signed-off-by: Andreas Gruenbacher Acked-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 092292b6675e..e507290cd2c7 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -915,6 +915,28 @@ static inline int rhashtable_lookup_insert_fast( return ret == NULL ? 0 : -EEXIST; } +/** + * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Just like rhashtable_lookup_insert_fast(), but this function returns the + * object if it exists, NULL if it did not and the insertion was successful, + * and an ERR_PTR otherwise. 
+ */ +static inline void *rhashtable_lookup_get_insert_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params) +{ + const char *key = rht_obj(ht, obj); + + BUG_ON(ht->p.obj_hashfn); + + return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, + false); +} + /** * rhashtable_lookup_insert_key - search and insert object to hash table * with explicit key -- cgit v1.2.3 From 61012985eb132a2fa5e4a3eddbc33528334fa377 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 16 Mar 2017 16:23:55 +0200 Subject: iommu/vt-d: Use lo_hi_readq() / lo_hi_writeq() There are already helper functions to do 64-bit I/O on 32-bit machines or buses, so we don't need to reinvent the wheel. Signed-off-by: Andy Shevchenko Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index c573a52ae440..485a5b48f038 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -30,6 +30,8 @@ #include #include #include +#include + #include #include @@ -72,24 +74,8 @@ #define OFFSET_STRIDE (9) -#ifdef CONFIG_64BIT #define dmar_readq(a) readq(a) #define dmar_writeq(a,v) writeq(v,a) -#else -static inline u64 dmar_readq(void __iomem *addr) -{ - u32 lo, hi; - lo = readl(addr); - hi = readl(addr + 4); - return (((u64) hi) << 32) + lo; -} - -static inline void dmar_writeq(void __iomem *addr, u64 val) -{ - writel((u32)val, addr); - writel((u32)(val >> 32), addr + 4); -} -#endif #define DMAR_VER_MAJOR(v) (((v) & 0xf0) >> 4) #define DMAR_VER_MINOR(v) ((v) & 0x0f) -- cgit v1.2.3 From 21aff52ab2c831c2f07d48e2fa8d4bab26a66992 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 20 Mar 2017 20:11:28 +0100 Subject: iommu: Add dummy implementations for !IOMMU_IOVA Currently, building code which uses the API guarded by the IOMMU_IOVA Kconfig symbol will fail to link if IOMMU_IOVA is not enabled. Often this code will be using the API provided by the IOMMU_API Kconfig symbol, but support for this can be optional, with code falling back to contiguous memory. This commit implements dummy functions for the IOVA API so that it can be compiled out. With both IOMMU_API and IOMMU_IOVA optional, code can now be built with or without support for IOMMU without having to resort to #ifdefs in the user code.
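For illustration, this is the kind of caller the stubs enable — a minimal sketch, assuming a hypothetical driver with an IOVA domain and a contiguous-memory fallback (the my_dev_* names are illustrative, not taken from the patch):

#include <linux/iova.h>

struct my_dev {
	struct iova_domain iovad;
	unsigned long limit_pfn;
};

extern dma_addr_t my_dev_alloc_contiguous(struct my_dev *mdev, size_t npages);

static dma_addr_t my_dev_alloc_dma(struct my_dev *mdev, size_t npages)
{
	/* With !CONFIG_IOMMU_IOVA the dummy alloc_iova() returns NULL,
	 * so the contiguous fallback is taken without any #ifdef here. */
	struct iova *iova = alloc_iova(&mdev->iovad, npages,
				       mdev->limit_pfn, true);

	if (!iova)
		return my_dev_alloc_contiguous(mdev, npages);

	return iova_dma_addr(&mdev->iovad, iova);
}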
Signed-off-by: Thierry Reding Signed-off-by: Joerg Roedel --- include/linux/iova.h | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iova.h b/include/linux/iova.h index f27bb2c62fca..548982ad5f2f 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -82,6 +82,7 @@ static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova) return iova >> iova_shift(iovad); } +#ifdef CONFIG_IOMMU_IOVA int iova_cache_get(void); void iova_cache_put(void); @@ -106,5 +107,95 @@ void put_iova_domain(struct iova_domain *iovad); struct iova *split_and_remove_iova(struct iova_domain *iovad, struct iova *iova, unsigned long pfn_lo, unsigned long pfn_hi); void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad); +#else +static inline int iova_cache_get(void) +{ + return -ENOTSUPP; +} + +static inline void iova_cache_put(void) +{ +} + +static inline struct iova *alloc_iova_mem(void) +{ + return NULL; +} + +static inline void free_iova_mem(struct iova *iova) +{ +} + +static inline void free_iova(struct iova_domain *iovad, unsigned long pfn) +{ +} + +static inline void __free_iova(struct iova_domain *iovad, struct iova *iova) +{ +} + +static inline struct iova *alloc_iova(struct iova_domain *iovad, + unsigned long size, + unsigned long limit_pfn, + bool size_aligned) +{ + return NULL; +} + +static inline void free_iova_fast(struct iova_domain *iovad, + unsigned long pfn, + unsigned long size) +{ +} + +static inline unsigned long alloc_iova_fast(struct iova_domain *iovad, + unsigned long size, + unsigned long limit_pfn) +{ + return 0; +} + +static inline struct iova *reserve_iova(struct iova_domain *iovad, + unsigned long pfn_lo, + unsigned long pfn_hi) +{ + return NULL; +} + +static inline void copy_reserved_iova(struct iova_domain *from, + struct iova_domain *to) +{ +} + +static inline void init_iova_domain(struct iova_domain *iovad, + unsigned long granule, + unsigned long start_pfn, + unsigned long pfn_32bit) +{ +} + +static inline struct iova *find_iova(struct iova_domain *iovad, + unsigned long pfn) +{ + return NULL; +} + +static inline void put_iova_domain(struct iova_domain *iovad) +{ +} + +static inline struct iova *split_and_remove_iova(struct iova_domain *iovad, + struct iova *iova, + unsigned long pfn_lo, + unsigned long pfn_hi) +{ + return NULL; +} + +static inline void free_cpu_cached_iovas(unsigned int cpu, + struct iova_domain *iovad) +{ +} +#endif #endif -- cgit v1.2.3 From 273df9635385b2156851c7ee49f40658d7bcb29d Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 16 Mar 2017 17:00:19 +0000 Subject: iommu/dma: Make PCI window reservation generic Now that we're applying the IOMMU API reserved regions to our IOVA domains, we shouldn't need to privately special-case PCI windows, or indeed anything else which isn't specific to our iommu-dma layer. However, since those aren't IOMMU-specific either, rather than start duplicating code into IOMMU drivers let's transform the existing function into an iommu_get_resv_regions() helper that they can share. 
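In a driver, the result is that a .get_resv_regions callback chains its own entries with the generic helper — a hedged sketch, in which the MSI doorbell window (base, size, prot and type) is a made-up placeholder rather than something taken from this patch:

#include <linux/dma-iommu.h>
#include <linux/iommu.h>
#include <linux/list.h>

#define MY_SMMU_MSI_BASE	0x08000000UL	/* hypothetical placement */
#define MY_SMMU_MSI_SIZE	0x00100000UL

static void my_smmu_get_resv_regions(struct device *dev,
				     struct list_head *head)
{
	struct iommu_resv_region *region;

	/* driver-specific reservation, e.g. an MSI doorbell window */
	region = iommu_alloc_resv_region(MY_SMMU_MSI_BASE, MY_SMMU_MSI_SIZE,
					 IOMMU_WRITE, IOMMU_RESV_MSI);
	if (!region)
		return;
	list_add_tail(&region->list, head);

	/* generic, non-IOMMU-specific reservations (PCI windows, ...) */
	iommu_dma_get_resv_regions(dev, head);
}

This is the same pattern the arm-smmu conversions below follow.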
Signed-off-by: Robin Murphy Signed-off-by: Joerg Roedel --- drivers/iommu/arm-smmu-v3.c | 2 ++ drivers/iommu/arm-smmu.c | 2 ++ drivers/iommu/dma-iommu.c | 38 ++++++++++++++++++++++++++++---------- include/linux/dma-iommu.h | 5 +++++ 4 files changed, 37 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c index 591bb96047c9..bbd46efbe075 100644 --- a/drivers/iommu/arm-smmu-v3.c +++ b/drivers/iommu/arm-smmu-v3.c @@ -1893,6 +1893,8 @@ static void arm_smmu_get_resv_regions(struct device *dev, return; list_add_tail(®ion->list, head); + + iommu_dma_get_resv_regions(dev, head); } static void arm_smmu_put_resv_regions(struct device *dev, diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index b493c99e17f7..9b33700b7c69 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -1613,6 +1613,8 @@ static void arm_smmu_get_resv_regions(struct device *dev, return; list_add_tail(®ion->list, head); + + iommu_dma_get_resv_regions(dev, head); } static void arm_smmu_put_resv_regions(struct device *dev, diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 5787f919f4ec..85652110c8ff 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -167,22 +167,43 @@ void iommu_put_dma_cookie(struct iommu_domain *domain) } EXPORT_SYMBOL(iommu_put_dma_cookie); -static void iova_reserve_pci_windows(struct pci_dev *dev, - struct iova_domain *iovad) +/** + * iommu_dma_get_resv_regions - Reserved region driver helper + * @dev: Device from iommu_get_resv_regions() + * @list: Reserved region list from iommu_get_resv_regions() + * + * IOMMU drivers can use this to implement their .get_resv_regions callback + * for general non-IOMMU-specific reservations. Currently, this covers host + * bridge windows for PCI devices. + */ +void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list) { - struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus); + struct pci_host_bridge *bridge; struct resource_entry *window; - unsigned long lo, hi; + if (!dev_is_pci(dev)) + return; + + bridge = pci_find_host_bridge(to_pci_dev(dev)->bus); resource_list_for_each_entry(window, &bridge->windows) { + struct iommu_resv_region *region; + phys_addr_t start; + size_t length; + if (resource_type(window->res) != IORESOURCE_MEM) continue; - lo = iova_pfn(iovad, window->res->start - window->offset); - hi = iova_pfn(iovad, window->res->end - window->offset); - reserve_iova(iovad, lo, hi); + start = window->res->start - window->offset; + length = window->res->end - window->res->start + 1; + region = iommu_alloc_resv_region(start, length, 0, + IOMMU_RESV_RESERVED); + if (!region) + return; + + list_add_tail(®ion->list, list); } } +EXPORT_SYMBOL(iommu_dma_get_resv_regions); static int cookie_init_hw_msi_region(struct iommu_dma_cookie *cookie, phys_addr_t start, phys_addr_t end) @@ -218,9 +239,6 @@ static int iova_reserve_iommu_regions(struct device *dev, LIST_HEAD(resv_regions); int ret = 0; - if (dev_is_pci(dev)) - iova_reserve_pci_windows(to_pci_dev(dev), iovad); - iommu_get_resv_regions(dev, &resv_regions); list_for_each_entry(region, &resv_regions, list) { unsigned long lo, hi; diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 5725c94b1f12..b6635a46fc7c 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -71,6 +71,7 @@ int iommu_dma_mapping_error(struct device *dev, dma_addr_t dma_addr); /* The DMA API isn't _quite_ the whole story, though... 
*/ void iommu_dma_map_msi_msg(int irq, struct msi_msg *msg); +void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list); #else @@ -100,6 +101,10 @@ static inline void iommu_dma_map_msi_msg(int irq, struct msi_msg *msg) { } +static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list) +{ +} + #endif /* CONFIG_IOMMU_DMA */ #endif /* __KERNEL__ */ #endif /* __DMA_IOMMU_H */ -- cgit v1.2.3 From 4c355cdfbba537971b5c3849680b1b6453a7a383 Mon Sep 17 00:00:00 2001 From: "Reshetova, Elena" Date: Tue, 21 Mar 2017 13:59:19 +0200 Subject: net: convert sk_filter.refcnt from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows us to avoid accidental refcount overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/filter.h | 3 ++- net/core/filter.c | 17 ++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index dffa072b7b79..511fe910bf1d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -430,7 +431,7 @@ struct bpf_prog { }; struct sk_filter { - atomic_t refcnt; + refcount_t refcnt; struct rcu_head rcu; struct bpf_prog *prog; }; diff --git a/net/core/filter.c b/net/core/filter.c index ebaeaf2e46e8..c7f0ccd1c0d3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -928,7 +928,7 @@ static void sk_filter_release_rcu(struct rcu_head *rcu) */ static void sk_filter_release(struct sk_filter *fp) { - if (atomic_dec_and_test(&fp->refcnt)) + if (refcount_dec_and_test(&fp->refcnt)) call_rcu(&fp->rcu, sk_filter_release_rcu); } @@ -943,20 +943,27 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) /* try to charge the socket memory if there is space available * return true on success */ -bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) +static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) { u32 filter_size = bpf_prog_size(fp->prog->len); /* same check as in sock_kmalloc() */ if (filter_size <= sysctl_optmem_max && atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) { - atomic_inc(&fp->refcnt); atomic_add(filter_size, &sk->sk_omem_alloc); return true; } return false; } +bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) +{ + bool ret = __sk_filter_charge(sk, fp); + if (ret) + refcount_inc(&fp->refcnt); + return ret; +} + static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) { struct sock_filter *old_prog; @@ -1179,12 +1186,12 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) return -ENOMEM; fp->prog = prog; - atomic_set(&fp->refcnt, 0); - if (!sk_filter_charge(sk, fp)) { + if (!__sk_filter_charge(sk, fp)) { kfree(fp); return -ENOMEM; } + refcount_set(&fp->refcnt, 1); old_fp = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); -- cgit v1.2.3 From 2d72d5016f00fc7d64b95e79405787dea73669af Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 21 Mar 2017 16:12:11 +0100 Subject: net: stmmac: Use AVB mode by default Prior to the recent multi-queue changes, the driver would configure the queues to use the AVB mode, but the mode then got switched to DCB.
The hardware still works fine in DCB mode, but my testing capabilities are limited, so it's safer to revert to the prior setting anyway. Signed-off-by: Thierry Reding Acked-By: Joao Pinto Signed-off-by: David S. Miller --- include/linux/stmmac.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index cd98ee232ad1..3921cb9dfadb 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -56,8 +56,8 @@ #define MTL_RX_ALGORITHM_WSP 0x5 /* RX/TX Queue Mode */ -#define MTL_QUEUE_DCB 0x0 -#define MTL_QUEUE_AVB 0x1 +#define MTL_QUEUE_AVB 0x0 +#define MTL_QUEUE_DCB 0x1 /* The MDC clock could be set higher than the IEEE 802.3 * specified frequency limit 0f 2.5 MHz, by programming a clock divider -- cgit v1.2.3 From 9860118b58241169f67ba77dfeb935fcf53ce4cd Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 21 Mar 2017 16:36:37 +0000 Subject: net: phy: move phy MMD accessors to phy-core.c Move the phy_(read|write)_mmd() helpers out of line; they will become our main MMD accessor functions, and so will be a little more complex. This complexity doesn't belong in an inline function. Move the _indirect variants as well, to keep like functionality together. Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/Makefile | 2 +- drivers/net/phy/phy-core.c | 135 +++++++++++++++++++++++++++++++++++++++++++++ drivers/net/phy/phy.c | 85 ---------------------------- include/linux/phy.h | 20 +------ 4 files changed, 138 insertions(+), 104 deletions(-) create mode 100644 drivers/net/phy/phy-core.c (limited to 'include/linux') diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index 407b0b601ea8..82d915614646 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -1,7 +1,7 @@ # Makefile for Linux PHY drivers and MDIO bus drivers libphy-y := phy.o phy_device.o mdio_bus.o mdio_device.o \ - mdio-boardinfo.o + mdio-boardinfo.o phy-core.o libphy-$(CONFIG_SWPHY) += swphy.o libphy-$(CONFIG_LED_TRIGGER_PHY) += phy_led_triggers.o diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c new file mode 100644 index 000000000000..b8d8276a3099 --- /dev/null +++ b/drivers/net/phy/phy-core.c @@ -0,0 +1,135 @@ +/* + * Core PHY library, taken from phy.c + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ +#include +#include + +static inline void mmd_phy_indirect(struct mii_bus *bus, int prtad, int devad, + int addr) +{ + /* Write the desired MMD Devad */ + bus->write(bus, addr, MII_MMD_CTRL, devad); + + /* Write the desired MMD register address */ + bus->write(bus, addr, MII_MMD_DATA, prtad); + + /* Select the Function : DATA with no post increment */ + bus->write(bus, addr, MII_MMD_CTRL, (devad | MII_MMD_CTRL_NOINCR)); +} + +/** + * phy_read_mmd_indirect - reads data from the MMD registers + * @phydev: The PHY device bus + * @prtad: MMD Address + * @devad: MMD DEVAD + * + * Description: it reads data from the MMD registers (clause 22 to access to + * clause 45) of the specified phy address.
+ * To read these register we have: + * 1) Write reg 13 // DEVAD + * 2) Write reg 14 // MMD Address + * 3) Write reg 13 // MMD Data Command for MMD DEVAD + * 3) Read reg 14 // Read MMD data + */ +int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, int devad) +{ + struct phy_driver *phydrv = phydev->drv; + int addr = phydev->mdio.addr; + int value = -1; + + if (!phydrv->read_mmd_indirect) { + struct mii_bus *bus = phydev->mdio.bus; + + mutex_lock(&bus->mdio_lock); + mmd_phy_indirect(bus, prtad, devad, addr); + + /* Read the content of the MMD's selected register */ + value = bus->read(bus, addr, MII_MMD_DATA); + mutex_unlock(&bus->mdio_lock); + } else { + value = phydrv->read_mmd_indirect(phydev, prtad, devad, addr); + } + return value; +} +EXPORT_SYMBOL(phy_read_mmd_indirect); + +/** + * phy_read_mmd - Convenience function for reading a register + * from an MMD on a given PHY. + * @phydev: The phy_device struct + * @devad: The MMD to read from + * @regnum: The register on the MMD to read + * + * Same rules as for phy_read(); + */ +int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum) +{ + if (!phydev->is_c45) + return -EOPNOTSUPP; + + return mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, + MII_ADDR_C45 | (devad << 16) | (regnum & 0xffff)); +} +EXPORT_SYMBOL(phy_read_mmd); + +/** + * phy_write_mmd_indirect - writes data to the MMD registers + * @phydev: The PHY device + * @prtad: MMD Address + * @devad: MMD DEVAD + * @data: data to write in the MMD register + * + * Description: Write data from the MMD registers of the specified + * phy address. + * To write these register we have: + * 1) Write reg 13 // DEVAD + * 2) Write reg 14 // MMD Address + * 3) Write reg 13 // MMD Data Command for MMD DEVAD + * 3) Write reg 14 // Write MMD data + */ +void phy_write_mmd_indirect(struct phy_device *phydev, int prtad, + int devad, u32 data) +{ + struct phy_driver *phydrv = phydev->drv; + int addr = phydev->mdio.addr; + + if (!phydrv->write_mmd_indirect) { + struct mii_bus *bus = phydev->mdio.bus; + + mutex_lock(&bus->mdio_lock); + mmd_phy_indirect(bus, prtad, devad, addr); + + /* Write the data into MMD's selected register */ + bus->write(bus, addr, MII_MMD_DATA, data); + mutex_unlock(&bus->mdio_lock); + } else { + phydrv->write_mmd_indirect(phydev, prtad, devad, addr, data); + } +} +EXPORT_SYMBOL(phy_write_mmd_indirect); + +/** + * phy_write_mmd - Convenience function for writing a register + * on an MMD on a given PHY. 
+ * @phydev: The phy_device struct + * @devad: The MMD to read from + * @regnum: The register on the MMD to read + * @val: value to write to @regnum + * + * Same rules as for phy_write(); + */ +int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) +{ + if (!phydev->is_c45) + return -EOPNOTSUPP; + + regnum = MII_ADDR_C45 | ((devad & 0x1f) << 16) | (regnum & 0xffff); + + return mdiobus_write(phydev->mdio.bus, phydev->mdio.addr, regnum, val); +} +EXPORT_SYMBOL(phy_write_mmd); diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 1be69d8bc909..ffc28c42e2d1 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -1192,91 +1192,6 @@ void phy_mac_interrupt(struct phy_device *phydev, int new_link) } EXPORT_SYMBOL(phy_mac_interrupt); -static inline void mmd_phy_indirect(struct mii_bus *bus, int prtad, int devad, - int addr) -{ - /* Write the desired MMD Devad */ - bus->write(bus, addr, MII_MMD_CTRL, devad); - - /* Write the desired MMD register address */ - bus->write(bus, addr, MII_MMD_DATA, prtad); - - /* Select the Function : DATA with no post increment */ - bus->write(bus, addr, MII_MMD_CTRL, (devad | MII_MMD_CTRL_NOINCR)); -} - -/** - * phy_read_mmd_indirect - reads data from the MMD registers - * @phydev: The PHY device bus - * @prtad: MMD Address - * @devad: MMD DEVAD - * - * Description: it reads data from the MMD registers (clause 22 to access to - * clause 45) of the specified phy address. - * To read these register we have: - * 1) Write reg 13 // DEVAD - * 2) Write reg 14 // MMD Address - * 3) Write reg 13 // MMD Data Command for MMD DEVAD - * 3) Read reg 14 // Read MMD data - */ -int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, int devad) -{ - struct phy_driver *phydrv = phydev->drv; - int addr = phydev->mdio.addr; - int value = -1; - - if (!phydrv->read_mmd_indirect) { - struct mii_bus *bus = phydev->mdio.bus; - - mutex_lock(&bus->mdio_lock); - mmd_phy_indirect(bus, prtad, devad, addr); - - /* Read the content of the MMD's selected register */ - value = bus->read(bus, addr, MII_MMD_DATA); - mutex_unlock(&bus->mdio_lock); - } else { - value = phydrv->read_mmd_indirect(phydev, prtad, devad, addr); - } - return value; -} -EXPORT_SYMBOL(phy_read_mmd_indirect); - -/** - * phy_write_mmd_indirect - writes data to the MMD registers - * @phydev: The PHY device - * @prtad: MMD Address - * @devad: MMD DEVAD - * @data: data to write in the MMD register - * - * Description: Write data from the MMD registers of the specified - * phy address. 
- * To write these register we have: - * 1) Write reg 13 // DEVAD - * 2) Write reg 14 // MMD Address - * 3) Write reg 13 // MMD Data Command for MMD DEVAD - * 3) Write reg 14 // Write MMD data - */ -void phy_write_mmd_indirect(struct phy_device *phydev, int prtad, - int devad, u32 data) -{ - struct phy_driver *phydrv = phydev->drv; - int addr = phydev->mdio.addr; - - if (!phydrv->write_mmd_indirect) { - struct mii_bus *bus = phydev->mdio.bus; - - mutex_lock(&bus->mdio_lock); - mmd_phy_indirect(bus, prtad, devad, addr); - - /* Write the data into MMD's selected register */ - bus->write(bus, addr, MII_MMD_DATA, data); - mutex_unlock(&bus->mdio_lock); - } else { - phydrv->write_mmd_indirect(phydev, prtad, devad, addr, data); - } -} -EXPORT_SYMBOL(phy_write_mmd_indirect); - /** * phy_init_eee - init and check the EEE feature * @phydev: target phy_device struct diff --git a/include/linux/phy.h b/include/linux/phy.h index 43a774873aa9..bcb4549b41d6 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -651,14 +651,7 @@ struct phy_fixup { * * Same rules as for phy_read(); */ -static inline int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum) -{ - if (!phydev->is_c45) - return -EOPNOTSUPP; - - return mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, - MII_ADDR_C45 | (devad << 16) | (regnum & 0xffff)); -} +int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); /** * phy_read_mmd_indirect - reads data from the MMD registers @@ -752,16 +745,7 @@ static inline bool phy_is_pseudo_fixed_link(struct phy_device *phydev) * * Same rules as for phy_write(); */ -static inline int phy_write_mmd(struct phy_device *phydev, int devad, - u32 regnum, u16 val) -{ - if (!phydev->is_c45) - return -EOPNOTSUPP; - - regnum = MII_ADDR_C45 | ((devad & 0x1f) << 16) | (regnum & 0xffff); - - return mdiobus_write(phydev->mdio.bus, phydev->mdio.addr, regnum, val); -} +int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val); /** * phy_write_mmd_indirect - writes data to the MMD registers -- cgit v1.2.3 From 1ee6b9bc6206cd0837bc16e46f580e40fe663384 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 21 Mar 2017 16:36:43 +0000 Subject: net: phy: make phy_(read|write)_mmd() generic MMD accessors Make phy_(read|write)_mmd() generic 802.3 clause 45 register accessors for both Clause 22 and Clause 45 PHYs, using either the direct register reading for Clause 45, or the indirect method for Clause 22 PHYs. Allow this behaviour to be overridden by PHY drivers where necessary. Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Russell King Signed-off-by: David S.
Miller --- drivers/net/phy/phy-core.c | 33 +++++++++++++++++++++++++-------- include/linux/phy.h | 24 ++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index b8d8276a3099..d791100afab2 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -69,11 +69,18 @@ EXPORT_SYMBOL(phy_read_mmd_indirect); */ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum) { - if (!phydev->is_c45) - return -EOPNOTSUPP; + if (regnum > (u16)~0 || devad > 32) + return -EINVAL; - return mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, - MII_ADDR_C45 | (devad << 16) | (regnum & 0xffff)); + if (phydev->drv->read_mmd) + return phydev->drv->read_mmd(phydev, devad, regnum); + + if (phydev->is_c45) { + u32 addr = MII_ADDR_C45 | (devad << 16) | (regnum & 0xffff); + return mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, addr); + } + + return phy_read_mmd_indirect(phydev, regnum, devad); } EXPORT_SYMBOL(phy_read_mmd); @@ -125,11 +132,21 @@ EXPORT_SYMBOL(phy_write_mmd_indirect); */ int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) { - if (!phydev->is_c45) - return -EOPNOTSUPP; + if (regnum > (u16)~0 || devad > 32) + return -EINVAL; + + if (phydev->drv->read_mmd) + return phydev->drv->write_mmd(phydev, devad, regnum, val); + + if (phydev->is_c45) { + u32 addr = MII_ADDR_C45 | (devad << 16) | (regnum & 0xffff); + + return mdiobus_write(phydev->mdio.bus, phydev->mdio.addr, + addr, val); + } - regnum = MII_ADDR_C45 | ((devad & 0x1f) << 16) | (regnum & 0xffff); + phy_write_mmd_indirect(phydev, regnum, devad, val); - return mdiobus_write(phydev->mdio.bus, phydev->mdio.addr, regnum, val); + return 0; } EXPORT_SYMBOL(phy_write_mmd); diff --git a/include/linux/phy.h b/include/linux/phy.h index bcb4549b41d6..b8feeffeb64c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -587,6 +587,30 @@ struct phy_driver { */ void (*link_change_notify)(struct phy_device *dev); + /* + * Phy specific driver override for reading a MMD register. + * This function is optional for PHY specific drivers. When + * not provided, the default MMD read function will be used + * by phy_read_mmd(), which will use either a direct read for + * Clause 45 PHYs or an indirect read for Clause 22 PHYs. + * devnum is the MMD device number within the PHY device, + * regnum is the register within the selected MMD device. + */ + int (*read_mmd)(struct phy_device *dev, int devnum, u16 regnum); + + /* + * Phy specific driver override for writing a MMD register. + * This function is optional for PHY specific drivers. When + * not provided, the default MMD write function will be used + * by phy_write_mmd(), which will use either a direct write for + * Clause 45 PHYs, or an indirect write for Clause 22 PHYs. + * devnum is the MMD device number within the PHY device, + * regnum is the register within the selected MMD device. + * val is the value to be written. + */ + int (*write_mmd)(struct phy_device *dev, int devnum, u16 regnum, + u16 val); + /* A function provided by a phy specific driver to override the * the PHY driver framework support for reading a MMD register * from the PHY. If not supported, return -1. 
This function is -- cgit v1.2.3 From 3b85d8df2655a4a5831ee8233108b53e69efa1ed Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 21 Mar 2017 16:37:03 +0000 Subject: net: phy: remove the indirect MMD read/write methods Remove the indirect MMD read/write methods which are now no longer necessary. Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/phy-core.c | 119 +++++++++++++-------------------------------- include/linux/phy.h | 42 ---------------- 2 files changed, 34 insertions(+), 127 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index d791100afab2..80795ccd3fab 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -22,103 +22,42 @@ static inline void mmd_phy_indirect(struct mii_bus *bus, int prtad, int devad, bus->write(bus, addr, MII_MMD_CTRL, (devad | MII_MMD_CTRL_NOINCR)); } -/** - * phy_read_mmd_indirect - reads data from the MMD registers - * @phydev: The PHY device bus - * @prtad: MMD Address - * @devad: MMD DEVAD - * - * Description: it reads data from the MMD registers (clause 22 to access to - * clause 45) of the specified phy address. - * To read these register we have: - * 1) Write reg 13 // DEVAD - * 2) Write reg 14 // MMD Address - * 3) Write reg 13 // MMD Data Command for MMD DEVAD - * 3) Read reg 14 // Read MMD data - */ -int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, int devad) -{ - struct phy_driver *phydrv = phydev->drv; - int addr = phydev->mdio.addr; - int value = -1; - - if (!phydrv->read_mmd_indirect) { - struct mii_bus *bus = phydev->mdio.bus; - - mutex_lock(&bus->mdio_lock); - mmd_phy_indirect(bus, prtad, devad, addr); - - /* Read the content of the MMD's selected register */ - value = bus->read(bus, addr, MII_MMD_DATA); - mutex_unlock(&bus->mdio_lock); - } else { - value = phydrv->read_mmd_indirect(phydev, prtad, devad, addr); - } - return value; -} -EXPORT_SYMBOL(phy_read_mmd_indirect); - /** * phy_read_mmd - Convenience function for reading a register * from an MMD on a given PHY. * @phydev: The phy_device struct - * @devad: The MMD to read from - * @regnum: The register on the MMD to read + * @devad: The MMD to read from (0..31) + * @regnum: The register on the MMD to read (0..65535) * * Same rules as for phy_read(); */ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum) { + int val; + if (regnum > (u16)~0 || devad > 32) return -EINVAL; - if (phydev->drv->read_mmd) - return phydev->drv->read_mmd(phydev, devad, regnum); - - if (phydev->is_c45) { + if (phydev->drv->read_mmd) { + val = phydev->drv->read_mmd(phydev, devad, regnum); + } else if (phydev->is_c45) { u32 addr = MII_ADDR_C45 | (devad << 16) | (regnum & 0xffff); - return mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, addr); - } - return phy_read_mmd_indirect(phydev, regnum, devad); -} -EXPORT_SYMBOL(phy_read_mmd); - -/** - * phy_write_mmd_indirect - writes data to the MMD registers - * @phydev: The PHY device - * @prtad: MMD Address - * @devad: MMD DEVAD - * @data: data to write in the MMD register - * - * Description: Write data from the MMD registers of the specified - * phy address. 
- * To write these register we have: - * 1) Write reg 13 // DEVAD - * 2) Write reg 14 // MMD Address - * 3) Write reg 13 // MMD Data Command for MMD DEVAD - * 3) Write reg 14 // Write MMD data - */ -void phy_write_mmd_indirect(struct phy_device *phydev, int prtad, - int devad, u32 data) -{ - struct phy_driver *phydrv = phydev->drv; - int addr = phydev->mdio.addr; - - if (!phydrv->write_mmd_indirect) { + val = mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, addr); + } else { struct mii_bus *bus = phydev->mdio.bus; + int phy_addr = phydev->mdio.addr; mutex_lock(&bus->mdio_lock); - mmd_phy_indirect(bus, prtad, devad, addr); + mmd_phy_indirect(bus, regnum, devad, phy_addr); - /* Write the data into MMD's selected register */ - bus->write(bus, addr, MII_MMD_DATA, data); + /* Read the content of the MMD's selected register */ + val = bus->read(bus, phy_addr, MII_MMD_DATA); mutex_unlock(&bus->mdio_lock); - } else { - phydrv->write_mmd_indirect(phydev, prtad, devad, addr, data); } + return val; } -EXPORT_SYMBOL(phy_write_mmd_indirect); +EXPORT_SYMBOL(phy_read_mmd); /** * phy_write_mmd - Convenience function for writing a register @@ -132,21 +71,31 @@ EXPORT_SYMBOL(phy_write_mmd_indirect); */ int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) { + int ret; + if (regnum > (u16)~0 || devad > 32) return -EINVAL; - if (phydev->drv->read_mmd) - return phydev->drv->write_mmd(phydev, devad, regnum, val); - - if (phydev->is_c45) { + if (phydev->drv->read_mmd) { + ret = phydev->drv->write_mmd(phydev, devad, regnum, val); + } else if (phydev->is_c45) { u32 addr = MII_ADDR_C45 | (devad << 16) | (regnum & 0xffff); - return mdiobus_write(phydev->mdio.bus, phydev->mdio.addr, - addr, val); - } + ret = mdiobus_write(phydev->mdio.bus, phydev->mdio.addr, + addr, val); + } else { + struct mii_bus *bus = phydev->mdio.bus; + int phy_addr = phydev->mdio.addr; - phy_write_mmd_indirect(phydev, regnum, devad, val); + mutex_lock(&bus->mdio_lock); + mmd_phy_indirect(bus, regnum, devad, phy_addr); - return 0; + /* Write the data into MMD's selected register */ + bus->write(bus, phy_addr, MII_MMD_DATA, val); + mutex_unlock(&bus->mdio_lock); + + ret = 0; + } + return ret; } EXPORT_SYMBOL(phy_write_mmd); diff --git a/include/linux/phy.h b/include/linux/phy.h index b8feeffeb64c..2efca6b39fba 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -611,24 +611,6 @@ struct phy_driver { int (*write_mmd)(struct phy_device *dev, int devnum, u16 regnum, u16 val); - /* A function provided by a phy specific driver to override the - * the PHY driver framework support for reading a MMD register - * from the PHY. If not supported, return -1. This function is - * optional for PHY specific drivers, if not provided then the - * default MMD read function is used by the PHY framework. - */ - int (*read_mmd_indirect)(struct phy_device *dev, int ptrad, - int devnum, int regnum); - - /* A function provided by a phy specific driver to override the - * the PHY driver framework support for writing a MMD register - * from the PHY. This function is optional for PHY specific drivers, - * if not provided then the default MMD read function is used by - * the PHY framework. 
- */ - void (*write_mmd_indirect)(struct phy_device *dev, int ptrad, - int devnum, int regnum, u32 val); - /* Get the size and type of the eeprom contained within a plug-in * module */ int (*module_info)(struct phy_device *dev, @@ -677,17 +659,6 @@ struct phy_fixup { */ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); -/** - * phy_read_mmd_indirect - reads data from the MMD registers - * @phydev: The PHY device bus - * @prtad: MMD Address - * @addr: PHY address on the MII bus - * - * Description: it reads data from the MMD registers (clause 22 to access to - * clause 45) of the specified phy address. - */ -int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, int devad); - /** * phy_read - Convenience function for reading a given PHY register * @phydev: the phy_device struct @@ -771,19 +742,6 @@ static inline bool phy_is_pseudo_fixed_link(struct phy_device *phydev) */ int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val); -/** - * phy_write_mmd_indirect - writes data to the MMD registers - * @phydev: The PHY device - * @prtad: MMD Address - * @devad: MMD DEVAD - * @data: data to write in the MMD register - * - * Description: Write data from the MMD registers of the specified - * phy address. - */ -void phy_write_mmd_indirect(struct phy_device *phydev, int prtad, - int devad, u32 data); - struct phy_device *phy_device_create(struct mii_bus *bus, int addr, int phy_id, bool is_c45, struct phy_c45_device_ids *c45_ids); -- cgit v1.2.3 From 0634c2958927198797bf9e55d26fb51cce4c22b4 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 22 Mar 2017 09:16:27 -0500 Subject: of: Add function for generating a DT modalias with a newline The modalias sysfs attr is lacking a newline for DT aliases on platform devices. The macio and ibmebus correctly add the newline, but open code it. Introduce a new function, of_device_modalias(), that fills the buffer with the modalias including the newline and update users of the old of_device_get_modalias function. 
Signed-off-by: Rob Herring Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Frank Rowand Cc: linuxppc-dev@lists.ozlabs.org Acked-by: Greg Kroah-Hartman --- arch/powerpc/platforms/pseries/ibmebus.c | 5 +---- drivers/base/platform.c | 2 +- drivers/macintosh/macio_sysfs.c | 7 +------ drivers/of/device.c | 18 ++++++++++++++++-- drivers/tty/serdev/core.c | 5 +---- drivers/usb/common/ulpi.c | 2 +- include/linux/of_device.h | 7 +++---- 7 files changed, 24 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/pseries/ibmebus.c b/arch/powerpc/platforms/pseries/ibmebus.c index 99a6bf7f3bcf..b363e439ddb9 100644 --- a/arch/powerpc/platforms/pseries/ibmebus.c +++ b/arch/powerpc/platforms/pseries/ibmebus.c @@ -410,10 +410,7 @@ static ssize_t name_show(struct device *dev, static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) { - ssize_t len = of_device_get_modalias(dev, buf, PAGE_SIZE - 2); - buf[len] = '\n'; - buf[len+1] = 0; - return len+1; + return of_device_modalias(dev, buf, PAGE_SIZE); } static struct device_attribute ibmebus_bus_device_attrs[] = { diff --git a/drivers/base/platform.c b/drivers/base/platform.c index c2456839214a..a102152301c8 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -847,7 +847,7 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *a, struct platform_device *pdev = to_platform_device(dev); int len; - len = of_device_get_modalias(dev, buf, PAGE_SIZE -1); + len = of_device_modalias(dev, buf, PAGE_SIZE); if (len != -ENODEV) return len; diff --git a/drivers/macintosh/macio_sysfs.c b/drivers/macintosh/macio_sysfs.c index 8eb40afbd0f5..0b1f9c76c68d 100644 --- a/drivers/macintosh/macio_sysfs.c +++ b/drivers/macintosh/macio_sysfs.c @@ -41,12 +41,7 @@ compatible_show (struct device *dev, struct device_attribute *attr, char *buf) static ssize_t modalias_show (struct device *dev, struct device_attribute *attr, char *buf) { - int len = of_device_get_modalias(dev, buf, PAGE_SIZE - 2); - - buf[len] = '\n'; - buf[len+1] = 0; - - return len+1; + return of_device_modalias(dev, buf, PAGE_SIZE); } static ssize_t devspec_show(struct device *dev, diff --git a/drivers/of/device.c b/drivers/of/device.c index 210432a3c4ec..6e2f9113b1b7 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -176,7 +176,7 @@ const void *of_device_get_match_data(const struct device *dev) } EXPORT_SYMBOL(of_device_get_match_data); -ssize_t of_device_get_modalias(struct device *dev, char *str, ssize_t len) +static ssize_t of_device_get_modalias(struct device *dev, char *str, ssize_t len) { const char *compat; int cplen, i; @@ -225,7 +225,6 @@ ssize_t of_device_get_modalias(struct device *dev, char *str, ssize_t len) return repend; } -EXPORT_SYMBOL_GPL(of_device_get_modalias); int of_device_request_module(struct device *dev) { @@ -250,6 +249,21 @@ int of_device_request_module(struct device *dev) } EXPORT_SYMBOL_GPL(of_device_request_module); +/** + * of_device_modalias - Fill buffer with newline terminated modalias string + */ +ssize_t of_device_modalias(struct device *dev, char *str, ssize_t len) +{ + ssize_t sl = of_device_get_modalias(dev, str, len - 2); + if (sl < 0) + return sl; + + str[sl++] = '\n'; + str[sl] = 0; + return sl; +} +EXPORT_SYMBOL_GPL(of_device_modalias); + /** * of_device_uevent - Display OF related uevent information */ diff --git a/drivers/tty/serdev/core.c b/drivers/tty/serdev/core.c index f4c6c90add78..531aa89ff243 100644 --- 
a/drivers/tty/serdev/core.c +++ b/drivers/tty/serdev/core.c @@ -191,10 +191,7 @@ static int serdev_drv_remove(struct device *dev) static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) { - ssize_t len = of_device_get_modalias(dev, buf, PAGE_SIZE - 2); - buf[len] = '\n'; - buf[len+1] = 0; - return len+1; + return of_device_modalias(dev, buf, PAGE_SIZE); } static struct device_attribute serdev_device_attrs[] = { diff --git a/drivers/usb/common/ulpi.c b/drivers/usb/common/ulpi.c index c9480d77810c..930e8f35f8df 100644 --- a/drivers/usb/common/ulpi.c +++ b/drivers/usb/common/ulpi.c @@ -107,7 +107,7 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, int len; struct ulpi *ulpi = to_ulpi_dev(dev); - len = of_device_get_modalias(dev, buf, PAGE_SIZE - 1); + len = of_device_modalias(dev, buf, PAGE_SIZE); if (len != -ENODEV) return len; diff --git a/include/linux/of_device.h b/include/linux/of_device.h index c12dace043f3..169ea0bd8eb4 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -34,8 +34,7 @@ extern void of_device_unregister(struct platform_device *ofdev); extern const void *of_device_get_match_data(const struct device *dev); -extern ssize_t of_device_get_modalias(struct device *dev, - char *str, ssize_t len); +extern ssize_t of_device_modalias(struct device *dev, char *str, ssize_t len); extern int of_device_request_module(struct device *dev); extern void of_device_uevent(struct device *dev, struct kobj_uevent_env *env); @@ -72,8 +71,8 @@ static inline const void *of_device_get_match_data(const struct device *dev) return NULL; } -static inline int of_device_get_modalias(struct device *dev, - char *str, ssize_t len) +static inline int of_device_modalias(struct device *dev, + char *str, ssize_t len) { return -ENODEV; } -- cgit v1.2.3 From bbea124bc99df968011e76eba105fe964a4eceab Mon Sep 17 00:00:00 2001 From: Joel Scherpelz Date: Wed, 22 Mar 2017 18:19:04 +0900 Subject: net: ipv6: Add sysctl for minimum prefix len acceptable in RIOs. This commit adds a new sysctl accept_ra_rt_info_min_plen that defines the minimum acceptable prefix length of Route Information Options. The new sysctl is intended to be used together with accept_ra_rt_info_max_plen to configure a range of acceptable prefix lengths. It is useful to prevent misconfigurations from unintentionally blackholing too much of the IPv6 address space (e.g., home routers announcing RIOs for fc00::/7, which is incorrect). Signed-off-by: Joel Scherpelz Acked-by: Lorenzo Colitti Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 13 +++++++++++-- include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + include/uapi/linux/sysctl.h | 1 + net/ipv6/addrconf.c | 10 ++++++++++ net/ipv6/ndisc.c | 2 ++ 6 files changed, 26 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index b57308e76b1d..eaee2c8d4c00 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1461,11 +1461,20 @@ accept_ra_pinfo - BOOLEAN Functional default: enabled if accept_ra is enabled. disabled if accept_ra is disabled. +accept_ra_rt_info_min_plen - INTEGER + Minimum prefix length of Route Information in RA. + + Route Information w/ prefix smaller than this variable shall + be ignored. + + Functional default: 0 if accept_ra_rtr_pref is enabled. + -1 if accept_ra_rtr_pref is disabled. 
+ accept_ra_rt_info_max_plen - INTEGER Maximum prefix length of Route Information in RA. - Route Information w/ prefix larger than or equal to this - variable shall be ignored. + Route Information w/ prefix larger than this variable shall + be ignored. Functional default: 0 if accept_ra_rtr_pref is enabled. -1 if accept_ra_rtr_pref is disabled. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index f0d79bd054ca..e1b442996f81 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -37,6 +37,7 @@ struct ipv6_devconf { __s32 accept_ra_rtr_pref; __s32 rtr_probe_interval; #ifdef CONFIG_IPV6_ROUTE_INFO + __s32 accept_ra_rt_info_min_plen; __s32 accept_ra_rt_info_max_plen; #endif #endif diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index d8f6a1ac9af4..2ae59178189d 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -184,6 +184,7 @@ enum { DEVCONF_ENHANCED_DAD, DEVCONF_ADDR_GEN_MODE, DEVCONF_DISABLE_POLICY, + DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN, DEVCONF_MAX }; diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index d2b12152e358..e13d48058b8d 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -568,6 +568,7 @@ enum { NET_IPV6_PROXY_NDP=23, NET_IPV6_ACCEPT_SOURCE_ROUTE=25, NET_IPV6_ACCEPT_RA_FROM_LOCAL=26, + NET_IPV6_ACCEPT_RA_RT_INFO_MIN_PLEN=27, __NET_IPV6_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 8c69768a5c46..dff5beb26a01 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -224,6 +224,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_ra_rtr_pref = 1, .rtr_probe_interval = 60 * HZ, #ifdef CONFIG_IPV6_ROUTE_INFO + .accept_ra_rt_info_min_plen = 0, .accept_ra_rt_info_max_plen = 0, #endif #endif @@ -277,6 +278,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .accept_ra_rtr_pref = 1, .rtr_probe_interval = 60 * HZ, #ifdef CONFIG_IPV6_ROUTE_INFO + .accept_ra_rt_info_min_plen = 0, .accept_ra_rt_info_max_plen = 0, #endif #endif @@ -4979,6 +4981,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_RTR_PROBE_INTERVAL] = jiffies_to_msecs(cnf->rtr_probe_interval); #ifdef CONFIG_IPV6_ROUTE_INFO + array[DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN] = cnf->accept_ra_rt_info_min_plen; array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen; #endif #endif @@ -6121,6 +6124,13 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec_jiffies, }, #ifdef CONFIG_IPV6_ROUTE_INFO + { + .procname = "accept_ra_rt_info_min_plen", + .data = &ipv6_devconf.accept_ra_rt_info_min_plen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "accept_ra_rt_info_max_plen", .data = &ipv6_devconf.accept_ra_rt_info_max_plen, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 112ccbc0a8ac..b5812b3f7539 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1418,6 +1418,8 @@ skip_linkparms: if (ri->prefix_len == 0 && !in6_dev->cnf.accept_ra_defrtr) continue; + if (ri->prefix_len < in6_dev->cnf.accept_ra_rt_info_min_plen) + continue; if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) continue; rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3, -- cgit v1.2.3 From 56f668dfe00dcf086734f1c42ea999398fad6572 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Mar 2017 10:00:33 -0700 Subject: bpf: Add array of maps support This patch adds a few helper funcs to enable map-in-map support (i.e. outer_map->inner_map). 
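By way of illustration, creating an outer map from userspace could look roughly like the sketch below. It is illustrative only, not part of the patch: it leans on the BPF_MAP_CREATE attributes and the array-of-maps type introduced below, uses the plain syscall(2) wrapper, and omits error handling:

	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/bpf.h>

	int main(void)
	{
		/* an ordinary array map serves as the inner map template */
		union bpf_attr inner = {
			.map_type    = BPF_MAP_TYPE_ARRAY,
			.key_size    = sizeof(__u32),
			.value_size  = sizeof(__u64),
			.max_entries = 16,
		};
		int inner_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &inner, sizeof(inner));

		/* outer map values are u32 map fds; inner_map_fd seeds its metadata */
		union bpf_attr outer = {
			.map_type     = BPF_MAP_TYPE_ARRAY_OF_MAPS,
			.key_size     = sizeof(__u32),
			.value_size   = sizeof(__u32),	/* fd of an inner map */
			.max_entries  = 8,
			.inner_map_fd = inner_fd,
		};
		int outer_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &outer, sizeof(outer));

		return outer_fd < 0;
	}

Every inner map stored into the outer map afterwards has to match this template's map_type, key_size, value_size, map_flags and max_entries, per the '==' checks in bpf_map_meta_equal() below.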
The first outer_map type BPF_MAP_TYPE_ARRAY_OF_MAPS is also added in this patch. The next patch will introduce a hash of maps type. Any bpf map type can act as an inner_map. The exception is BPF_MAP_TYPE_PROG_ARRAY because the extra level of indirection makes it harder to verify the owner_prog_type and owner_jited. Multi-level map-in-map is not supported (i.e. map->map is ok but not map->map->map). When adding an inner_map to an outer_map, it currently checks the map_type, key_size, value_size, map_flags, max_entries and ops. The verifier also uses those map properties to do static analysis. map_flags is needed because we need to ensure BPF_PROG_TYPE_PERF_EVENT is using a preallocated hashtab for the inner_hash also. ops and max_entries are needed to generate inlined map-lookup instructions. For simplicity, a simple '==' test is used for both map_flags and max_entries. The equality of ops is implied by the equality of map_type. At outer_map creation time, an inner_map_fd is needed to create the outer_map. However, the inner_map_fd's lifetime does not depend on the outer_map. The inner_map_fd is merely used to initialize the inner_map_meta of the outer_map. Also, for the outer_map: * It allows element update and delete from syscall * It allows element lookup from bpf_prog The above is similar to the current fd_array pattern. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 2 + kernel/bpf/Makefile | 2 +- kernel/bpf/arraymap.c | 63 +++++++++++++++++++++++++++++++ kernel/bpf/map_in_map.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/map_in_map.h | 23 ++++++++++++ kernel/bpf/syscall.c | 7 +++- kernel/bpf/verifier.c | 42 ++++++++++++++++----- 8 files changed, 225 insertions(+), 12 deletions(-) create mode 100644 kernel/bpf/map_in_map.c create mode 100644 kernel/bpf/map_in_map.h (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index da8c64ca8dc9..3f3cdf9b15e8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -50,6 +50,7 @@ struct bpf_map { const struct bpf_map_ops *ops; struct work_struct work; atomic_t usercnt; + struct bpf_map *inner_map_meta; }; struct bpf_map_type_list { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0539a0ceef38..1701ec1e7de3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -96,6 +96,7 @@ enum bpf_map_type { BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_MAP_TYPE_LPM_TRIE, + BPF_MAP_TYPE_ARRAY_OF_MAPS, }; enum bpf_prog_type { @@ -152,6 +153,7 @@ union bpf_attr { __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ __u32 map_flags; /* prealloc or not */ + __u32 inner_map_fd; /* fd pointing to the inner map */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e1ce4f4fd7fd..e1e5e658f2db 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index
4d7d5d0ed76a..bc9da93db403 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -17,6 +17,8 @@ #include #include +#include "map_in_map.h" + static void bpf_array_free_percpu(struct bpf_array *array) { int i; @@ -602,3 +604,64 @@ static int __init register_cgroup_array_map(void) } late_initcall(register_cgroup_array_map); #endif + +static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map, *inner_map_meta; + + inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); + if (IS_ERR(inner_map_meta)) + return inner_map_meta; + + map = fd_array_map_alloc(attr); + if (IS_ERR(map)) { + bpf_map_meta_free(inner_map_meta); + return map; + } + + map->inner_map_meta = inner_map_meta; + + return map; +} + +static void array_of_map_free(struct bpf_map *map) +{ + /* map->inner_map_meta is only accessed by syscall which + * is protected by fdget/fdput. + */ + bpf_map_meta_free(map->inner_map_meta); + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + +static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_map **inner_map = array_map_lookup_elem(map, key); + + if (!inner_map) + return NULL; + + return READ_ONCE(*inner_map); +} + +static const struct bpf_map_ops array_of_map_ops = { + .map_alloc = array_of_map_alloc, + .map_free = array_of_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = array_of_map_lookup_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = bpf_map_fd_get_ptr, + .map_fd_put_ptr = bpf_map_fd_put_ptr, +}; + +static struct bpf_map_type_list array_of_map_type __ro_after_init = { + .ops = &array_of_map_ops, + .type = BPF_MAP_TYPE_ARRAY_OF_MAPS, +}; + +static int __init register_array_of_map(void) +{ + bpf_register_map_type(&array_of_map_type); + return 0; +} +late_initcall(register_array_of_map); diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c new file mode 100644 index 000000000000..59bcdf821ae4 --- /dev/null +++ b/kernel/bpf/map_in_map.c @@ -0,0 +1,97 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include + +#include "map_in_map.h" + +struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) +{ + struct bpf_map *inner_map, *inner_map_meta; + struct fd f; + + f = fdget(inner_map_ufd); + inner_map = __bpf_map_get(f); + if (IS_ERR(inner_map)) + return inner_map; + + /* prog_array->owner_prog_type and owner_jited + * is a runtime binding. Doing static check alone + * in the verifier is not enough. 
+ */ + if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + fdput(f); + return ERR_PTR(-ENOTSUPP); + } + + /* Does not support >1 level map-in-map */ + if (inner_map->inner_map_meta) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER); + if (!inner_map_meta) { + fdput(f); + return ERR_PTR(-ENOMEM); + } + + inner_map_meta->map_type = inner_map->map_type; + inner_map_meta->key_size = inner_map->key_size; + inner_map_meta->value_size = inner_map->value_size; + inner_map_meta->map_flags = inner_map->map_flags; + inner_map_meta->ops = inner_map->ops; + inner_map_meta->max_entries = inner_map->max_entries; + + fdput(f); + return inner_map_meta; +} + +void bpf_map_meta_free(struct bpf_map *map_meta) +{ + kfree(map_meta); +} + +bool bpf_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1) +{ + /* No need to compare ops because it is covered by map_type */ + return meta0->map_type == meta1->map_type && + meta0->key_size == meta1->key_size && + meta0->value_size == meta1->value_size && + meta0->map_flags == meta1->map_flags && + meta0->max_entries == meta1->max_entries; +} + +void *bpf_map_fd_get_ptr(struct bpf_map *map, + struct file *map_file /* not used */, + int ufd) +{ + struct bpf_map *inner_map; + struct fd f; + + f = fdget(ufd); + inner_map = __bpf_map_get(f); + if (IS_ERR(inner_map)) + return inner_map; + + if (bpf_map_meta_equal(map->inner_map_meta, inner_map)) + inner_map = bpf_map_inc(inner_map, false); + else + inner_map = ERR_PTR(-EINVAL); + + fdput(f); + return inner_map; +} + +void bpf_map_fd_put_ptr(void *ptr) +{ + /* ptr->ops->map_free() has to go through one + * rcu grace period by itself. + */ + bpf_map_put(ptr); +} diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h new file mode 100644 index 000000000000..177fadb689dc --- /dev/null +++ b/kernel/bpf/map_in_map.h @@ -0,0 +1,23 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#ifndef __MAP_IN_MAP_H__ +#define __MAP_IN_MAP_H__ + +#include + +struct file; +struct bpf_map; + +struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); +void bpf_map_meta_free(struct bpf_map *map_meta); +bool bpf_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1); +void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, + int ufd); +void bpf_map_fd_put_ptr(void *ptr); + +#endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 48c914b983bd..6e24fdf1f373 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -215,7 +215,7 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD map_flags +#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -352,6 +352,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_array_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { + err = -ENOTSUPP; } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); @@ -438,7 +440,8 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_array_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { + map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || + map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9bf82267f2f9..3b8f528c5473 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1199,6 +1199,9 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + if (func_id != BPF_FUNC_map_lookup_elem) + goto error; default: break; } @@ -2101,14 +2104,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, struct bpf_reg_state *reg = ®s[regno]; if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { - reg->type = type; + if (type == UNKNOWN_VALUE) { + __mark_reg_unknown_value(regs, regno); + } else if (reg->map_ptr->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = reg->map_ptr->inner_map_meta; + } else { + reg->type = type; + } /* We don't need id from this point onwards anymore, thus we * should better reset it, so that state pruning has chances * to take effect. 
*/ reg->id = 0; - if (type == UNKNOWN_VALUE) - __mark_reg_unknown_value(regs, regno); } } @@ -3033,16 +3041,32 @@ process_bpf_exit: return 0; } +static int check_map_prealloc(struct bpf_map *map) +{ + return (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_PERCPU_HASH) || + !(map->map_flags & BPF_F_NO_PREALLOC); +} + static int check_map_prog_compatibility(struct bpf_map *map, struct bpf_prog *prog) { - if (prog->type == BPF_PROG_TYPE_PERF_EVENT && - (map->map_type == BPF_MAP_TYPE_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && - (map->map_flags & BPF_F_NO_PREALLOC)) { - verbose("perf_event programs can only use preallocated hash map\n"); - return -EINVAL; + /* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use + * preallocated hash maps, since doing memory allocation + * in overflow_handler can crash depending on where nmi got + * triggered. + */ + if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { + if (!check_map_prealloc(map)) { + verbose("perf_event programs can only use preallocated hash map\n"); + return -EINVAL; + } + if (map->inner_map_meta && + !check_map_prealloc(map->inner_map_meta)) { + verbose("perf_event programs can only use preallocated inner hash map\n"); + return -EINVAL; + } } return 0; } -- cgit v1.2.3 From bcc6b1b7ebf857a9fe56202e2be3361131588c15 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Mar 2017 10:00:34 -0700 Subject: bpf: Add hash of maps support This patch adds hash of maps support (hashmap->bpf_map). BPF_MAP_TYPE_HASH_OF_MAPS is added. A map-in-map contains a pointer to another map; let's call this pointer 'inner_map_ptr'. Notes on deleting inner_map_ptr from a hash map: 1. For BPF_F_NO_PREALLOC map-in-map, when deleting an inner_map_ptr, the htab_elem itself will go through a rcu grace period and the inner_map_ptr resides in the htab_elem. 2. For pre-allocated htab_elem (!BPF_F_NO_PREALLOC), when deleting an inner_map_ptr, the htab_elem may get reused immediately. This situation is similar to the existing preallocated use cases. However, the bpf_map_fd_put_ptr() calls bpf_map_put() which calls inner_map->ops->map_free(inner_map) which will go through a rcu grace period (i.e. all bpf_map's map_free currently goes through a rcu grace period). Hence, the inner_map_ptr is still safe for the rcu reader side. This patch also adds BPF_MAP_TYPE_HASH_OF_MAPS to the check_map_prealloc() in the verifier. Preallocation is a must for BPF_PROG_TYPE_PERF_EVENT. Hence, even though we don't expect heavy updates to map-in-map, enforcing BPF_F_NO_PREALLOC for map-in-map is impossible without disallowing BPF_PROG_TYPE_PERF_EVENT from using map-in-map first. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S.
Miller --- include/linux/bpf.h | 2 + include/uapi/linux/bpf.h | 1 + kernel/bpf/hashtab.c | 121 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 8 +++- kernel/bpf/verifier.c | 4 +- 5 files changed, 134 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3f3cdf9b15e8..2ae39a3e9ead 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -277,6 +277,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); void bpf_fd_array_map_clear(struct bpf_map *map); +int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, + void *key, void *value, u64 map_flags); /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1701ec1e7de3..ce6f029ac368 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -97,6 +97,7 @@ enum bpf_map_type { BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_MAP_TYPE_LPM_TRIE, BPF_MAP_TYPE_ARRAY_OF_MAPS, + BPF_MAP_TYPE_HASH_OF_MAPS, }; enum bpf_prog_type { diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 000153acb6d5..343fb5394c95 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -16,6 +16,7 @@ #include #include "percpu_freelist.h" #include "bpf_lru_list.h" +#include "map_in_map.h" struct bucket { struct hlist_nulls_head head; @@ -88,6 +89,11 @@ static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size return *(void __percpu **)(l->key + key_size); } +static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) +{ + return *(void **)(l->key + roundup(map->key_size, 8)); +} + static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) { return (struct htab_elem *) (htab->elems + i * htab->elem_size); @@ -603,6 +609,14 @@ static void htab_elem_free_rcu(struct rcu_head *head) static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { + struct bpf_map *map = &htab->map; + + if (map->ops->map_fd_put_ptr) { + void *ptr = fd_htab_map_get_ptr(map, l); + + map->ops->map_fd_put_ptr(ptr); + } + if (l->state == HTAB_EXTRA_ELEM_USED) { l->state = HTAB_EXTRA_ELEM_FREE; return; @@ -1057,6 +1071,7 @@ static void delete_all_elements(struct bpf_htab *htab) } } } + /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { @@ -1213,12 +1228,118 @@ static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = { .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, }; +static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map; + + if (attr->value_size != sizeof(u32)) + return ERR_PTR(-EINVAL); + + /* pointer is stored internally */ + attr->value_size = sizeof(void *); + map = htab_map_alloc(attr); + attr->value_size = sizeof(u32); + + return map; +} + +static void fd_htab_map_free(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_nulls_node *n; + struct hlist_nulls_head *head; + struct htab_elem *l; + int i; + + for (i = 0; i < htab->n_buckets; i++) { + head = select_bucket(htab, i); + + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { + void *ptr = fd_htab_map_get_ptr(map, l); + + map->ops->map_fd_put_ptr(ptr); + } + } + + htab_map_free(map); +} + 
+/* only called from syscall */ +int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, + void *key, void *value, u64 map_flags) +{ + void *ptr; + int ret; + u32 ufd = *(u32 *)value; + + ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + ret = htab_map_update_elem(map, key, &ptr, map_flags); + if (ret) + map->ops->map_fd_put_ptr(ptr); + + return ret; +} + +static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map, *inner_map_meta; + + inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); + if (IS_ERR(inner_map_meta)) + return inner_map_meta; + + map = fd_htab_map_alloc(attr); + if (IS_ERR(map)) { + bpf_map_meta_free(inner_map_meta); + return map; + } + + map->inner_map_meta = inner_map_meta; + + return map; +} + +static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_map **inner_map = htab_map_lookup_elem(map, key); + + if (!inner_map) + return NULL; + + return READ_ONCE(*inner_map); +} + +static void htab_of_map_free(struct bpf_map *map) +{ + bpf_map_meta_free(map->inner_map_meta); + fd_htab_map_free(map); +} + +static const struct bpf_map_ops htab_of_map_ops = { + .map_alloc = htab_of_map_alloc, + .map_free = htab_of_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_of_map_lookup_elem, + .map_delete_elem = htab_map_delete_elem, + .map_fd_get_ptr = bpf_map_fd_get_ptr, + .map_fd_put_ptr = bpf_map_fd_put_ptr, +}; + +static struct bpf_map_type_list htab_of_map_type __ro_after_init = { + .ops = &htab_of_map_ops, + .type = BPF_MAP_TYPE_HASH_OF_MAPS, +}; + static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); bpf_register_map_type(&htab_percpu_type); bpf_register_map_type(&htab_lru_type); bpf_register_map_type(&htab_lru_percpu_type); + bpf_register_map_type(&htab_of_map_type); return 0; } late_initcall(register_htab_map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6e24fdf1f373..c35ebfe6d84d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -352,7 +352,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_array_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { + } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { err = -ENOTSUPP; } else { rcu_read_lock(); @@ -446,6 +447,11 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + rcu_read_lock(); + err = bpf_fd_htab_map_update_elem(map, f.file, key, value, + attr->flags); + rcu_read_unlock(); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3b8f528c5473..09923cc5c7c7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1200,6 +1200,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) goto error; break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) goto error; default: @@ -3044,7 +3045,8 @@ process_bpf_exit: static int check_map_prealloc(struct bpf_map *map) { return (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_PERCPU_HASH) || + map->map_type != 
BPF_MAP_TYPE_PERCPU_HASH && + map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) || !(map->map_flags & BPF_F_NO_PREALLOC); } -- cgit v1.2.3 From b4d8c7aea15efa8c6272c58d78296f8b017c4c6a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 23 Mar 2017 00:06:17 +0100 Subject: iommu/iova: Fix compile error with CONFIG_IOMMU_IOVA=m The #ifdef in iova.h only catches the CONFIG_IOMMU_IOVA=y case, so that compilation as a module fails with duplicate function definition errors. Fix it by catching both cases in the #if. Signed-off-by: Joerg Roedel --- include/linux/iova.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iova.h b/include/linux/iova.h index 548982ad5f2f..e0a892ae45c0 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -82,7 +82,7 @@ static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova) return iova >> iova_shift(iovad); } -#ifdef CONFIG_IOMMU_IOVA +#if IS_ENABLED(CONFIG_IOMMU_IOVA) int iova_cache_get(void); void iova_cache_put(void); -- cgit v1.2.3 From b7eaf1aab9f8bd2e49fceed77ebc66c1b5800718 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 22 Mar 2017 00:08:50 +0100 Subject: cpufreq: schedutil: Avoid reducing frequency of busy CPUs prematurely The way the schedutil governor uses the PELT metric causes it to underestimate the CPU utilization in some cases. That can be easily demonstrated by running kernel compilation on a Sandy Bridge Intel processor, running turbostat in parallel with it and looking at the values written to the MSR_IA32_PERF_CTL register. Namely, the expected result would be that when all CPUs were 100% busy, all of them would be requested to run in the maximum P-state, but observation shows that this clearly isn't the case. The CPUs run in the maximum P-state for a while and then are requested to run slower and go back to the maximum P-state after a while again. That causes the actual frequency of the processor to visibly oscillate below the sustainable maximum in a jittery fashion which clearly is not desirable. That has been attributed to CPU utilization metric updates on task migration that cause the total utilization value for the CPU to be reduced by the utilization of the migrated task. If that happens, the schedutil governor may see a CPU utilization reduction and will attempt to reduce the CPU frequency accordingly right away. That may be premature, though, for example if the system is generally busy and there are other runnable tasks waiting to be run on that CPU already. This is unlikely to be an issue on systems where cpufreq policies are shared between multiple CPUs, because in those cases the policy utilization is computed as the maximum of the CPU utilization values over the whole policy and if that turns out to be low, reducing the frequency for the policy most likely is a good idea anyway. On systems with one CPU per policy, however, it may affect performance adversely and even lead to increased energy consumption in some cases. On those systems it may be addressed by taking another utilization metric into consideration, like whether or not the CPU whose frequency is about to be reduced has been idle recently, because if that's not the case, the CPU is likely to be busy in the near future and its frequency should not be reduced. To that end, use the counter of idle calls in the timekeeping code. Namely, make the schedutil governor look at that counter for the current CPU every time before its frequency is about to be reduced. 
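In outline, the check amounts to the following (a simplified sketch of the logic the patch adds below, not the literal diff):

	/* one saved counter per CPU, kept in the governor's per-CPU state */
	unsigned long idle_calls = tick_nohz_get_idle_calls();
	bool busy = idle_calls == sg_cpu->saved_idle_calls;

	sg_cpu->saved_idle_calls = idle_calls;

	/* an unchanged counter means the CPU never went idle, so do not
	 * let the requested frequency drop below the previous request
	 */
	if (busy && next_f < sg_policy->next_freq)
		next_f = sg_policy->next_freq;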
If the counter has not changed since the previous iteration of the governor computations for that CPU, the CPU has been busy for all that time and its frequency should not be decreased, so if the new frequency would be lower than the one set previously, the governor will skip the frequency update. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Reviewed-by: Joel Fernandes --- include/linux/tick.h | 1 + kernel/sched/cpufreq_schedutil.c | 27 +++++++++++++++++++++++++++ kernel/time/tick-sched.c | 12 ++++++++++++ 3 files changed, 40 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index a04fea19676f..fe01e68bf520 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -117,6 +117,7 @@ extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); +extern unsigned long tick_nohz_get_idle_calls(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); #else /* !CONFIG_NO_HZ_COMMON */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index f5ffe241812e..c1ffb5dc8af6 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -61,6 +61,11 @@ struct sugov_cpu { unsigned long util; unsigned long max; unsigned int flags; + + /* The field below is for single-CPU policies only. */ +#ifdef CONFIG_NO_HZ_COMMON + unsigned long saved_idle_calls; +#endif }; static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); @@ -192,6 +197,19 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, sg_cpu->iowait_boost >>= 1; } +#ifdef CONFIG_NO_HZ_COMMON +static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) +{ + unsigned long idle_calls = tick_nohz_get_idle_calls(); + bool ret = idle_calls == sg_cpu->saved_idle_calls; + + sg_cpu->saved_idle_calls = idle_calls; + return ret; +} +#else +static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } +#endif /* CONFIG_NO_HZ_COMMON */ + static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned int flags) { @@ -200,6 +218,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, struct cpufreq_policy *policy = sg_policy->policy; unsigned long util, max; unsigned int next_f; + bool busy; sugov_set_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -207,12 +226,20 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (!sugov_should_update_freq(sg_policy, time)) return; + busy = sugov_cpu_is_busy(sg_cpu); + if (flags & SCHED_CPUFREQ_RT_DL) { next_f = policy->cpuinfo.max_freq; } else { sugov_get_util(&util, &max); sugov_iowait_boost(sg_cpu, &util, &max); next_f = get_next_freq(sg_policy, util, max); + /* + * Do not reduce the frequency if the CPU has not been idle + * recently, as the reduction is likely to be premature then. 
+ */ + if (busy && next_f < sg_policy->next_freq) + next_f = sg_policy->next_freq; } sugov_update_commit(sg_policy, time, next_f); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7fe53be86077..64c97fc130c4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -993,6 +993,18 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } +/** + * tick_nohz_get_idle_calls - return the current idle calls counter value + * + * Called from the schedutil frequency scaling governor in scheduler context. + */ +unsigned long tick_nohz_get_idle_calls(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + return ts->idle_calls; +} + static void tick_nohz_account_idle_ticks(struct tick_sched *ts) { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -- cgit v1.2.3 From b7d680d7bf584bce6023343304b819009a7c3336 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:36:54 +0100 Subject: bdi: Mark congested->bdi as internal The congested->bdi pointer is used only to be able to remove the congested structure from bdi->cgwb_congested_tree on structure release. Moreover, the pointer can become NULL when we unregister the bdi. Rename the field to __bdi and add a comment to make it more explicit that this is internal stuff of the memcg writeback code and that people should not use the field, as such use will likely be race prone. We do not bother with converting congested->bdi to a proper refcounted reference. It would be slightly ugly to special-case bdi->wb.congested to avoid what is effectively a cyclic reference of bdi to itself, and the reference gets cleared from bdi_unregister(), making it impossible to reference a freed bdi. Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 4 +++- mm/backing-dev.c | 10 +++++----- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index ad955817916d..8fb3dcdebc80 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -54,7 +54,9 @@ struct bdi_writeback_congested { atomic_t refcnt; /* nr of attached wb's and blkg */ #ifdef CONFIG_CGROUP_WRITEBACK - struct backing_dev_info *bdi; /* the associated bdi */ + struct backing_dev_info *__bdi; /* the associated bdi, set to NULL + * on bdi unregistration. For memcg-wb + * internal use only!
*/ int blkcg_id; /* ID of the associated blkcg */ struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */ #endif diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c6f2a37028c2..12408f86783c 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -438,7 +438,7 @@ retry: return NULL; atomic_set(&new_congested->refcnt, 0); - new_congested->bdi = bdi; + new_congested->__bdi = bdi; new_congested->blkcg_id = blkcg_id; goto retry; @@ -466,10 +466,10 @@ void wb_congested_put(struct bdi_writeback_congested *congested) } /* bdi might already have been destroyed leaving @congested unlinked */ - if (congested->bdi) { + if (congested->__bdi) { rb_erase(&congested->rb_node, - &congested->bdi->cgwb_congested_tree); - congested->bdi = NULL; + &congested->__bdi->cgwb_congested_tree); + congested->__bdi = NULL; } spin_unlock_irqrestore(&cgwb_lock, flags); @@ -752,7 +752,7 @@ static void cgwb_bdi_exit(struct backing_dev_info *bdi) rb_entry(rbn, struct bdi_writeback_congested, rb_node); rb_erase(rbn, &bdi->cgwb_congested_tree); - congested->bdi = NULL; /* mark @congested unlinked */ + congested->__bdi = NULL; /* mark @congested unlinked */ } spin_unlock_irq(&cgwb_lock); } -- cgit v1.2.3 From 5318ce7d46866e1dbc20ab9349b93753edba0b3e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:36:57 +0100 Subject: bdi: Shutdown writeback on all cgwbs in cgwb_bdi_destroy() Currently we wait for all cgwbs to get freed in cgwb_bdi_destroy(), which also means that writeback has been shut down on them. Since this wait is going away, directly shut down writeback on cgwbs from cgwb_bdi_destroy() to avoid live writeback structures after bdi_unregister() has finished. To make that safe with concurrent shutdown from cgwb_release_workfn(), we also have to make sure wb_shutdown() returns only after the bdi_writeback structure is really shut down. Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 1 + mm/backing-dev.c | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 8fb3dcdebc80..8af720f22a2d 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -21,6 +21,7 @@ struct dentry; */ enum wb_state { WB_registered, /* bdi_register() was done */ + WB_shutting_down, /* wb_shutdown() in progress */ WB_writeback_running, /* Writeback is in progress */ WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ }; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e3d56dba4da8..b67be4fc12c4 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -356,8 +356,15 @@ static void wb_shutdown(struct bdi_writeback *wb) spin_lock_bh(&wb->work_lock); if (!test_and_clear_bit(WB_registered, &wb->state)) { spin_unlock_bh(&wb->work_lock); + /* + * Wait for wb shutdown to finish if someone else is just + * running wb_shutdown(). Otherwise we could proceed to wb / + * bdi destruction before wb_shutdown() is finished. + */ + wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE); return; } + set_bit(WB_shutting_down, &wb->state); spin_unlock_bh(&wb->work_lock); cgwb_remove_from_bdi_list(wb); @@ -369,6 +376,12 @@ static void wb_shutdown(struct bdi_writeback *wb) mod_delayed_work(bdi_wq, &wb->dwork, 0); flush_delayed_work(&wb->dwork); WARN_ON(!list_empty(&wb->work_list)); + /* + * Make sure bit gets cleared after shutdown is finished. Matches with + * the barrier provided by test_and_clear_bit() above.
+ */ + smp_wmb(); + clear_bit(WB_shutting_down, &wb->state); } static void wb_exit(struct bdi_writeback *wb) @@ -699,12 +712,21 @@ static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { struct radix_tree_iter iter; void **slot; + struct bdi_writeback *wb; WARN_ON(test_bit(WB_registered, &bdi->wb.state)); spin_lock_irq(&cgwb_lock); radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) cgwb_kill(*slot); + + while (!list_empty(&bdi->wb_list)) { + wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, + bdi_node); + spin_unlock_irq(&cgwb_lock); + wb_shutdown(wb); + spin_lock_irq(&cgwb_lock); + } spin_unlock_irq(&cgwb_lock); /* -- cgit v1.2.3 From 4514451e79ae5baabb85d22ba3523602e59d5218 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:36:58 +0100 Subject: bdi: Do not wait for cgwbs release in bdi_unregister() Currently we wait for all cgwbs to get released in cgwb_bdi_destroy() (called from bdi_unregister()). That is, however, unnecessary now that cgwb->bdi is a proper refcounted reference (thus the bdi cannot get released before all cgwbs are released) and now that cgwb_bdi_destroy() shuts down writeback directly. Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 1 - mm/backing-dev.c | 22 +--------------------- 2 files changed, 1 insertion(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 8af720f22a2d..e66d4722db8e 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -164,7 +164,6 @@ struct backing_dev_info { #ifdef CONFIG_CGROUP_WRITEBACK struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ struct rb_root cgwb_congested_tree; /* their congested states */ - atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */ #else struct bdi_writeback_congested *wb_congested; #endif diff --git a/mm/backing-dev.c b/mm/backing-dev.c index b67be4fc12c4..8c30b1a7aae5 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -406,11 +406,9 @@ static void wb_exit(struct bdi_writeback *wb) /* * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree, * blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU - protected. cgwb_release_wait is used to wait for the completion of cgwb - releases from bdi destruction path. + protected.
*/ static DEFINE_SPINLOCK(cgwb_lock); -static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait); /** * wb_congested_get_create - get or create a wb_congested @@ -505,7 +503,6 @@ static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); - struct backing_dev_info *bdi = wb->bdi; wb_shutdown(wb); @@ -516,9 +513,6 @@ static void cgwb_release_workfn(struct work_struct *work) percpu_ref_exit(&wb->refcnt); wb_exit(wb); kfree_rcu(wb, rcu); - - if (atomic_dec_and_test(&bdi->usage_cnt)) - wake_up_all(&cgwb_release_wait); } static void cgwb_release(struct percpu_ref *refcnt) @@ -608,7 +602,6 @@ static int cgwb_create(struct backing_dev_info *bdi, /* we might have raced another instance of this function */ ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); if (!ret) { - atomic_inc(&bdi->usage_cnt); list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); @@ -698,7 +691,6 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); bdi->cgwb_congested_tree = RB_ROOT; - atomic_set(&bdi->usage_cnt, 1); ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (!ret) { @@ -728,18 +720,6 @@ static void cgwb_bdi_destroy(struct backing_dev_info *bdi) spin_lock_irq(&cgwb_lock); } spin_unlock_irq(&cgwb_lock); - - /* - * All cgwb's must be shutdown and released before returning. Drain - * the usage counter to wait for all cgwb's ever created on @bdi. - */ - atomic_dec(&bdi->usage_cnt); - wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt)); - /* - * Grab back our reference so that we hold it when @bdi gets - * re-registered. - */ - atomic_inc(&bdi->usage_cnt); } /** -- cgit v1.2.3 From f759741d9d913eb57784a94b9bca78b376fc26a9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:37:00 +0100 Subject: block: Fix oops in locked_inode_to_wb_and_lock_list() When a block device is closed, we call inode_detach_wb() in __blkdev_put() which sets inode->i_wb to NULL. That is contrary to expectations that inode->i_wb stays valid once set during the whole inode's lifetime and leads to an oops in wb_get() in locked_inode_to_wb_and_lock_list() because inode_to_wb() returned NULL. The reason why we called inode_detach_wb() is not valid anymore though. The BDI is guaranteed to stay around until we call bdi_put() from bdev_evict_inode() so we can postpone calling inode_detach_wb() to that moment. Also add a warning to catch if someone uses inode_detach_wb() in a dangerous way.
Reported-by: Thiago Jung Bauermann Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- fs/block_dev.c | 8 ++------ include/linux/writeback.h | 1 + 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index 53e2389ae4d4..f2d59f143ef4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -885,6 +885,8 @@ static void bdev_evict_inode(struct inode *inode) spin_lock(&bdev_lock); list_del_init(&bdev->bd_list); spin_unlock(&bdev_lock); + /* Detach inode from wb early as bdi_put() may free bdi->wb */ + inode_detach_wb(inode); if (bdev->bd_bdi != &noop_backing_dev_info) { bdi_put(bdev->bd_bdi); bdev->bd_bdi = &noop_backing_dev_info; @@ -1875,12 +1877,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) kill_bdev(bdev); bdev_write_inode(bdev); - /* - * Detaching bdev inode from its wb in __destroy_inode() - * is too late: the queue which embeds its bdi (along with - * root wb) can be gone as soon as we put_disk() below. - */ - inode_detach_wb(bdev->bd_inode); } if (bdev->bd_contains == bdev) { if (disk->fops->release) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a3c0cbd7c888..d5815794416c 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -237,6 +237,7 @@ static inline void inode_attach_wb(struct inode *inode, struct page *page) static inline void inode_detach_wb(struct inode *inode) { if (inode->i_wb) { + WARN_ON_ONCE(!(inode->i_state & I_CLEAR)); wb_put(inode->i_wb); inode->i_wb = NULL; } -- cgit v1.2.3 From c70c176ff8c3ff0ac6ef9a831cd591ea9a66bd1a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:37:01 +0100 Subject: kobject: Export kobject_get_unless_zero() Make the function available for outside use and fortify it against NULL kobject. CC: Greg Kroah-Hartman Reviewed-by: Bart Van Assche Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/kobject.h | 2 ++ lib/kobject.c | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kobject.h b/include/linux/kobject.h index e6284591599e..ca85cb80e99a 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -108,6 +108,8 @@ extern int __must_check kobject_rename(struct kobject *, const char *new_name); extern int __must_check kobject_move(struct kobject *, struct kobject *); extern struct kobject *kobject_get(struct kobject *kobj); +extern struct kobject * __must_check kobject_get_unless_zero( + struct kobject *kobj); extern void kobject_put(struct kobject *kobj); extern const void *kobject_namespace(struct kobject *kobj); diff --git a/lib/kobject.c b/lib/kobject.c index 445dcaeb0f56..763d70a18941 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -601,12 +601,15 @@ struct kobject *kobject_get(struct kobject *kobj) } EXPORT_SYMBOL(kobject_get); -static struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj) +struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj) { + if (!kobj) + return NULL; if (!kref_get_unless_zero(&kobj->kref)) kobj = NULL; return kobj; } +EXPORT_SYMBOL(kobject_get_unless_zero); /* * kobject_cleanup - free kobject resources. -- cgit v1.2.3 From 7642747d674aff1f7cfe74ad9af7e9b12ab1d5ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 22 Mar 2017 15:01:49 -0400 Subject: blk-mq: remove BLK_MQ_F_DEFER_ISSUE This flag was never used since it was introduced. 
Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 +------- include/linux/blk-mq.h | 1 - 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index 5ff66f203cd0..998c041358f2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1531,13 +1531,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) } plug = current->plug; - /* - * If the driver supports defer issued based on 'last', then - * queue it up like normal since we can potentially save some - * CPU this way. - */ - if (((plug && !blk_queue_nomerges(q)) || is_sync) && - !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { + if (((plug && !blk_queue_nomerges(q)) || is_sync)) { struct request *old_rq = NULL; blk_mq_bio_to_request(rq, bio); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b296a9006117..5b3e201c8d4f 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -152,7 +152,6 @@ enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_SHARED = 1 << 1, BLK_MQ_F_SG_MERGE = 1 << 2, - BLK_MQ_F_DEFER_ISSUE = 1 << 4, BLK_MQ_F_BLOCKING = 1 << 5, BLK_MQ_F_NO_SCHED = 1 << 6, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, -- cgit v1.2.3 From 210f7cdcf088c304ee0533ffd33d6f71a8821862 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 15 Mar 2017 14:05:14 +1100 Subject: percpu-refcount: support synchronous switch to atomic mode. percpu_ref_switch_to_atomic_sync() schedules the switch to atomic mode, then waits for it to complete. Also export percpu_ref_switch_to_* so they can be used from modules. This will be used in md/raid to count the number of pending write requests to an array. We occasionally need to check if the count is zero, but most often we don't care. We always want updates to the counter to be fast, as in some cases we count every 4K page. Signed-off-by: NeilBrown Acked-by: Tejun Heo Signed-off-by: Shaohua Li --- include/linux/percpu-refcount.h | 1 + lib/percpu-refcount.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 3a481a49546e..c13dceb87b60 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -99,6 +99,7 @@ int __must_check percpu_ref_init(struct percpu_ref *ref, void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch); +void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref); void percpu_ref_switch_to_percpu(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 9ac959ef4cae..fe03c6d52761 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -260,6 +260,22 @@ void percpu_ref_switch_to_atomic(struct percpu_ref *ref, spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } +EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic); + +/** + * percpu_ref_switch_to_atomic_sync - switch a percpu_ref to atomic mode + * @ref: percpu_ref to switch to atomic mode + * + * Schedule switching the ref to atomic mode, and wait for the + * switch to complete. Caller must ensure that no other thread + * will switch back to percpu mode. 
+ */ +void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref) +{ + percpu_ref_switch_to_atomic(ref, NULL); + wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); +} +EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync); /** * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode @@ -290,6 +306,7 @@ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } +EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu); /** * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation -- cgit v1.2.3 From a8c06e407ef969461b7f51ec72839fe382dd3c29 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 13 Mar 2017 10:18:41 +0800 Subject: usb: separate out sysdev pointer from usb_bus For the xhci-hcd platform device, not all of the DMA parameters are configured properly, notably the dma ops for dwc3 devices. The idea here is that you pass in the parent of_node along with the child device pointer, so it would behave exactly like the parent already does. The difference is that it also handles all the other attributes besides the mask. sysdev will represent the physical device, as seen from the firmware or bus. Splitting the usb_bus->controller field into the Linux-internal device (used for the sysfs hierarchy, for printks and for power management) and a new pointer (used for DMA, DT enumeration and phy lookup) probably covers all that we really need. Signed-off-by: Arnd Bergmann Signed-off-by: Sriram Dash Tested-by: Baolin Wang Tested-by: Brian Norris Tested-by: Alexander Sverdlin Tested-by: Vivek Gautam Signed-off-by: Mathias Nyman Signed-off-by: Peter Chen Cc: Felipe Balbi Cc: Grygorii Strashko Cc: Sinjan Kumar Cc: David Fisher Cc: Catalin Marinas Cc: "Thang Q. Nguyen" Cc: Yoshihiro Shimoda Cc: Stephen Boyd Cc: Bjorn Andersson Cc: Ming Lei Cc: Jon Masters Cc: Dann Frazier Cc: Peter Chen Cc: Leo Li Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/buffer.c | 12 +++---- drivers/usb/core/hcd.c | 80 ++++++++++++++++++++++++++--------------------- drivers/usb/core/usb.c | 18 +++++------ include/linux/usb.h | 1 + include/linux/usb/hcd.h | 3 ++ 5 files changed, 64 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/buffer.c b/drivers/usb/core/buffer.c index b9bf6e2eb6fe..b64568cf572c 100644 --- a/drivers/usb/core/buffer.c +++ b/drivers/usb/core/buffer.c @@ -66,7 +66,7 @@ int hcd_buffer_create(struct usb_hcd *hcd) int i, size; if (!IS_ENABLED(CONFIG_HAS_DMA) || - (!hcd->self.controller->dma_mask && + (!is_device_dma_capable(hcd->self.sysdev) && !(hcd->driver->flags & HCD_LOCAL_MEM))) return 0; @@ -75,7 +75,7 @@ int hcd_buffer_create(struct usb_hcd *hcd) if (!size) continue; snprintf(name, sizeof(name), "buffer-%d", size); - hcd->pool[i] = dma_pool_create(name, hcd->self.controller, + hcd->pool[i] = dma_pool_create(name, hcd->self.sysdev, size, size, 0); if (!hcd->pool[i]) { hcd_buffer_destroy(hcd); @@ -130,7 +130,7 @@ void *hcd_buffer_alloc( /* some USB hosts just use PIO */ if (!IS_ENABLED(CONFIG_HAS_DMA) || - (!bus->controller->dma_mask && + (!is_device_dma_capable(bus->sysdev) && !(hcd->driver->flags & HCD_LOCAL_MEM))) { *dma = ~(dma_addr_t) 0; return kmalloc(size, mem_flags); @@ -140,7 +140,7 @@ void *hcd_buffer_alloc( if (size <= pool_max[i]) return dma_pool_alloc(hcd->pool[i], mem_flags, dma); } - return dma_alloc_coherent(hcd->self.controller, size, dma, mem_flags); + return dma_alloc_coherent(hcd->self.sysdev, size, dma, mem_flags); } void hcd_buffer_free( @@ -157,7 +157,7 @@ void
hcd_buffer_free( return; if (!IS_ENABLED(CONFIG_HAS_DMA) || - (!bus->controller->dma_mask && + (!is_device_dma_capable(bus->sysdev) && !(hcd->driver->flags & HCD_LOCAL_MEM))) { kfree(addr); return; @@ -169,5 +169,5 @@ void hcd_buffer_free( return; } } - dma_free_coherent(hcd->self.controller, size, addr, dma); + dma_free_coherent(hcd->self.sysdev, size, addr, dma); } diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 612fab6e54fb..2342c1ffd900 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1073,6 +1073,7 @@ static void usb_deregister_bus (struct usb_bus *bus) static int register_root_hub(struct usb_hcd *hcd) { struct device *parent_dev = hcd->self.controller; + struct device *sysdev = hcd->self.sysdev; struct usb_device *usb_dev = hcd->self.root_hub; const int devnum = 1; int retval; @@ -1119,7 +1120,7 @@ static int register_root_hub(struct usb_hcd *hcd) /* Did the HC die before the root hub was registered? */ if (HCD_DEAD(hcd)) usb_hc_died (hcd); /* This time clean up */ - usb_dev->dev.of_node = parent_dev->of_node; + usb_dev->dev.of_node = sysdev->of_node; } mutex_unlock(&usb_bus_idr_lock); @@ -1465,19 +1466,19 @@ void usb_hcd_unmap_urb_for_dma(struct usb_hcd *hcd, struct urb *urb) dir = usb_urb_dir_in(urb) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; if (IS_ENABLED(CONFIG_HAS_DMA) && (urb->transfer_flags & URB_DMA_MAP_SG)) - dma_unmap_sg(hcd->self.controller, + dma_unmap_sg(hcd->self.sysdev, urb->sg, urb->num_sgs, dir); else if (IS_ENABLED(CONFIG_HAS_DMA) && (urb->transfer_flags & URB_DMA_MAP_PAGE)) - dma_unmap_page(hcd->self.controller, + dma_unmap_page(hcd->self.sysdev, urb->transfer_dma, urb->transfer_buffer_length, dir); else if (IS_ENABLED(CONFIG_HAS_DMA) && (urb->transfer_flags & URB_DMA_MAP_SINGLE)) - dma_unmap_single(hcd->self.controller, + dma_unmap_single(hcd->self.sysdev, urb->transfer_dma, urb->transfer_buffer_length, dir); @@ -1520,11 +1521,11 @@ int usb_hcd_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb, return ret; if (IS_ENABLED(CONFIG_HAS_DMA) && hcd->self.uses_dma) { urb->setup_dma = dma_map_single( - hcd->self.controller, + hcd->self.sysdev, urb->setup_packet, sizeof(struct usb_ctrlrequest), DMA_TO_DEVICE); - if (dma_mapping_error(hcd->self.controller, + if (dma_mapping_error(hcd->self.sysdev, urb->setup_dma)) return -EAGAIN; urb->transfer_flags |= URB_SETUP_MAP_SINGLE; @@ -1555,7 +1556,7 @@ int usb_hcd_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb, } n = dma_map_sg( - hcd->self.controller, + hcd->self.sysdev, urb->sg, urb->num_sgs, dir); @@ -1570,12 +1571,12 @@ int usb_hcd_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb, } else if (urb->sg) { struct scatterlist *sg = urb->sg; urb->transfer_dma = dma_map_page( - hcd->self.controller, + hcd->self.sysdev, sg_page(sg), sg->offset, urb->transfer_buffer_length, dir); - if (dma_mapping_error(hcd->self.controller, + if (dma_mapping_error(hcd->self.sysdev, urb->transfer_dma)) ret = -EAGAIN; else @@ -1585,11 +1586,11 @@ int usb_hcd_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb, ret = -EAGAIN; } else { urb->transfer_dma = dma_map_single( - hcd->self.controller, + hcd->self.sysdev, urb->transfer_buffer, urb->transfer_buffer_length, dir); - if (dma_mapping_error(hcd->self.controller, + if (dma_mapping_error(hcd->self.sysdev, urb->transfer_dma)) ret = -EAGAIN; else @@ -2495,24 +2496,8 @@ static void init_giveback_urb_bh(struct giveback_urb_bh *bh) tasklet_init(&bh->bh, usb_giveback_urb_bh, (unsigned long)bh); } -/** - * usb_create_shared_hcd - create and initialize an HCD 
structure - * @driver: HC driver that will use this hcd - * @dev: device for this HC, stored in hcd->self.controller - * @bus_name: value to store in hcd->self.bus_name - * @primary_hcd: a pointer to the usb_hcd structure that is sharing the - * PCI device. Only allocate certain resources for the primary HCD - * Context: !in_interrupt() - * - * Allocate a struct usb_hcd, with extra space at the end for the - * HC driver's private data. Initialize the generic members of the - * hcd structure. - * - * Return: On success, a pointer to the created and initialized HCD structure. - * On failure (e.g. if memory is unavailable), %NULL. - */ -struct usb_hcd *usb_create_shared_hcd(const struct hc_driver *driver, - struct device *dev, const char *bus_name, +struct usb_hcd *__usb_create_hcd(const struct hc_driver *driver, + struct device *sysdev, struct device *dev, const char *bus_name, struct usb_hcd *primary_hcd) { struct usb_hcd *hcd; @@ -2553,8 +2538,9 @@ struct usb_hcd *usb_create_shared_hcd(const struct hc_driver *driver, usb_bus_init(&hcd->self); hcd->self.controller = dev; + hcd->self.sysdev = sysdev; hcd->self.bus_name = bus_name; - hcd->self.uses_dma = (dev->dma_mask != NULL); + hcd->self.uses_dma = (sysdev->dma_mask != NULL); init_timer(&hcd->rh_timer); hcd->rh_timer.function = rh_timer_func; @@ -2569,6 +2555,30 @@ struct usb_hcd *usb_create_shared_hcd(const struct hc_driver *driver, "USB Host Controller"; return hcd; } +EXPORT_SYMBOL_GPL(__usb_create_hcd); + +/** + * usb_create_shared_hcd - create and initialize an HCD structure + * @driver: HC driver that will use this hcd + * @dev: device for this HC, stored in hcd->self.controller + * @bus_name: value to store in hcd->self.bus_name + * @primary_hcd: a pointer to the usb_hcd structure that is sharing the + * PCI device. Only allocate certain resources for the primary HCD + * Context: !in_interrupt() + * + * Allocate a struct usb_hcd, with extra space at the end for the + * HC driver's private data. Initialize the generic members of the + * hcd structure. + * + * Return: On success, a pointer to the created and initialized HCD structure. + * On failure (e.g. if memory is unavailable), %NULL. 
+ */ +struct usb_hcd *usb_create_shared_hcd(const struct hc_driver *driver, + struct device *dev, const char *bus_name, + struct usb_hcd *primary_hcd) +{ + return __usb_create_hcd(driver, dev, dev, bus_name, primary_hcd); +} EXPORT_SYMBOL_GPL(usb_create_shared_hcd); /** @@ -2588,7 +2598,7 @@ EXPORT_SYMBOL_GPL(usb_create_shared_hcd); struct usb_hcd *usb_create_hcd(const struct hc_driver *driver, struct device *dev, const char *bus_name) { - return usb_create_shared_hcd(driver, dev, bus_name, NULL); + return __usb_create_hcd(driver, dev, dev, bus_name, NULL); } EXPORT_SYMBOL_GPL(usb_create_hcd); @@ -2715,7 +2725,7 @@ int usb_add_hcd(struct usb_hcd *hcd, struct usb_device *rhdev; if (IS_ENABLED(CONFIG_USB_PHY) && !hcd->usb_phy) { - struct usb_phy *phy = usb_get_phy_dev(hcd->self.controller, 0); + struct usb_phy *phy = usb_get_phy_dev(hcd->self.sysdev, 0); if (IS_ERR(phy)) { retval = PTR_ERR(phy); @@ -2733,7 +2743,7 @@ int usb_add_hcd(struct usb_hcd *hcd, } if (IS_ENABLED(CONFIG_GENERIC_PHY) && !hcd->phy) { - struct phy *phy = phy_get(hcd->self.controller, "usb"); + struct phy *phy = phy_get(hcd->self.sysdev, "usb"); if (IS_ERR(phy)) { retval = PTR_ERR(phy); @@ -2781,7 +2791,7 @@ int usb_add_hcd(struct usb_hcd *hcd, */ retval = hcd_buffer_create(hcd); if (retval != 0) { - dev_dbg(hcd->self.controller, "pool alloc failed\n"); + dev_dbg(hcd->self.sysdev, "pool alloc failed\n"); goto err_create_buf; } @@ -2791,7 +2801,7 @@ int usb_add_hcd(struct usb_hcd *hcd, rhdev = usb_alloc_dev(NULL, &hcd->self, 0); if (rhdev == NULL) { - dev_err(hcd->self.controller, "unable to allocate root hub\n"); + dev_err(hcd->self.sysdev, "unable to allocate root hub\n"); retval = -ENOMEM; goto err_allocate_root_hub; } diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index a2ccc69fb45c..4cd6e0e4b66d 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -453,9 +453,9 @@ struct usb_device *usb_alloc_dev(struct usb_device *parent, * Note: calling dma_set_mask() on a USB device would set the * mask for the entire HCD, so don't do that. 
*/ - dev->dev.dma_mask = bus->controller->dma_mask; - dev->dev.dma_pfn_offset = bus->controller->dma_pfn_offset; - set_dev_node(&dev->dev, dev_to_node(bus->controller)); + dev->dev.dma_mask = bus->sysdev->dma_mask; + dev->dev.dma_pfn_offset = bus->sysdev->dma_pfn_offset; + set_dev_node(&dev->dev, dev_to_node(bus->sysdev)); dev->state = USB_STATE_ATTACHED; dev->lpm_disable_count = 1; atomic_set(&dev->urbnum, 0); @@ -803,7 +803,7 @@ struct urb *usb_buffer_map(struct urb *urb) if (!urb || !urb->dev || !(bus = urb->dev->bus) - || !(controller = bus->controller)) + || !(controller = bus->sysdev)) return NULL; if (controller->dma_mask) { @@ -841,7 +841,7 @@ void usb_buffer_dmasync(struct urb *urb) || !(urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP) || !urb->dev || !(bus = urb->dev->bus) - || !(controller = bus->controller)) + || !(controller = bus->sysdev)) return; if (controller->dma_mask) { @@ -875,7 +875,7 @@ void usb_buffer_unmap(struct urb *urb) || !(urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP) || !urb->dev || !(bus = urb->dev->bus) - || !(controller = bus->controller)) + || !(controller = bus->sysdev)) return; if (controller->dma_mask) { @@ -925,7 +925,7 @@ int usb_buffer_map_sg(const struct usb_device *dev, int is_in, if (!dev || !(bus = dev->bus) - || !(controller = bus->controller) + || !(controller = bus->sysdev) || !controller->dma_mask) return -EINVAL; @@ -961,7 +961,7 @@ void usb_buffer_dmasync_sg(const struct usb_device *dev, int is_in, if (!dev || !(bus = dev->bus) - || !(controller = bus->controller) + || !(controller = bus->sysdev) || !controller->dma_mask) return; @@ -989,7 +989,7 @@ void usb_buffer_unmap_sg(const struct usb_device *dev, int is_in, if (!dev || !(bus = dev->bus) - || !(controller = bus->controller) + || !(controller = bus->sysdev) || !controller->dma_mask) return; diff --git a/include/linux/usb.h b/include/linux/usb.h index 7e68259360de..148752640693 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -354,6 +354,7 @@ struct usb_devmap { */ struct usb_bus { struct device *controller; /* host/master side hardware */ + struct device *sysdev; /* as seen from firmware or bus */ int busnum; /* Bus number (in order of reg) */ const char *bus_name; /* stable id (PCI slot_name etc) */ u8 uses_dma; /* Does the host controller use DMA? */ diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index dff130151235..a469999a106d 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -437,6 +437,9 @@ extern int usb_hcd_alloc_bandwidth(struct usb_device *udev, struct usb_host_interface *new_alt); extern int usb_hcd_get_frame_number(struct usb_device *udev); +struct usb_hcd *__usb_create_hcd(const struct hc_driver *driver, + struct device *sysdev, struct device *dev, const char *bus_name, + struct usb_hcd *primary_hcd); extern struct usb_hcd *usb_create_hcd(const struct hc_driver *driver, struct device *dev, const char *bus_name); extern struct usb_hcd *usb_create_shared_hcd(const struct hc_driver *driver, -- cgit v1.2.3 From a9ebf306f52c756c4f9e50ee9a60cd6389d71344 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 1 Feb 2017 16:39:38 +0100 Subject: locking/atomic: Introduce atomic_try_cmpxchg() Add a new cmpxchg interface: bool try_cmpxchg(u{8,16,32,64} *ptr, u{8,16,32,64} *val, u{8,16,32,64} new); Where the boolean returns the result of the compare; and thus if the exchange happened; and in case of failure, the new value of *ptr is returned in *val. 
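For illustration, a hypothetical helper (not part of this patch) built
on the new interface:

	static bool atomic_inc_not_max(atomic_t *v)
	{
		int val = atomic_read(v);

		do {
			if (val == INT_MAX)
				return false;	/* saturated */
		} while (!atomic_try_cmpxchg(v, &val, val + 1));

		return true;
	}

On failure atomic_try_cmpxchg() has already reloaded 'val' from the
counter, so the loop retries without an explicit re-read.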
This allows simplification/improvement of loops like: for (;;) { new = val $op $imm; old = cmpxchg(ptr, val, new); if (old == val) break; val = old; } into: do { } while (!try_cmpxchg(ptr, &val, val $op $imm)); while also generating better code (GCC6 and onwards). Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic.h | 6 ++++ arch/x86/include/asm/atomic64_64.h | 6 ++++ arch/x86/include/asm/cmpxchg.h | 69 ++++++++++++++++++++++++++++++++++++++ include/linux/atomic.h | 42 +++++++++++++++++++++++ 4 files changed, 123 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 14635c5ea025..8410377c6869 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -186,6 +186,12 @@ static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) return cmpxchg(&v->counter, old, new); } +#define atomic_try_cmpxchg atomic_try_cmpxchg +static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new) +{ + return try_cmpxchg(&v->counter, old, new); +} + static inline int atomic_xchg(atomic_t *v, int new) { return xchg(&v->counter, new); diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 89ed2f6ae2f7..12fb57413732 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -176,6 +176,12 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) return cmpxchg(&v->counter, old, new); } +#define atomic64_try_cmpxchg atomic64_try_cmpxchg +static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new) +{ + return try_cmpxchg(&v->counter, old, new); +} + static inline long atomic64_xchg(atomic64_t *v, long new) { return xchg(&v->counter, new); diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 97848cdfcb1a..fb961db51a2a 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -153,6 +153,75 @@ extern void __add_wrong_size(void) #define cmpxchg_local(ptr, old, new) \ __cmpxchg_local(ptr, old, new, sizeof(*(ptr))) + +#define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock) \ +({ \ + bool success; \ + __typeof__(_ptr) _old = (_pold); \ + __typeof__(*(_ptr)) __old = *_old; \ + __typeof__(*(_ptr)) __new = (_new); \ + switch (size) { \ + case __X86_CASE_B: \ + { \ + volatile u8 *__ptr = (volatile u8 *)(_ptr); \ + asm volatile(lock "cmpxchgb %[new], %[ptr]" \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [ptr] "+m" (*__ptr), \ + [old] "+a" (__old) \ + : [new] "q" (__new) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_W: \ + { \ + volatile u16 *__ptr = (volatile u16 *)(_ptr); \ + asm volatile(lock "cmpxchgw %[new], %[ptr]" \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [ptr] "+m" (*__ptr), \ + [old] "+a" (__old) \ + : [new] "r" (__new) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_L: \ + { \ + volatile u32 *__ptr = (volatile u32 *)(_ptr); \ + asm volatile(lock "cmpxchgl %[new], %[ptr]" \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [ptr] "+m" (*__ptr), \ + [old] "+a" (__old) \ + : [new] "r" (__new) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_Q: \ + { \ + volatile u64 *__ptr = (volatile u64 *)(_ptr); \ + asm volatile(lock "cmpxchgq %[new], 
%[ptr]" \ + CC_SET(z) \ + : CC_OUT(z) (success), \ + [ptr] "+m" (*__ptr), \ + [old] "+a" (__old) \ + : [new] "r" (__new) \ + : "memory"); \ + break; \ + } \ + default: \ + __cmpxchg_wrong_size(); \ + } \ + *_old = __old; \ + success; \ +}) + +#define __try_cmpxchg(ptr, pold, new, size) \ + __raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX) + +#define try_cmpxchg(ptr, pold, new) \ + __try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr))) + /* * xadd() adds "inc" to "*ptr" and atomically returns the previous * value of "*ptr". diff --git a/include/linux/atomic.h b/include/linux/atomic.h index e71835bf60a9..aae5953817d6 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -423,6 +423,27 @@ #endif #endif /* atomic_cmpxchg_relaxed */ +#ifndef atomic_try_cmpxchg + +#define __atomic_try_cmpxchg(type, _p, _po, _n) \ +({ \ + typeof(_po) __po = (_po); \ + typeof(*(_po)) __o = *__po; \ + *__po = atomic_cmpxchg##type((_p), __o, (_n)); \ + (*__po == __o); \ +}) + +#define atomic_try_cmpxchg(_p, _po, _n) __atomic_try_cmpxchg(, _p, _po, _n) +#define atomic_try_cmpxchg_relaxed(_p, _po, _n) __atomic_try_cmpxchg(_relaxed, _p, _po, _n) +#define atomic_try_cmpxchg_acquire(_p, _po, _n) __atomic_try_cmpxchg(_acquire, _p, _po, _n) +#define atomic_try_cmpxchg_release(_p, _po, _n) __atomic_try_cmpxchg(_release, _p, _po, _n) + +#else /* atomic_try_cmpxchg */ +#define atomic_try_cmpxchg_relaxed atomic_try_cmpxchg +#define atomic_try_cmpxchg_acquire atomic_try_cmpxchg +#define atomic_try_cmpxchg_release atomic_try_cmpxchg +#endif /* atomic_try_cmpxchg */ + /* cmpxchg_relaxed */ #ifndef cmpxchg_relaxed #define cmpxchg_relaxed cmpxchg @@ -996,6 +1017,27 @@ static inline int atomic_dec_if_positive(atomic_t *v) #endif #endif /* atomic64_cmpxchg_relaxed */ +#ifndef atomic64_try_cmpxchg + +#define __atomic64_try_cmpxchg(type, _p, _po, _n) \ +({ \ + typeof(_po) __po = (_po); \ + typeof(*(_po)) __o = *__po; \ + *__po = atomic64_cmpxchg##type((_p), __o, (_n)); \ + (*__po == __o); \ +}) + +#define atomic64_try_cmpxchg(_p, _po, _n) __atomic64_try_cmpxchg(, _p, _po, _n) +#define atomic64_try_cmpxchg_relaxed(_p, _po, _n) __atomic64_try_cmpxchg(_relaxed, _p, _po, _n) +#define atomic64_try_cmpxchg_acquire(_p, _po, _n) __atomic64_try_cmpxchg(_acquire, _p, _po, _n) +#define atomic64_try_cmpxchg_release(_p, _po, _n) __atomic64_try_cmpxchg(_release, _p, _po, _n) + +#else /* atomic64_try_cmpxchg */ +#define atomic64_try_cmpxchg_relaxed atomic64_try_cmpxchg +#define atomic64_try_cmpxchg_acquire atomic64_try_cmpxchg +#define atomic64_try_cmpxchg_release atomic64_try_cmpxchg +#endif /* atomic64_try_cmpxchg */ + #ifndef atomic64_andnot static inline void atomic64_andnot(long long i, atomic64_t *v) { -- cgit v1.2.3 From f6dd8449cd50de25881b76cecf1086bebeb11fe8 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Fri, 17 Mar 2017 10:05:18 +0000 Subject: mfd: wm831x: Add basic device tree binding Add the basic ability to register the device through device tree, more work is needed to get each individual sub-driver functioning correctly but this is enough to get the device to probe from device tree. 
Signed-off-by: Charles Keepax Signed-off-by: Lee Jones --- drivers/mfd/wm831x-core.c | 29 +++++++++++++++++++++-------- drivers/mfd/wm831x-i2c.c | 19 ++++++++++++++++++- drivers/mfd/wm831x-irq.c | 6 +++--- drivers/mfd/wm831x-spi.c | 18 ++++++++++++++++-- include/linux/mfd/wm831x/core.h | 9 ++++++++- 5 files changed, 66 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/wm831x-core.c b/drivers/mfd/wm831x-core.c index 3e0e99ec5836..13a4c1190dca 100644 --- a/drivers/mfd/wm831x-core.c +++ b/drivers/mfd/wm831x-core.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include @@ -1613,12 +1615,24 @@ struct regmap_config wm831x_regmap_config = { }; EXPORT_SYMBOL_GPL(wm831x_regmap_config); +const struct of_device_id wm831x_of_match[] = { + { .compatible = "wlf,wm8310", .data = (void *)WM8310 }, + { .compatible = "wlf,wm8311", .data = (void *)WM8311 }, + { .compatible = "wlf,wm8312", .data = (void *)WM8312 }, + { .compatible = "wlf,wm8320", .data = (void *)WM8320 }, + { .compatible = "wlf,wm8321", .data = (void *)WM8321 }, + { .compatible = "wlf,wm8325", .data = (void *)WM8325 }, + { .compatible = "wlf,wm8326", .data = (void *)WM8326 }, + { }, +}; +EXPORT_SYMBOL_GPL(wm831x_of_match); + /* * Instantiate the generic non-control parts of the device. */ -int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq) +int wm831x_device_init(struct wm831x *wm831x, int irq) { - struct wm831x_pdata *pdata = dev_get_platdata(wm831x->dev); + struct wm831x_pdata *pdata = &wm831x->pdata; int rev, wm831x_num; enum wm831x_parent parent; int ret, i; @@ -1627,8 +1641,7 @@ int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq) mutex_init(&wm831x->key_lock); dev_set_drvdata(wm831x->dev, wm831x); - if (pdata) - wm831x->soft_shutdown = pdata->soft_shutdown; + wm831x->soft_shutdown = pdata->soft_shutdown; ret = wm831x_reg_read(wm831x, WM831X_PARENT_ID); if (ret < 0) { @@ -1663,7 +1676,7 @@ int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq) */ if (ret == 0) { dev_info(wm831x->dev, "Device is an engineering sample\n"); - ret = id; + ret = wm831x->type; } switch (ret) { @@ -1736,9 +1749,9 @@ int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq) /* This will need revisiting in future but is OK for all * current parts. 
*/ - if (parent != id) - dev_warn(wm831x->dev, "Device was registered as a WM%lx\n", - id); + if (parent != wm831x->type) + dev_warn(wm831x->dev, "Device was registered as a WM%x\n", + wm831x->type); /* Bootstrap the user key */ ret = wm831x_reg_read(wm831x, WM831X_SECURITY_KEY); diff --git a/drivers/mfd/wm831x-i2c.c b/drivers/mfd/wm831x-i2c.c index 824bcbaa9624..781af060f32d 100644 --- a/drivers/mfd/wm831x-i2c.c +++ b/drivers/mfd/wm831x-i2c.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include @@ -27,15 +29,26 @@ static int wm831x_i2c_probe(struct i2c_client *i2c, const struct i2c_device_id *id) { + struct wm831x_pdata *pdata = dev_get_platdata(&i2c->dev); + const struct of_device_id *of_id; struct wm831x *wm831x; + enum wm831x_parent type; int ret; + if (i2c->dev.of_node) { + of_id = of_match_device(wm831x_of_match, &i2c->dev); + type = (enum wm831x_parent)of_id->data; + } else { + type = (enum wm831x_parent)id->driver_data; + } + wm831x = devm_kzalloc(&i2c->dev, sizeof(struct wm831x), GFP_KERNEL); if (wm831x == NULL) return -ENOMEM; i2c_set_clientdata(i2c, wm831x); wm831x->dev = &i2c->dev; + wm831x->type = type; wm831x->regmap = devm_regmap_init_i2c(i2c, &wm831x_regmap_config); if (IS_ERR(wm831x->regmap)) { @@ -45,7 +58,10 @@ static int wm831x_i2c_probe(struct i2c_client *i2c, return ret; } - return wm831x_device_init(wm831x, id->driver_data, i2c->irq); + if (pdata) + memcpy(&wm831x->pdata, pdata, sizeof(*pdata)); + + return wm831x_device_init(wm831x, i2c->irq); } static int wm831x_i2c_remove(struct i2c_client *i2c) @@ -94,6 +110,7 @@ static struct i2c_driver wm831x_i2c_driver = { .driver = { .name = "wm831x", .pm = &wm831x_pm_ops, + .of_match_table = of_match_ptr(wm831x_of_match), }, .probe = wm831x_i2c_probe, .remove = wm831x_i2c_remove, diff --git a/drivers/mfd/wm831x-irq.c b/drivers/mfd/wm831x-irq.c index dfea8b9c2fe6..c01239a600db 100644 --- a/drivers/mfd/wm831x-irq.c +++ b/drivers/mfd/wm831x-irq.c @@ -564,7 +564,7 @@ static const struct irq_domain_ops wm831x_irq_domain_ops = { int wm831x_irq_init(struct wm831x *wm831x, int irq) { - struct wm831x_pdata *pdata = dev_get_platdata(wm831x->dev); + struct wm831x_pdata *pdata = &wm831x->pdata; struct irq_domain *domain; int i, ret, irq_base; @@ -579,7 +579,7 @@ int wm831x_irq_init(struct wm831x *wm831x, int irq) } /* Try to dynamically allocate IRQs if no base is specified */ - if (pdata && pdata->irq_base) { + if (pdata->irq_base) { irq_base = irq_alloc_descs(pdata->irq_base, 0, WM831X_NUM_IRQS, 0); if (irq_base < 0) { @@ -608,7 +608,7 @@ int wm831x_irq_init(struct wm831x *wm831x, int irq) return -EINVAL; } - if (pdata && pdata->irq_cmos) + if (pdata->irq_cmos) i = 0; else i = WM831X_IRQ_OD; diff --git a/drivers/mfd/wm831x-spi.c b/drivers/mfd/wm831x-spi.c index 80482aeb246a..c332e2885b26 100644 --- a/drivers/mfd/wm831x-spi.c +++ b/drivers/mfd/wm831x-spi.c @@ -14,6 +14,8 @@ #include #include +#include +#include #include #include #include @@ -23,12 +25,19 @@ static int wm831x_spi_probe(struct spi_device *spi) { + struct wm831x_pdata *pdata = dev_get_platdata(&spi->dev); const struct spi_device_id *id = spi_get_device_id(spi); + const struct of_device_id *of_id; struct wm831x *wm831x; enum wm831x_parent type; int ret; - type = (enum wm831x_parent)id->driver_data; + if (spi->dev.of_node) { + of_id = of_match_device(wm831x_of_match, &spi->dev); + type = (enum wm831x_parent)of_id->data; + } else { + type = (enum wm831x_parent)id->driver_data; + } wm831x = devm_kzalloc(&spi->dev, sizeof(struct wm831x), 
GFP_KERNEL); if (wm831x == NULL) @@ -38,6 +47,7 @@ static int wm831x_spi_probe(struct spi_device *spi) spi_set_drvdata(spi, wm831x); wm831x->dev = &spi->dev; + wm831x->type = type; wm831x->regmap = devm_regmap_init_spi(spi, &wm831x_regmap_config); if (IS_ERR(wm831x->regmap)) { @@ -47,7 +57,10 @@ static int wm831x_spi_probe(struct spi_device *spi) return ret; } - return wm831x_device_init(wm831x, type, spi->irq); + if (pdata) + memcpy(&wm831x->pdata, pdata, sizeof(*pdata)); + + return wm831x_device_init(wm831x, spi->irq); } static int wm831x_spi_remove(struct spi_device *spi) @@ -97,6 +110,7 @@ static struct spi_driver wm831x_spi_driver = { .driver = { .name = "wm831x", .pm = &wm831x_spi_pm, + .of_match_table = of_match_ptr(wm831x_of_match), }, .id_table = wm831x_spi_ids, .probe = wm831x_spi_probe, diff --git a/include/linux/mfd/wm831x/core.h b/include/linux/mfd/wm831x/core.h index 76c22648436f..b49fa67612f1 100644 --- a/include/linux/mfd/wm831x/core.h +++ b/include/linux/mfd/wm831x/core.h @@ -21,6 +21,8 @@ #include #include #include +#include +#include /* * Register values. @@ -367,6 +369,9 @@ struct wm831x { struct regmap *regmap; + struct wm831x_pdata pdata; + enum wm831x_parent type; + int irq; /* Our chip IRQ */ struct mutex irq_lock; struct irq_domain *irq_domain; @@ -412,7 +417,7 @@ int wm831x_set_bits(struct wm831x *wm831x, unsigned short reg, int wm831x_bulk_read(struct wm831x *wm831x, unsigned short reg, int count, u16 *buf); -int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq); +int wm831x_device_init(struct wm831x *wm831x, int irq); void wm831x_device_exit(struct wm831x *wm831x); int wm831x_device_suspend(struct wm831x *wm831x); void wm831x_device_shutdown(struct wm831x *wm831x); @@ -427,4 +432,6 @@ static inline int wm831x_irq(struct wm831x *wm831x, int irq) extern struct regmap_config wm831x_regmap_config; +extern const struct of_device_id wm831x_of_match[]; + #endif -- cgit v1.2.3 From ead25133e9352896af4de68d2f33f1ef68997e16 Mon Sep 17 00:00:00 2001 From: Ksenija Stanojevic Date: Thu, 16 Mar 2017 13:27:09 +0100 Subject: mfd: mxs-lradc: Add support for mxs-lradc Add core files for low resolution analog-to-digital converter (mxs-lradc) MFD driver. Signed-off-by: Ksenija Stanojevic Reviewed-by: Marek Vasut Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 17 +++ drivers/mfd/Makefile | 1 + drivers/mfd/mxs-lradc.c | 267 ++++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/mxs-lradc.h | 187 +++++++++++++++++++++++++++++ 4 files changed, 472 insertions(+) create mode 100644 drivers/mfd/mxs-lradc.c create mode 100644 include/linux/mfd/mxs-lradc.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 55ecdfb74d31..8bbc91b5186e 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -344,6 +344,23 @@ config MFD_MC13XXX_I2C help Select this if your MC13xxx is connected via an I2C bus. +config MFD_MXS_LRADC + tristate "Freescale i.MX23/i.MX28 LRADC" + depends on ARCH_MXS || COMPILE_TEST + select MFD_CORE + select STMP_DEVICE + help + Say yes here to build support for the Low Resolution + Analog-to-Digital Converter (LRADC) found on the i.MX23 and i.MX28 + processors. This driver provides common support for accessing the + device, additional drivers must be enabled in order to use the + functionality of the device: + mxs-lradc-adc for ADC readings + mxs-lradc-ts for touchscreen support + + This driver can also be built as a module. If so, the module will be + called mxs-lradc. 
+ config MFD_MX25_TSADC tristate "Freescale i.MX25 integrated Touchscreen and ADC unit" select REGMAP_MMIO diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 31ce07611a6f..790698a892ba 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -215,3 +215,4 @@ obj-$(CONFIG_MFD_ALTERA_A10SR) += altera-a10sr.o obj-$(CONFIG_MFD_SUN4I_GPADC) += sun4i-gpadc.o obj-$(CONFIG_MFD_STM32_TIMERS) += stm32-timers.o +obj-$(CONFIG_MFD_MXS_LRADC) += mxs-lradc.o diff --git a/drivers/mfd/mxs-lradc.c b/drivers/mfd/mxs-lradc.c new file mode 100644 index 000000000000..630bd19b2c0a --- /dev/null +++ b/drivers/mfd/mxs-lradc.c @@ -0,0 +1,267 @@ +/* + * Freescale MXS Low Resolution Analog-to-Digital Converter driver + * + * Copyright (c) 2012 DENX Software Engineering, GmbH. + * Copyright (c) 2017 Ksenija Stanojevic + * + * Authors: + * Marek Vasut + * Ksenija Stanojevic + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ADC_CELL 0 +#define TSC_CELL 1 +#define RES_MEM 0 + +enum mx23_lradc_irqs { + MX23_LRADC_TS_IRQ = 0, + MX23_LRADC_CH0_IRQ, + MX23_LRADC_CH1_IRQ, + MX23_LRADC_CH2_IRQ, + MX23_LRADC_CH3_IRQ, + MX23_LRADC_CH4_IRQ, + MX23_LRADC_CH5_IRQ, + MX23_LRADC_CH6_IRQ, + MX23_LRADC_CH7_IRQ, +}; + +enum mx28_lradc_irqs { + MX28_LRADC_TS_IRQ = 0, + MX28_LRADC_TRESH0_IRQ, + MX28_LRADC_TRESH1_IRQ, + MX28_LRADC_CH0_IRQ, + MX28_LRADC_CH1_IRQ, + MX28_LRADC_CH2_IRQ, + MX28_LRADC_CH3_IRQ, + MX28_LRADC_CH4_IRQ, + MX28_LRADC_CH5_IRQ, + MX28_LRADC_CH6_IRQ, + MX28_LRADC_CH7_IRQ, + MX28_LRADC_BUTTON0_IRQ, + MX28_LRADC_BUTTON1_IRQ, +}; + +static struct resource mx23_adc_resources[] = { + DEFINE_RES_MEM(0x0, 0x0), + DEFINE_RES_IRQ_NAMED(MX23_LRADC_CH0_IRQ, "mxs-lradc-channel0"), + DEFINE_RES_IRQ_NAMED(MX23_LRADC_CH1_IRQ, "mxs-lradc-channel1"), + DEFINE_RES_IRQ_NAMED(MX23_LRADC_CH2_IRQ, "mxs-lradc-channel2"), + DEFINE_RES_IRQ_NAMED(MX23_LRADC_CH3_IRQ, "mxs-lradc-channel3"), + DEFINE_RES_IRQ_NAMED(MX23_LRADC_CH4_IRQ, "mxs-lradc-channel4"), + DEFINE_RES_IRQ_NAMED(MX23_LRADC_CH5_IRQ, "mxs-lradc-channel5"), +}; + +static struct resource mx23_touchscreen_resources[] = { + DEFINE_RES_MEM(0x0, 0x0), + DEFINE_RES_IRQ_NAMED(MX23_LRADC_TS_IRQ, "mxs-lradc-touchscreen"), + DEFINE_RES_IRQ_NAMED(MX23_LRADC_CH6_IRQ, "mxs-lradc-channel6"), + DEFINE_RES_IRQ_NAMED(MX23_LRADC_CH7_IRQ, "mxs-lradc-channel7"), +}; + +static struct resource mx28_adc_resources[] = { + DEFINE_RES_MEM(0x0, 0x0), + DEFINE_RES_IRQ_NAMED(MX28_LRADC_TRESH0_IRQ, "mxs-lradc-thresh0"), + DEFINE_RES_IRQ_NAMED(MX28_LRADC_TRESH1_IRQ, "mxs-lradc-thresh1"), + DEFINE_RES_IRQ_NAMED(MX28_LRADC_CH0_IRQ, "mxs-lradc-channel0"), + DEFINE_RES_IRQ_NAMED(MX28_LRADC_CH1_IRQ, "mxs-lradc-channel1"), + DEFINE_RES_IRQ_NAMED(MX28_LRADC_CH2_IRQ, "mxs-lradc-channel2"), + DEFINE_RES_IRQ_NAMED(MX28_LRADC_CH3_IRQ, "mxs-lradc-channel3"), + DEFINE_RES_IRQ_NAMED(MX28_LRADC_CH4_IRQ, "mxs-lradc-channel4"), + DEFINE_RES_IRQ_NAMED(MX28_LRADC_CH5_IRQ, "mxs-lradc-channel5"), + DEFINE_RES_IRQ_NAMED(MX28_LRADC_BUTTON0_IRQ, 
"mxs-lradc-button0"), + DEFINE_RES_IRQ_NAMED(MX28_LRADC_BUTTON1_IRQ, "mxs-lradc-button1"), +}; + +static struct resource mx28_touchscreen_resources[] = { + DEFINE_RES_MEM(0x0, 0x0), + DEFINE_RES_IRQ_NAMED(MX28_LRADC_TS_IRQ, "mxs-lradc-touchscreen"), + DEFINE_RES_IRQ_NAMED(MX28_LRADC_CH6_IRQ, "mxs-lradc-channel6"), + DEFINE_RES_IRQ_NAMED(MX28_LRADC_CH7_IRQ, "mxs-lradc-channel7"), +}; + +static struct mfd_cell mx23_cells[] = { + { + .name = "mxs-lradc-adc", + .resources = mx23_adc_resources, + .num_resources = ARRAY_SIZE(mx23_adc_resources), + }, + { + .name = "mxs-lradc-ts", + .resources = mx23_touchscreen_resources, + .num_resources = ARRAY_SIZE(mx23_touchscreen_resources), + }, +}; + +static struct mfd_cell mx28_cells[] = { + { + .name = "mxs-lradc-adc", + .resources = mx28_adc_resources, + .num_resources = ARRAY_SIZE(mx28_adc_resources), + }, + { + .name = "mxs-lradc-ts", + .resources = mx28_touchscreen_resources, + .num_resources = ARRAY_SIZE(mx28_touchscreen_resources), + } +}; + +static const struct of_device_id mxs_lradc_dt_ids[] = { + { .compatible = "fsl,imx23-lradc", .data = (void *)IMX23_LRADC, }, + { .compatible = "fsl,imx28-lradc", .data = (void *)IMX28_LRADC, }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, mxs_lradc_dt_ids); + +static int mxs_lradc_probe(struct platform_device *pdev) +{ + const struct of_device_id *of_id; + struct device *dev = &pdev->dev; + struct device_node *node = dev->of_node; + struct mxs_lradc *lradc; + struct mfd_cell *cells = NULL; + struct resource *res; + int ret = 0; + u32 ts_wires = 0; + + lradc = devm_kzalloc(&pdev->dev, sizeof(*lradc), GFP_KERNEL); + if (!lradc) + return -ENOMEM; + + of_id = of_match_device(mxs_lradc_dt_ids, &pdev->dev); + if (!of_id) + return -EINVAL; + + lradc->soc = (enum mxs_lradc_id)of_id->data; + + lradc->clk = devm_clk_get(&pdev->dev, NULL); + if (IS_ERR(lradc->clk)) { + dev_err(dev, "Failed to get the delay unit clock\n"); + return PTR_ERR(lradc->clk); + } + + ret = clk_prepare_enable(lradc->clk); + if (ret) { + dev_err(dev, "Failed to enable the delay unit clock\n"); + return ret; + } + + ret = of_property_read_u32(node, "fsl,lradc-touchscreen-wires", + &ts_wires); + + if (!ret) { + lradc->buffer_vchans = BUFFER_VCHANS_LIMITED; + + switch (ts_wires) { + case 4: + lradc->touchscreen_wire = MXS_LRADC_TOUCHSCREEN_4WIRE; + break; + case 5: + if (lradc->soc == IMX28_LRADC) { + lradc->touchscreen_wire = + MXS_LRADC_TOUCHSCREEN_5WIRE; + break; + } + /* fall through to an error message for i.MX23 */ + default: + dev_err(&pdev->dev, + "Unsupported number of touchscreen wires (%d)\n" + , ts_wires); + ret = -EINVAL; + goto err_clk; + } + } else { + lradc->buffer_vchans = BUFFER_VCHANS_ALL; + } + + platform_set_drvdata(pdev, lradc); + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -ENOMEM; + + switch (lradc->soc) { + case IMX23_LRADC: + mx23_adc_resources[RES_MEM] = *res; + mx23_touchscreen_resources[RES_MEM] = *res; + cells = mx23_cells; + break; + case IMX28_LRADC: + mx28_adc_resources[RES_MEM] = *res; + mx28_touchscreen_resources[RES_MEM] = *res; + cells = mx28_cells; + break; + default: + dev_err(dev, "Unsupported SoC\n"); + ret = -ENODEV; + goto err_clk; + } + + ret = devm_mfd_add_devices(&pdev->dev, PLATFORM_DEVID_NONE, + &cells[ADC_CELL], 1, NULL, 0, NULL); + if (ret) { + dev_err(&pdev->dev, "Failed to add the ADC subdevice\n"); + goto err_clk; + } + + if (!lradc->touchscreen_wire) + return 0; + + ret = devm_mfd_add_devices(&pdev->dev, PLATFORM_DEVID_NONE, + &cells[TSC_CELL], 1, NULL, 0, 
NULL); + if (ret) { + dev_err(&pdev->dev, + "Failed to add the touchscreen subdevice\n"); + goto err_clk; + } + + return 0; + +err_clk: + clk_disable_unprepare(lradc->clk); + + return ret; +} + +static int mxs_lradc_remove(struct platform_device *pdev) +{ + struct mxs_lradc *lradc = platform_get_drvdata(pdev); + + clk_disable_unprepare(lradc->clk); + + return 0; +} + +static struct platform_driver mxs_lradc_driver = { + .driver = { + .name = "mxs-lradc", + .of_match_table = mxs_lradc_dt_ids, + }, + .probe = mxs_lradc_probe, + .remove = mxs_lradc_remove, +}; +module_platform_driver(mxs_lradc_driver); + +MODULE_AUTHOR("Ksenija Stanojevic "); +MODULE_DESCRIPTION("Freescale i.MX23/i.MX28 LRADC driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:mxs-lradc"); diff --git a/include/linux/mfd/mxs-lradc.h b/include/linux/mfd/mxs-lradc.h new file mode 100644 index 000000000000..661a4521f723 --- /dev/null +++ b/include/linux/mfd/mxs-lradc.h @@ -0,0 +1,187 @@ +/* + * Freescale MXS Low Resolution Analog-to-Digital Converter driver + * + * Copyright (c) 2012 DENX Software Engineering, GmbH. + * Copyright (c) 2016 Ksenija Stanojevic + * + * Author: Marek Vasut + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef __MFD_MXS_LRADC_H +#define __MFD_MXS_LRADC_H + +#include +#include +#include + +#define LRADC_MAX_DELAY_CHANS 4 +#define LRADC_MAX_MAPPED_CHANS 8 +#define LRADC_MAX_TOTAL_CHANS 16 + +#define LRADC_DELAY_TIMER_HZ 2000 + +#define LRADC_CTRL0 0x00 +# define LRADC_CTRL0_MX28_TOUCH_DETECT_ENABLE BIT(23) +# define LRADC_CTRL0_MX28_TOUCH_SCREEN_TYPE BIT(22) +# define LRADC_CTRL0_MX28_YNNSW /* YM */ BIT(21) +# define LRADC_CTRL0_MX28_YPNSW /* YP */ BIT(20) +# define LRADC_CTRL0_MX28_YPPSW /* YP */ BIT(19) +# define LRADC_CTRL0_MX28_XNNSW /* XM */ BIT(18) +# define LRADC_CTRL0_MX28_XNPSW /* XM */ BIT(17) +# define LRADC_CTRL0_MX28_XPPSW /* XP */ BIT(16) + +# define LRADC_CTRL0_MX23_TOUCH_DETECT_ENABLE BIT(20) +# define LRADC_CTRL0_MX23_YM BIT(19) +# define LRADC_CTRL0_MX23_XM BIT(18) +# define LRADC_CTRL0_MX23_YP BIT(17) +# define LRADC_CTRL0_MX23_XP BIT(16) + +# define LRADC_CTRL0_MX28_PLATE_MASK \ + (LRADC_CTRL0_MX28_TOUCH_DETECT_ENABLE | \ + LRADC_CTRL0_MX28_YNNSW | LRADC_CTRL0_MX28_YPNSW | \ + LRADC_CTRL0_MX28_YPPSW | LRADC_CTRL0_MX28_XNNSW | \ + LRADC_CTRL0_MX28_XNPSW | LRADC_CTRL0_MX28_XPPSW) + +# define LRADC_CTRL0_MX23_PLATE_MASK \ + (LRADC_CTRL0_MX23_TOUCH_DETECT_ENABLE | \ + LRADC_CTRL0_MX23_YM | LRADC_CTRL0_MX23_XM | \ + LRADC_CTRL0_MX23_YP | LRADC_CTRL0_MX23_XP) + +#define LRADC_CTRL1 0x10 +#define LRADC_CTRL1_TOUCH_DETECT_IRQ_EN BIT(24) +#define LRADC_CTRL1_LRADC_IRQ_EN(n) (1 << ((n) + 16)) +#define LRADC_CTRL1_MX28_LRADC_IRQ_EN_MASK (0x1fff << 16) +#define LRADC_CTRL1_MX23_LRADC_IRQ_EN_MASK (0x01ff << 16) +#define LRADC_CTRL1_LRADC_IRQ_EN_OFFSET 16 +#define LRADC_CTRL1_TOUCH_DETECT_IRQ BIT(8) +#define LRADC_CTRL1_LRADC_IRQ(n) BIT(n) +#define LRADC_CTRL1_MX28_LRADC_IRQ_MASK 0x1fff +#define LRADC_CTRL1_MX23_LRADC_IRQ_MASK 0x01ff +#define LRADC_CTRL1_LRADC_IRQ_OFFSET 0 + +#define LRADC_CTRL2 0x20 +#define 
LRADC_CTRL2_DIVIDE_BY_TWO_OFFSET 24 +#define LRADC_CTRL2_TEMPSENSE_PWD BIT(15) + +#define LRADC_STATUS 0x40 +#define LRADC_STATUS_TOUCH_DETECT_RAW BIT(0) + +#define LRADC_CH(n) (0x50 + (0x10 * (n))) +#define LRADC_CH_ACCUMULATE BIT(29) +#define LRADC_CH_NUM_SAMPLES_MASK (0x1f << 24) +#define LRADC_CH_NUM_SAMPLES_OFFSET 24 +#define LRADC_CH_NUM_SAMPLES(x) \ + ((x) << LRADC_CH_NUM_SAMPLES_OFFSET) +#define LRADC_CH_VALUE_MASK 0x3ffff +#define LRADC_CH_VALUE_OFFSET 0 + +#define LRADC_DELAY(n) (0xd0 + (0x10 * (n))) +#define LRADC_DELAY_TRIGGER_LRADCS_MASK (0xffUL << 24) +#define LRADC_DELAY_TRIGGER_LRADCS_OFFSET 24 +#define LRADC_DELAY_TRIGGER(x) \ + (((x) << LRADC_DELAY_TRIGGER_LRADCS_OFFSET) & \ + LRADC_DELAY_TRIGGER_LRADCS_MASK) +#define LRADC_DELAY_KICK BIT(20) +#define LRADC_DELAY_TRIGGER_DELAYS_MASK (0xf << 16) +#define LRADC_DELAY_TRIGGER_DELAYS_OFFSET 16 +#define LRADC_DELAY_TRIGGER_DELAYS(x) \ + (((x) << LRADC_DELAY_TRIGGER_DELAYS_OFFSET) & \ + LRADC_DELAY_TRIGGER_DELAYS_MASK) +#define LRADC_DELAY_LOOP_COUNT_MASK (0x1f << 11) +#define LRADC_DELAY_LOOP_COUNT_OFFSET 11 +#define LRADC_DELAY_LOOP(x) \ + (((x) << LRADC_DELAY_LOOP_COUNT_OFFSET) & \ + LRADC_DELAY_LOOP_COUNT_MASK) +#define LRADC_DELAY_DELAY_MASK 0x7ff +#define LRADC_DELAY_DELAY_OFFSET 0 +#define LRADC_DELAY_DELAY(x) \ + (((x) << LRADC_DELAY_DELAY_OFFSET) & \ + LRADC_DELAY_DELAY_MASK) + +#define LRADC_CTRL4 0x140 +#define LRADC_CTRL4_LRADCSELECT_MASK(n) (0xf << ((n) * 4)) +#define LRADC_CTRL4_LRADCSELECT_OFFSET(n) ((n) * 4) +#define LRADC_CTRL4_LRADCSELECT(n, x) \ + (((x) << LRADC_CTRL4_LRADCSELECT_OFFSET(n)) & \ + LRADC_CTRL4_LRADCSELECT_MASK(n)) + +#define LRADC_RESOLUTION 12 +#define LRADC_SINGLE_SAMPLE_MASK ((1 << LRADC_RESOLUTION) - 1) + +#define BUFFER_VCHANS_LIMITED 0x3f +#define BUFFER_VCHANS_ALL 0xff + + /* + * Certain LRADC channels are shared between touchscreen + * and/or touch-buttons and generic LRADC block. Therefore when using + * either of these, these channels are not available for the regular + * sampling. The shared channels are as follows: + * + * CH0 -- Touch button #0 + * CH1 -- Touch button #1 + * CH2 -- Touch screen XPUL + * CH3 -- Touch screen YPLL + * CH4 -- Touch screen XNUL + * CH5 -- Touch screen YNLR + * CH6 -- Touch screen WIPER (5-wire only) + * + * The bit fields below represents which parts of the LRADC block are + * switched into special mode of operation. These channels can not + * be sampled as regular LRADC channels. The driver will refuse any + * attempt to sample these channels. 
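+ *
+ * (An illustrative reading of the masks below, added as an editorial
+ * note: CHAN_MASK_TOUCHSCREEN_4WIRE is 0xf << 2, reserving channels
+ * 2-5 above; CHAN_MASK_TOUCHSCREEN_5WIRE is 0x1f << 2, additionally
+ * covering the WIPER channel 6; CHAN_MASK_TOUCHBUTTON covers
+ * channels 0 and 1.)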
+ */ +#define CHAN_MASK_TOUCHBUTTON (BIT(1) | BIT(0)) +#define CHAN_MASK_TOUCHSCREEN_4WIRE (0xf << 2) +#define CHAN_MASK_TOUCHSCREEN_5WIRE (0x1f << 2) + +enum mxs_lradc_id { + IMX23_LRADC, + IMX28_LRADC, +}; + +enum mxs_lradc_ts_wires { + MXS_LRADC_TOUCHSCREEN_NONE = 0, + MXS_LRADC_TOUCHSCREEN_4WIRE, + MXS_LRADC_TOUCHSCREEN_5WIRE, +}; + +/** + * struct mxs_lradc + * @soc: soc type (IMX23 or IMX28) + * @clk: 2 kHz clock for delay units + * @buffer_vchans: channels that can be used during buffered capture + * @touchscreen_wire: touchscreen type (4-wire or 5-wire) + * @use_touchbutton: button state (on or off) + */ +struct mxs_lradc { + enum mxs_lradc_id soc; + struct clk *clk; + u8 buffer_vchans; + + enum mxs_lradc_ts_wires touchscreen_wire; + bool use_touchbutton; +}; + +static inline u32 mxs_lradc_irq_mask(struct mxs_lradc *lradc) +{ + switch (lradc->soc) { + case IMX23_LRADC: + return LRADC_CTRL1_MX23_LRADC_IRQ_MASK; + case IMX28_LRADC: + return LRADC_CTRL1_MX28_LRADC_IRQ_MASK; + default: + return 0; + } +} + +#endif /* __MXS_LRADC_H */ -- cgit v1.2.3 From e1fe7b6a7b376bfb54558725ddb2a89aaaa4adcc Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Tue, 21 Mar 2017 13:56:46 +0200 Subject: lib/string: add sysfs_match_string helper Make a simple helper for matching strings with sysfs attribute files. In most parts the same as match_string(), except sysfs_match_string() uses sysfs_streq() instead of strcmp() for matching. This is more convenient when used with sysfs attributes. Signed-off-by: Heikki Krogerus Reviewed-by: Mika Westerberg Reviewed-by: Felipe Balbi Tested-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- include/linux/string.h | 10 ++++++++++ lib/string.c | 26 ++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 26b6f6a66f83..c4011b28f3d8 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -135,6 +135,16 @@ static inline int strtobool(const char *s, bool *res) } int match_string(const char * const *array, size_t n, const char *string); +int __sysfs_match_string(const char * const *array, size_t n, const char *s); + +/** + * sysfs_match_string - matches given string in an array + * @_a: array of strings + * @_s: string to match with + * + * Helper for __sysfs_match_string(). Calculates the size of @a automatically. + */ +#define sysfs_match_string(_a, _s) __sysfs_match_string(_a, ARRAY_SIZE(_a), _s) #ifdef CONFIG_BINARY_PRINTF int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); diff --git a/lib/string.c b/lib/string.c index ed83562a53ae..1a7d3fd52541 100644 --- a/lib/string.c +++ b/lib/string.c @@ -656,6 +656,32 @@ int match_string(const char * const *array, size_t n, const char *string) } EXPORT_SYMBOL(match_string); +/** + * __sysfs_match_string - matches given string in an array + * @array: array of strings + * @n: number of strings in the array or -1 for NULL terminated arrays + * @str: string to match with + * + * Returns index of @str in the @array or -EINVAL, just like match_string(). + * Uses sysfs_streq instead of strcmp for matching. 
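+ *
+ * Illustrative use from a sysfs store() callback (an assumed caller,
+ * not part of this patch; sysfs_streq() also matches when @str
+ * carries the trailing newline a write(2) from user space adds):
+ *
+ *	static const char * const levels[] = { "off", "low", "high" };
+ *	int i = sysfs_match_string(levels, buf);
+ *
+ *	if (i < 0)
+ *		return i;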
+ */ +int __sysfs_match_string(const char * const *array, size_t n, const char *str) +{ + const char *item; + int index; + + for (index = 0; index < n; index++) { + item = array[index]; + if (!item) + break; + if (sysfs_streq(item, str)) + return index; + } + + return -EINVAL; +} +EXPORT_SYMBOL(__sysfs_match_string); + #ifndef __HAVE_ARCH_MEMSET /** * memset - Fill a region of memory with the given value -- cgit v1.2.3 From fab9288428ec0fbd09adb67d3a17c51d78196f9c Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Tue, 21 Mar 2017 13:56:47 +0200 Subject: usb: USB Type-C connector class The purpose of USB Type-C connector class is to provide unified interface for the user space to get the status and basic information about USB Type-C connectors on a system, control over data role swapping, and when the port supports USB Power Delivery, also control over power role swapping and Alternate Modes. Signed-off-by: Heikki Krogerus Reviewed-by: Mika Westerberg Reviewed-and-Tested-by: Felipe Balbi Tested-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-class-typec | 276 ++++++ Documentation/usb/typec.rst | 184 ++++ MAINTAINERS | 9 + drivers/usb/Kconfig | 2 + drivers/usb/Makefile | 2 + drivers/usb/typec/Kconfig | 7 + drivers/usb/typec/Makefile | 1 + drivers/usb/typec/typec.c | 1262 +++++++++++++++++++++++++++ include/linux/usb/typec.h | 243 ++++++ 9 files changed, 1986 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-class-typec create mode 100644 Documentation/usb/typec.rst create mode 100644 drivers/usb/typec/Kconfig create mode 100644 drivers/usb/typec/Makefile create mode 100644 drivers/usb/typec/typec.c create mode 100644 include/linux/usb/typec.h (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-class-typec b/Documentation/ABI/testing/sysfs-class-typec new file mode 100644 index 000000000000..d4a3d23eb09c --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-typec @@ -0,0 +1,276 @@ +USB Type-C port devices (eg. /sys/class/typec/port0/) + +What: /sys/class/typec//data_role +Date: April 2017 +Contact: Heikki Krogerus +Description: + The supported USB data roles. This attribute can be used for + requesting data role swapping on the port. Swapping is supported + as synchronous operation, so write(2) to the attribute will not + return until the operation has finished. The attribute is + notified about role changes so that poll(2) on the attribute + wakes up. Change on the role will also generate uevent + KOBJ_CHANGE on the port. The current role is show in brackets, + for example "[host] device" when DRP port is in host mode. + + Valid values: host, device + +What: /sys/class/typec//power_role +Date: April 2017 +Contact: Heikki Krogerus +Description: + The supported power roles. This attribute can be used to request + power role swap on the port when the port supports USB Power + Delivery. Swapping is supported as synchronous operation, so + write(2) to the attribute will not return until the operation + has finished. The attribute is notified about role changes so + that poll(2) on the attribute wakes up. Change on the role will + also generate uevent KOBJ_CHANGE. The current role is show in + brackets, for example "[source] sink" when in source mode. + + Valid values: source, sink + +What: /sys/class/typec//vconn_source +Date: April 2017 +Contact: Heikki Krogerus +Description: + Shows is the port VCONN Source. 
This attribute can be used to + request VCONN swap to change the VCONN Source during connection + when both the port and the partner support USB Power Delivery. + Swapping is supported as synchronous operation, so write(2) to + the attribute will not return until the operation has finished. + The attribute is notified about VCONN source changes so that + poll(2) on the attribute wakes up. Change on VCONN source also + generates uevent KOBJ_CHANGE. + + Valid values: + - "no" when the port is not the VCONN Source + - "yes" when the port is the VCONN Source + +What: /sys/class/typec//power_operation_mode +Date: April 2017 +Contact: Heikki Krogerus +Description: + Shows the current power operational mode the port is in. The + power operation mode means current level for VBUS. In case USB + Power Delivery communication is used for negotiating the levels, + power operation mode should show "usb_power_delivery". + + Valid values: + - default + - 1.5A + - 3.0A + - usb_power_delivery + +What: /sys/class/typec//preferred_role +Date: April 2017 +Contact: Heikki Krogerus +Description: + The user space can notify the driver about the preferred role. + It should be handled as enabling of Try.SRC or Try.SNK, as + defined in USB Type-C specification, in the port drivers. By + default the preferred role should come from the platform. + + Valid values: source, sink, none (to remove preference) + +What: /sys/class/typec//supported_accessory_modes +Date: April 2017 +Contact: Heikki Krogerus +Description: + Space separated list of accessory modes, defined in the USB + Type-C specification, the port supports. + +What: /sys/class/typec//usb_power_delivery_revision +Date: April 2017 +Contact: Heikki Krogerus +Description: + Revision number of the supported USB Power Delivery + specification, or 0 when USB Power Delivery is not supported. + +What: /sys/class/typec//usb_typec_revision +Date: April 2017 +Contact: Heikki Krogerus +Description: + Revision number of the supported USB Type-C specification. + + +USB Type-C partner devices (eg. /sys/class/typec/port0-partner/) + +What: /sys/class/typec/-partner/accessory_mode +Date: April 2017 +Contact: Heikki Krogerus +Description: + Shows the Accessory Mode name when the partner is an Accessory. + The Accessory Modes are defined in USB Type-C Specification. + +What: /sys/class/typec/-partner/supports_usb_power_delivery +Date: April 2017 +Contact: Heikki Krogerus +Description: + Shows if the partner supports USB Power Delivery communication: + Valid values: yes, no + +What: /sys/class/typec/-partner>/identity/ +Date: April 2017 +Contact: Heikki Krogerus +Description: + This directory appears only if the port device driver is capable + of showing the result of Discover Identity USB power delivery + command. That will not always be possible even when USB power + delivery is supported, for example when USB power delivery + communication for the port is mostly handled in firmware. If the + directory exists, it will have an attribute file for every VDO + in Discover Identity command result. + +What: /sys/class/typec/-partner/identity/id_header +Date: April 2017 +Contact: Heikki Krogerus +Description: + ID Header VDO part of Discover Identity command result. The + value will show 0 until Discover Identity command result becomes + available. The value can be polled. + +What: /sys/class/typec/-partner/identity/cert_stat +Date: April 2017 +Contact: Heikki Krogerus +Description: + Cert Stat VDO part of Discover Identity command result. 
The + value will show 0 until Discover Identity command result becomes + available. The value can be polled. + +What: /sys/class/typec/-partner/identity/product +Date: April 2017 +Contact: Heikki Krogerus +Description: + Product VDO part of Discover Identity command result. The value + will show 0 until Discover Identity command result becomes + available. The value can be polled. + + +USB Type-C cable devices (eg. /sys/class/typec/port0-cable/) + +Note: Electronically Marked Cables will have a device also for one cable plug +(eg. /sys/class/typec/port0-plug0). If the cable is active and has also SOP +Double Prime controller (USB Power Deliver specification ch. 2.4) it will have +second device also for the other plug. Both plugs may have alternate modes as +described in USB Type-C and USB Power Delivery specifications. + +What: /sys/class/typec/-cable/type +Date: April 2017 +Contact: Heikki Krogerus +Description: + Shows if the cable is active. + Valid values: active, passive + +What: /sys/class/typec/-cable/plug_type +Date: April 2017 +Contact: Heikki Krogerus +Description: + Shows type of the plug on the cable: + - type-a - Standard A + - type-b - Standard B + - type-c + - captive + +What: /sys/class/typec/-cable/identity/ +Date: April 2017 +Contact: Heikki Krogerus +Description: + This directory appears only if the port device driver is capable + of showing the result of Discover Identity USB power delivery + command. That will not always be possible even when USB power + delivery is supported. If the directory exists, it will have an + attribute for every VDO returned by Discover Identity command. + +What: /sys/class/typec/-cable/identity/id_header +Date: April 2017 +Contact: Heikki Krogerus +Description: + ID Header VDO part of Discover Identity command result. The + value will show 0 until Discover Identity command result becomes + available. The value can be polled. + +What: /sys/class/typec/-cable/identity/cert_stat +Date: April 2017 +Contact: Heikki Krogerus +Description: + Cert Stat VDO part of Discover Identity command result. The + value will show 0 until Discover Identity command result becomes + available. The value can be polled. + +What: /sys/class/typec/-cable/identity/product +Date: April 2017 +Contact: Heikki Krogerus +Description: + Product VDO part of Discover Identity command result. The value + will show 0 until Discover Identity command result becomes + available. The value can be polled. + + +Alternate Mode devices. + +The alternate modes will have Standard or Vendor ID (SVID) assigned by USB-IF. +The ports, partners and cable plugs can have alternate modes. A supported SVID +will consist of a set of modes. Every SVID a port/partner/plug supports will +have a device created for it, and every supported mode for a supported SVID will +have its own directory under that device. Below refers to the device for +the alternate mode. + +What: /sys/class/typec///svid +Date: April 2017 +Contact: Heikki Krogerus +Description: + The SVID (Standard or Vendor ID) assigned by USB-IF for this + alternate mode. + +What: /sys/class/typec///mode/ +Date: April 2017 +Contact: Heikki Krogerus +Description: + Every supported mode will have its own directory. The name of + a mode will be "mode" (for example mode1), where + is the actual index to the mode VDO returned by Discover Modes + USB power delivery command. + +What: /sys/class/typec///mode/description +Date: April 2017 +Contact: Heikki Krogerus +Description: + Shows description of the mode. 
The description is optional for + the drivers, just like with the Billboard Devices. + +What: /sys/class/typec///mode/vdo +Date: April 2017 +Contact: Heikki Krogerus +Description: + Shows the VDO in hexadecimal returned by Discover Modes command + for this mode. + +What: /sys/class/typec///mode/active +Date: April 2017 +Contact: Heikki Krogerus +Description: + Shows if the mode is active or not. The attribute can be used + for entering/exiting the mode with partners and cable plugs, and + with the port alternate modes it can be used for disabling + support for specific alternate modes. Entering/exiting modes is + supported as synchronous operation so write(2) to the attribute + does not return until the enter/exit mode operation has + finished. The attribute is notified when the mode is + entered/exited so poll(2) on the attribute wakes up. + Entering/exiting a mode will also generate uevent KOBJ_CHANGE. + + Valid values: yes, no + +What: /sys/class/typec///mode/supported_roles +Date: April 2017 +Contact: Heikki Krogerus +Description: + Space separated list of the supported roles. + + This attribute is available for the devices describing the + alternate modes a port supports, and it will not be exposed with + the devices presenting the alternate modes the partners or cable + plugs support. + + Valid values: source, sink diff --git a/Documentation/usb/typec.rst b/Documentation/usb/typec.rst new file mode 100644 index 000000000000..b67a46779de9 --- /dev/null +++ b/Documentation/usb/typec.rst @@ -0,0 +1,184 @@ + +USB Type-C connector class +========================== + +Introduction +------------ + +The typec class is meant for describing the USB Type-C ports in a system to the +user space in unified fashion. The class is designed to provide nothing else +except the user space interface implementation in hope that it can be utilized +on as many platforms as possible. + +The platforms are expected to register every USB Type-C port they have with the +class. In a normal case the registration will be done by a USB Type-C or PD PHY +driver, but it may be a driver for firmware interface such as UCSI, driver for +USB PD controller or even driver for Thunderbolt3 controller. This document +considers the component registering the USB Type-C ports with the class as "port +driver". + +On top of showing the capabilities, the class also offer user space control over +the roles and alternate modes of ports, partners and cable plugs when the port +driver is capable of supporting those features. + +The class provides an API for the port drivers described in this document. The +attributes are described in Documentation/ABI/testing/sysfs-class-typec. + +User space interface +-------------------- +Every port will be presented as its own device under /sys/class/typec/. The +first port will be named "port0", the second "port1" and so on. + +When connected, the partner will be presented also as its own device under +/sys/class/typec/. The parent of the partner device will always be the port it +is attached to. The partner attached to port "port0" will be named +"port0-partner". Full path to the device would be +/sys/class/typec/port0/port0-partner/. + +The cable and the two plugs on it may also be optionally presented as their own +devices under /sys/class/typec/. The cable attached to the port "port0" port +will be named port0-cable and the plug on the SOP Prime end (see USB Power +Delivery Specification ch. 2.4) will be named "port0-plug0" and on the SOP +Double Prime end "port0-plug1". 
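+For example, a port with a partner and an electronically marked active
+cable might expose the following devices (an illustrative listing, not
+taken from a real system)::
+
+	/sys/class/typec/port0		the port itself
+	/sys/class/typec/port0-partner	the attached partner
+	/sys/class/typec/port0-cable	the cable
+	/sys/class/typec/port0-plug0	plug on the SOP Prime end
+	/sys/class/typec/port0-plug1	plug on the SOP Double Prime end
+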
The parent of a cable will always be the port, +and the parent of the cable plugs will always be the cable. + +If the port, partner or cable plug supports Alternate Modes, every supported +Alternate Mode SVID will have their own device describing them. Note that the +Alternate Mode devices will not be attached to the typec class. The parent of an +alternate mode will be the device that supports it, so for example an alternate +mode of port0-partner will be presented under /sys/class/typec/port0-partner/. +Every mode that is supported will have its own group under the Alternate Mode +device named "mode", for example /sys/class/typec/port0//mode1/. The requests for entering/exiting a mode can be done with "active" +attribute file in that group. + +Driver API +---------- + +Registering the ports +~~~~~~~~~~~~~~~~~~~~~ + +The port drivers will describe every Type-C port they control with struct +typec_capability data structure, and register them with the following API: + +.. kernel-doc:: drivers/usb/typec/typec.c + :functions: typec_register_port typec_unregister_port + +When registering the ports, the prefer_role member in struct typec_capability +deserves special notice. If the port that is being registered does not have +initial role preference, which means the port does not execute Try.SNK or +Try.SRC by default, the member must have value TYPEC_NO_PREFERRED_ROLE. +Otherwise if the port executes Try.SNK by default, the member must have value +TYPEC_DEVICE, and with Try.SRC the value must be TYPEC_HOST. + +Registering Partners +~~~~~~~~~~~~~~~~~~~~ + +After successful connection of a partner, the port driver needs to register the +partner with the class. Details about the partner need to be described in struct +typec_partner_desc. The class copies the details of the partner during +registration. The class offers the following API for registering/unregistering +partners. + +.. kernel-doc:: drivers/usb/typec/typec.c + :functions: typec_register_partner typec_unregister_partner + +The class will provide a handle to struct typec_partner if the registration was +successful, or NULL. + +If the partner is USB Power Delivery capable, and the port driver is able to +show the result of Discover Identity command, the partner descriptor structure +should include handle to struct usb_pd_identity instance. The class will then +create a sysfs directory for the identity under the partner device. The result +of Discover Identity command can then be reported with the following API: + +.. kernel-doc:: drivers/usb/typec/typec.c + :functions: typec_partner_set_identity + +Registering Cables +~~~~~~~~~~~~~~~~~~ + +After successful connection of a cable that supports USB Power Delivery +Structured VDM "Discover Identity", the port driver needs to register the cable +and one or two plugs, depending if there is CC Double Prime controller present +in the cable or not. So a cable capable of SOP Prime communication, but not SOP +Double Prime communication, should only have one plug registered. For more +information about SOP communication, please read chapter about it from the +latest USB Power Delivery specification. + +The plugs are represented as their own devices. The cable is registered first, +followed by registration of the cable plugs. The cable will be the parent device +for the plugs. Details about the cable need to be described in struct +typec_cable_desc and about a plug in struct typec_plug_desc. The class copies +the details during registration. 
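+A connect-time sketch for a cable with both plugs, using the API
+referenced below (a hypothetical port driver; the descriptor field and
+constant names are taken from the typec.h header this patch adds, and
+error handling is elided)::
+
+	struct typec_cable_desc cable_desc = {
+		.type = USB_PLUG_TYPE_C,
+		.active = 1,
+	};
+	struct typec_plug_desc plug_desc = {
+		.index = TYPEC_PLUG_SOP_P,
+	};
+	struct typec_cable *cable;
+	struct typec_plug *plug;
+
+	cable = typec_register_cable(port, &cable_desc);
+	plug = typec_register_plug(cable, &plug_desc);
+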
The class offers the following API for +registering/unregistering cables and their plugs: + +.. kernel-doc:: drivers/usb/typec/typec.c + :functions: typec_register_cable typec_unregister_cable typec_register_plug + typec_unregister_plug + +The class will provide a handle to struct typec_cable and struct typec_plug if +the registration is successful, or NULL if it isn't. + +If the cable is USB Power Delivery capable, and the port driver is able to show +the result of Discover Identity command, the cable descriptor structure should +include handle to struct usb_pd_identity instance. The class will then create a +sysfs directory for the identity under the cable device. The result of Discover +Identity command can then be reported with the following API: + +.. kernel-doc:: drivers/usb/typec/typec.c + :functions: typec_cable_set_identity + +Notifications +~~~~~~~~~~~~~ + +When the partner has executed a role change, or when the default roles change +during connection of a partner or cable, the port driver must use the following +APIs to report it to the class: + +.. kernel-doc:: drivers/usb/typec/typec.c + :functions: typec_set_data_role typec_set_pwr_role typec_set_vconn_role + typec_set_pwr_opmode + +Alternate Modes +~~~~~~~~~~~~~~~ + +USB Type-C ports, partners and cable plugs may support Alternate Modes. Each +Alternate Mode will have identifier called SVID, which is either a Standard ID +given by USB-IF or vendor ID, and each supported SVID can have 1 - 6 modes. The +class provides struct typec_mode_desc for describing individual mode of a SVID, +and struct typec_altmode_desc which is a container for all the supported modes. + +Ports that support Alternate Modes need to register each SVID they support with +the following API: + +.. kernel-doc:: drivers/usb/typec/typec.c + :functions: typec_port_register_altmode + +If a partner or cable plug provides a list of SVIDs as response to USB Power +Delivery Structured VDM Discover SVIDs message, each SVID needs to be +registered. + +API for the partners: + +.. kernel-doc:: drivers/usb/typec/typec.c + :functions: typec_partner_register_altmode + +API for the Cable Plugs: + +.. kernel-doc:: drivers/usb/typec/typec.c + :functions: typec_plug_register_altmode + +So ports, partners and cable plugs will register the alternate modes with their +own functions, but the registration will always return a handle to struct +typec_altmode on success, or NULL. The unregistration will happen with the same +function: + +.. kernel-doc:: drivers/usb/typec/typec.c + :functions: typec_unregister_altmode + +If a partner or cable plug enters or exits a mode, the port driver needs to +notify the class with the following API: + +.. 
kernel-doc:: drivers/usb/typec/typec.c + :functions: typec_altmode_update_active diff --git a/MAINTAINERS b/MAINTAINERS index c776906f67a9..9f72fbede9dc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13100,6 +13100,15 @@ F: drivers/usb/ F: include/linux/usb.h F: include/linux/usb/ +USB TYPEC SUBSYSTEM +M: Heikki Krogerus +L: linux-usb@vger.kernel.org +S: Maintained +F: Documentation/ABI/testing/sysfs-class-typec +F: Documentation/usb/typec.rst +F: drivers/usb/typec/ +F: include/linux/usb/typec.h + USB UHCI DRIVER M: Alan Stern L: linux-usb@vger.kernel.org diff --git a/drivers/usb/Kconfig b/drivers/usb/Kconfig index aba6ebd8dedf..939a63bca82f 100644 --- a/drivers/usb/Kconfig +++ b/drivers/usb/Kconfig @@ -162,6 +162,8 @@ source "drivers/usb/phy/Kconfig" source "drivers/usb/gadget/Kconfig" +source "drivers/usb/typec/Kconfig" + config USB_LED_TRIG bool "USB LED Triggers" depends on LEDS_CLASS && LEDS_TRIGGERS diff --git a/drivers/usb/Makefile b/drivers/usb/Makefile index 4e1cf090fd20..34b50dfc8f40 100644 --- a/drivers/usb/Makefile +++ b/drivers/usb/Makefile @@ -62,3 +62,5 @@ obj-$(CONFIG_USB_GADGET) += gadget/ obj-$(CONFIG_USB_COMMON) += common/ obj-$(CONFIG_USBIP_CORE) += usbip/ + +obj-$(CONFIG_TYPEC) += typec/ diff --git a/drivers/usb/typec/Kconfig b/drivers/usb/typec/Kconfig new file mode 100644 index 000000000000..17792f9114c6 --- /dev/null +++ b/drivers/usb/typec/Kconfig @@ -0,0 +1,7 @@ + +menu "USB Power Delivery and Type-C drivers" + +config TYPEC + tristate + +endmenu diff --git a/drivers/usb/typec/Makefile b/drivers/usb/typec/Makefile new file mode 100644 index 000000000000..1012a8bed6d5 --- /dev/null +++ b/drivers/usb/typec/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_TYPEC) += typec.o diff --git a/drivers/usb/typec/typec.c b/drivers/usb/typec/typec.c new file mode 100644 index 000000000000..89e540bb7ff3 --- /dev/null +++ b/drivers/usb/typec/typec.c @@ -0,0 +1,1262 @@ +/* + * USB Type-C Connector Class + * + * Copyright (C) 2017, Intel Corporation + * Author: Heikki Krogerus + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include + +struct typec_mode { + int index; + u32 vdo; + char *desc; + enum typec_port_type roles; + + struct typec_altmode *alt_mode; + + unsigned int active:1; + + char group_name[6]; + struct attribute_group group; + struct attribute *attrs[5]; + struct device_attribute vdo_attr; + struct device_attribute desc_attr; + struct device_attribute active_attr; + struct device_attribute roles_attr; +}; + +struct typec_altmode { + struct device dev; + u16 svid; + int n_modes; + struct typec_mode modes[ALTMODE_MAX_MODES]; + const struct attribute_group *mode_groups[ALTMODE_MAX_MODES]; +}; + +struct typec_plug { + struct device dev; + enum typec_plug_index index; +}; + +struct typec_cable { + struct device dev; + enum typec_plug_type type; + struct usb_pd_identity *identity; + unsigned int active:1; +}; + +struct typec_partner { + struct device dev; + unsigned int usb_pd:1; + struct usb_pd_identity *identity; + enum typec_accessory accessory; +}; + +struct typec_port { + unsigned int id; + struct device dev; + + int prefer_role; + enum typec_data_role data_role; + enum typec_role pwr_role; + enum typec_role vconn_role; + enum typec_pwr_opmode pwr_opmode; + + const struct typec_capability *cap; +}; + +#define to_typec_port(_dev_) container_of(_dev_, struct typec_port, dev) +#define to_typec_plug(_dev_) container_of(_dev_, struct typec_plug, dev) +#define to_typec_cable(_dev_) container_of(_dev_, struct typec_cable, dev) +#define to_typec_partner(_dev_) container_of(_dev_, struct typec_partner, dev) +#define to_altmode(_dev_) container_of(_dev_, struct typec_altmode, dev) + +static const struct device_type typec_partner_dev_type; +static const struct device_type typec_cable_dev_type; +static const struct device_type typec_plug_dev_type; +static const struct device_type typec_port_dev_type; + +#define is_typec_partner(_dev_) (_dev_->type == &typec_partner_dev_type) +#define is_typec_cable(_dev_) (_dev_->type == &typec_cable_dev_type) +#define is_typec_plug(_dev_) (_dev_->type == &typec_plug_dev_type) +#define is_typec_port(_dev_) (_dev_->type == &typec_port_dev_type) + +static DEFINE_IDA(typec_index_ida); +static struct class *typec_class; + +/* Common attributes */ + +static const char * const typec_accessory_modes[] = { + [TYPEC_ACCESSORY_NONE] = "none", + [TYPEC_ACCESSORY_AUDIO] = "analog_audio", + [TYPEC_ACCESSORY_DEBUG] = "debug", +}; + +static struct usb_pd_identity *get_pd_identity(struct device *dev) +{ + if (is_typec_partner(dev)) { + struct typec_partner *partner = to_typec_partner(dev); + + return partner->identity; + } else if (is_typec_cable(dev)) { + struct typec_cable *cable = to_typec_cable(dev); + + return cable->identity; + } + return NULL; +} + +static ssize_t id_header_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct usb_pd_identity *id = get_pd_identity(dev); + + return sprintf(buf, "0x%08x\n", id->id_header); +} +static DEVICE_ATTR_RO(id_header); + +static ssize_t cert_stat_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct usb_pd_identity *id = get_pd_identity(dev); + + return sprintf(buf, "0x%08x\n", id->cert_stat); +} +static DEVICE_ATTR_RO(cert_stat); + +static ssize_t product_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct usb_pd_identity *id = get_pd_identity(dev); + + return sprintf(buf, "0x%08x\n", id->product); +} +static DEVICE_ATTR_RO(product); + +static struct attribute *usb_pd_id_attrs[] = { + &dev_attr_id_header.attr, + 
&dev_attr_cert_stat.attr, + &dev_attr_product.attr, + NULL +}; + +static const struct attribute_group usb_pd_id_group = { + .name = "identity", + .attrs = usb_pd_id_attrs, +}; + +static const struct attribute_group *usb_pd_id_groups[] = { + &usb_pd_id_group, + NULL, +}; + +static void typec_report_identity(struct device *dev) +{ + sysfs_notify(&dev->kobj, "identity", "id_header"); + sysfs_notify(&dev->kobj, "identity", "cert_stat"); + sysfs_notify(&dev->kobj, "identity", "product"); +} + +/* ------------------------------------------------------------------------- */ +/* Alternate Modes */ + +/** + * typec_altmode_update_active - Report Enter/Exit mode + * @alt: Handle to the alternate mode + * @mode: Mode index + * @active: True when the mode has been entered + * + * If a partner or cable plug executes Enter/Exit Mode command successfully, the + * drivers use this routine to report the updated state of the mode. + */ +void typec_altmode_update_active(struct typec_altmode *alt, int mode, + bool active) +{ + struct typec_mode *m = &alt->modes[mode]; + char dir[6]; + + if (m->active == active) + return; + + m->active = active; + snprintf(dir, sizeof(dir), "mode%d", mode); + sysfs_notify(&alt->dev.kobj, dir, "active"); + kobject_uevent(&alt->dev.kobj, KOBJ_CHANGE); +} +EXPORT_SYMBOL_GPL(typec_altmode_update_active); + +/** + * typec_altmode2port - Alternate Mode to USB Type-C port + * @alt: The Alternate Mode + * + * Returns handle to the port that a cable plug or partner with @alt is + * connected to. + */ +struct typec_port *typec_altmode2port(struct typec_altmode *alt) +{ + if (is_typec_plug(alt->dev.parent)) + return to_typec_port(alt->dev.parent->parent->parent); + if (is_typec_partner(alt->dev.parent)) + return to_typec_port(alt->dev.parent->parent); + if (is_typec_port(alt->dev.parent)) + return to_typec_port(alt->dev.parent); + + return NULL; +} +EXPORT_SYMBOL_GPL(typec_altmode2port); + +static ssize_t +typec_altmode_vdo_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct typec_mode *mode = container_of(attr, struct typec_mode, + vdo_attr); + + return sprintf(buf, "0x%08x\n", mode->vdo); +} + +static ssize_t +typec_altmode_desc_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct typec_mode *mode = container_of(attr, struct typec_mode, + desc_attr); + + return sprintf(buf, "%s\n", mode->desc ? mode->desc : ""); +} + +static ssize_t +typec_altmode_active_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct typec_mode *mode = container_of(attr, struct typec_mode, + active_attr); + + return sprintf(buf, "%s\n", mode->active ? 
"yes" : "no"); +} + +static ssize_t +typec_altmode_active_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t size) +{ + struct typec_mode *mode = container_of(attr, struct typec_mode, + active_attr); + struct typec_port *port = typec_altmode2port(mode->alt_mode); + bool activate; + int ret; + + if (!port->cap->activate_mode) + return -EOPNOTSUPP; + + ret = kstrtobool(buf, &activate); + if (ret) + return ret; + + ret = port->cap->activate_mode(port->cap, mode->index, activate); + if (ret) + return ret; + + return size; +} + +static ssize_t +typec_altmode_roles_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct typec_mode *mode = container_of(attr, struct typec_mode, + roles_attr); + ssize_t ret; + + switch (mode->roles) { + case TYPEC_PORT_DFP: + ret = sprintf(buf, "source\n"); + break; + case TYPEC_PORT_UFP: + ret = sprintf(buf, "sink\n"); + break; + case TYPEC_PORT_DRP: + default: + ret = sprintf(buf, "source sink\n"); + break; + } + return ret; +} + +static void typec_init_modes(struct typec_altmode *alt, + struct typec_mode_desc *desc, bool is_port) +{ + int i; + + for (i = 0; i < alt->n_modes; i++, desc++) { + struct typec_mode *mode = &alt->modes[i]; + + /* Not considering the human readable description critical */ + mode->desc = kstrdup(desc->desc, GFP_KERNEL); + if (desc->desc && !mode->desc) + dev_err(&alt->dev, "failed to copy mode%d desc\n", i); + + mode->alt_mode = alt; + mode->vdo = desc->vdo; + mode->roles = desc->roles; + mode->index = desc->index; + sprintf(mode->group_name, "mode%d", desc->index); + + sysfs_attr_init(&mode->vdo_attr.attr); + mode->vdo_attr.attr.name = "vdo"; + mode->vdo_attr.attr.mode = 0444; + mode->vdo_attr.show = typec_altmode_vdo_show; + + sysfs_attr_init(&mode->desc_attr.attr); + mode->desc_attr.attr.name = "description"; + mode->desc_attr.attr.mode = 0444; + mode->desc_attr.show = typec_altmode_desc_show; + + sysfs_attr_init(&mode->active_attr.attr); + mode->active_attr.attr.name = "active"; + mode->active_attr.attr.mode = 0644; + mode->active_attr.show = typec_altmode_active_show; + mode->active_attr.store = typec_altmode_active_store; + + mode->attrs[0] = &mode->vdo_attr.attr; + mode->attrs[1] = &mode->desc_attr.attr; + mode->attrs[2] = &mode->active_attr.attr; + + /* With ports, list the roles that the mode is supported with */ + if (is_port) { + sysfs_attr_init(&mode->roles_attr.attr); + mode->roles_attr.attr.name = "supported_roles"; + mode->roles_attr.attr.mode = 0444; + mode->roles_attr.show = typec_altmode_roles_show; + + mode->attrs[3] = &mode->roles_attr.attr; + } + + mode->group.attrs = mode->attrs; + mode->group.name = mode->group_name; + + alt->mode_groups[i] = &mode->group; + } +} + +static ssize_t svid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct typec_altmode *alt = to_altmode(dev); + + return sprintf(buf, "%04x\n", alt->svid); +} +static DEVICE_ATTR_RO(svid); + +static struct attribute *typec_altmode_attrs[] = { + &dev_attr_svid.attr, + NULL +}; +ATTRIBUTE_GROUPS(typec_altmode); + +static void typec_altmode_release(struct device *dev) +{ + struct typec_altmode *alt = to_altmode(dev); + int i; + + for (i = 0; i < alt->n_modes; i++) + kfree(alt->modes[i].desc); + kfree(alt); +} + +static const struct device_type typec_altmode_dev_type = { + .name = "typec_alternate_mode", + .groups = typec_altmode_groups, + .release = typec_altmode_release, +}; + +static struct typec_altmode * +typec_register_altmode(struct device *parent, struct 
typec_altmode_desc *desc) +{ + struct typec_altmode *alt; + int ret; + + alt = kzalloc(sizeof(*alt), GFP_KERNEL); + if (!alt) + return NULL; + + alt->svid = desc->svid; + alt->n_modes = desc->n_modes; + typec_init_modes(alt, desc->modes, is_typec_port(parent)); + + alt->dev.parent = parent; + alt->dev.groups = alt->mode_groups; + alt->dev.type = &typec_altmode_dev_type; + dev_set_name(&alt->dev, "svid-%04x", alt->svid); + + ret = device_register(&alt->dev); + if (ret) { + dev_err(parent, "failed to register alternate mode (%d)\n", + ret); + put_device(&alt->dev); + return NULL; + } + + return alt; +} + +/** + * typec_unregister_altmode - Unregister Alternate Mode + * @alt: The alternate mode to be unregistered + * + * Unregister device created with typec_partner_register_altmode(), + * typec_plug_register_altmode() or typec_port_register_altmode(). + */ +void typec_unregister_altmode(struct typec_altmode *alt) +{ + if (alt) + device_unregister(&alt->dev); +} +EXPORT_SYMBOL_GPL(typec_unregister_altmode); + +/* ------------------------------------------------------------------------- */ +/* Type-C Partners */ + +static ssize_t accessory_mode_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct typec_partner *p = to_typec_partner(dev); + + return sprintf(buf, "%s\n", typec_accessory_modes[p->accessory]); +} +static DEVICE_ATTR_RO(accessory_mode); + +static ssize_t supports_usb_power_delivery_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct typec_partner *p = to_typec_partner(dev); + + return sprintf(buf, "%s\n", p->usb_pd ? "yes" : "no"); +} +static DEVICE_ATTR_RO(supports_usb_power_delivery); + +static struct attribute *typec_partner_attrs[] = { + &dev_attr_accessory_mode.attr, + &dev_attr_supports_usb_power_delivery.attr, + NULL +}; +ATTRIBUTE_GROUPS(typec_partner); + +static void typec_partner_release(struct device *dev) +{ + struct typec_partner *partner = to_typec_partner(dev); + + kfree(partner); +} + +static const struct device_type typec_partner_dev_type = { + .name = "typec_partner", + .groups = typec_partner_groups, + .release = typec_partner_release, +}; + +/** + * typec_partner_set_identity - Report result from Discover Identity command + * @partner: The partner updated identity values + * + * This routine is used to report that the result of Discover Identity USB power + * delivery command has become available. + */ +int typec_partner_set_identity(struct typec_partner *partner) +{ + if (!partner->identity) + return -EINVAL; + + typec_report_identity(&partner->dev); + return 0; +} +EXPORT_SYMBOL_GPL(typec_partner_set_identity); + +/** + * typec_partner_register_altmode - Register USB Type-C Partner Alternate Mode + * @partner: USB Type-C Partner that supports the alternate mode + * @desc: Description of the alternate mode + * + * This routine is used to register each alternate mode individually that + * @partner has listed in response to Discover SVIDs command. The modes for a + * SVID listed in response to Discover Modes command need to be listed in an + * array in @desc. + * + * Returns handle to the alternate mode on success or NULL on failure. 
+ */ +struct typec_altmode * +typec_partner_register_altmode(struct typec_partner *partner, + struct typec_altmode_desc *desc) +{ + return typec_register_altmode(&partner->dev, desc); +} +EXPORT_SYMBOL_GPL(typec_partner_register_altmode); + +/** + * typec_register_partner - Register a USB Type-C Partner + * @port: The USB Type-C Port the partner is connected to + * @desc: Description of the partner + * + * Registers a device for USB Type-C Partner described in @desc. + * + * Returns handle to the partner on success or NULL on failure. + */ +struct typec_partner *typec_register_partner(struct typec_port *port, + struct typec_partner_desc *desc) +{ + struct typec_partner *partner; + int ret; + + partner = kzalloc(sizeof(*partner), GFP_KERNEL); + if (!partner) + return NULL; + + partner->usb_pd = desc->usb_pd; + partner->accessory = desc->accessory; + + if (desc->identity) { + /* + * Creating directory for the identity only if the driver is + * able to provide data to it. + */ + partner->dev.groups = usb_pd_id_groups; + partner->identity = desc->identity; + } + + partner->dev.class = typec_class; + partner->dev.parent = &port->dev; + partner->dev.type = &typec_partner_dev_type; + dev_set_name(&partner->dev, "%s-partner", dev_name(&port->dev)); + + ret = device_register(&partner->dev); + if (ret) { + dev_err(&port->dev, "failed to register partner (%d)\n", ret); + put_device(&partner->dev); + return NULL; + } + + return partner; +} +EXPORT_SYMBOL_GPL(typec_register_partner); + +/** + * typec_unregister_partner - Unregister a USB Type-C Partner + * @partner: The partner to be unregistered + * + * Unregister device created with typec_register_partner(). + */ +void typec_unregister_partner(struct typec_partner *partner) +{ + if (partner) + device_unregister(&partner->dev); +} +EXPORT_SYMBOL_GPL(typec_unregister_partner); + +/* ------------------------------------------------------------------------- */ +/* Type-C Cable Plugs */ + +static void typec_plug_release(struct device *dev) +{ + struct typec_plug *plug = to_typec_plug(dev); + + kfree(plug); +} + +static const struct device_type typec_plug_dev_type = { + .name = "typec_plug", + .release = typec_plug_release, +}; + +/** + * typec_plug_register_altmode - Register USB Type-C Cable Plug Alternate Mode + * @plug: USB Type-C Cable Plug that supports the alternate mode + * @desc: Description of the alternate mode + * + * This routine is used to register each alternate mode individually that @plug + * has listed in response to Discover SVIDs command. The modes for a SVID that + * the plug lists in response to Discover Modes command need to be listed in an + * array in @desc. + * + * Returns handle to the alternate mode on success or NULL on failure. + */ +struct typec_altmode * +typec_plug_register_altmode(struct typec_plug *plug, + struct typec_altmode_desc *desc) +{ + return typec_register_altmode(&plug->dev, desc); +} +EXPORT_SYMBOL_GPL(typec_plug_register_altmode); + +/** + * typec_register_plug - Register a USB Type-C Cable Plug + * @cable: USB Type-C Cable with the plug + * @desc: Description of the cable plug + * + * Registers a device for USB Type-C Cable Plug described in @desc. A USB Type-C + * Cable Plug represents a plug with electronics in it that can response to USB + * Power Delivery SOP Prime or SOP Double Prime packages. + * + * Returns handle to the cable plug on success or NULL on failure. 
+ */ +struct typec_plug *typec_register_plug(struct typec_cable *cable, + struct typec_plug_desc *desc) +{ + struct typec_plug *plug; + char name[8]; + int ret; + + plug = kzalloc(sizeof(*plug), GFP_KERNEL); + if (!plug) + return NULL; + + sprintf(name, "plug%d", desc->index); + + plug->index = desc->index; + plug->dev.class = typec_class; + plug->dev.parent = &cable->dev; + plug->dev.type = &typec_plug_dev_type; + dev_set_name(&plug->dev, "%s-%s", dev_name(cable->dev.parent), name); + + ret = device_register(&plug->dev); + if (ret) { + dev_err(&cable->dev, "failed to register plug (%d)\n", ret); + put_device(&plug->dev); + return NULL; + } + + return plug; +} +EXPORT_SYMBOL_GPL(typec_register_plug); + +/** + * typec_unregister_plug - Unregister a USB Type-C Cable Plug + * @plug: The cable plug to be unregistered + * + * Unregister device created with typec_register_plug(). + */ +void typec_unregister_plug(struct typec_plug *plug) +{ + if (plug) + device_unregister(&plug->dev); +} +EXPORT_SYMBOL_GPL(typec_unregister_plug); + +/* Type-C Cables */ + +static ssize_t +type_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct typec_cable *cable = to_typec_cable(dev); + + return sprintf(buf, "%s\n", cable->active ? "active" : "passive"); +} +static DEVICE_ATTR_RO(type); + +static const char * const typec_plug_types[] = { + [USB_PLUG_NONE] = "unknown", + [USB_PLUG_TYPE_A] = "type-a", + [USB_PLUG_TYPE_B] = "type-b", + [USB_PLUG_TYPE_C] = "type-c", + [USB_PLUG_CAPTIVE] = "captive", +}; + +static ssize_t plug_type_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct typec_cable *cable = to_typec_cable(dev); + + return sprintf(buf, "%s\n", typec_plug_types[cable->type]); +} +static DEVICE_ATTR_RO(plug_type); + +static struct attribute *typec_cable_attrs[] = { + &dev_attr_type.attr, + &dev_attr_plug_type.attr, + NULL +}; +ATTRIBUTE_GROUPS(typec_cable); + +static void typec_cable_release(struct device *dev) +{ + struct typec_cable *cable = to_typec_cable(dev); + + kfree(cable); +} + +static const struct device_type typec_cable_dev_type = { + .name = "typec_cable", + .groups = typec_cable_groups, + .release = typec_cable_release, +}; + +/** + * typec_cable_set_identity - Report result from Discover Identity command + * @cable: The cable updated identity values + * + * This routine is used to report that the result of Discover Identity USB power + * delivery command has become available. + */ +int typec_cable_set_identity(struct typec_cable *cable) +{ + if (!cable->identity) + return -EINVAL; + + typec_report_identity(&cable->dev); + return 0; +} +EXPORT_SYMBOL_GPL(typec_cable_set_identity); + +/** + * typec_register_cable - Register a USB Type-C Cable + * @port: The USB Type-C Port the cable is connected to + * @desc: Description of the cable + * + * Registers a device for USB Type-C Cable described in @desc. The cable will be + * parent for the optional cable plug devises. + * + * Returns handle to the cable on success or NULL on failure. + */ +struct typec_cable *typec_register_cable(struct typec_port *port, + struct typec_cable_desc *desc) +{ + struct typec_cable *cable; + int ret; + + cable = kzalloc(sizeof(*cable), GFP_KERNEL); + if (!cable) + return NULL; + + cable->type = desc->type; + cable->active = desc->active; + + if (desc->identity) { + /* + * Creating directory for the identity only if the driver is + * able to provide data to it. 
+ */ + cable->dev.groups = usb_pd_id_groups; + cable->identity = desc->identity; + } + + cable->dev.class = typec_class; + cable->dev.parent = &port->dev; + cable->dev.type = &typec_cable_dev_type; + dev_set_name(&cable->dev, "%s-cable", dev_name(&port->dev)); + + ret = device_register(&cable->dev); + if (ret) { + dev_err(&port->dev, "failed to register cable (%d)\n", ret); + put_device(&cable->dev); + return NULL; + } + + return cable; +} +EXPORT_SYMBOL_GPL(typec_register_cable); + +/** + * typec_unregister_cable - Unregister a USB Type-C Cable + * @cable: The cable to be unregistered + * + * Unregister device created with typec_register_cable(). + */ +void typec_unregister_cable(struct typec_cable *cable) +{ + if (cable) + device_unregister(&cable->dev); +} +EXPORT_SYMBOL_GPL(typec_unregister_cable); + +/* ------------------------------------------------------------------------- */ +/* USB Type-C ports */ + +static const char * const typec_roles[] = { + [TYPEC_SINK] = "sink", + [TYPEC_SOURCE] = "source", +}; + +static const char * const typec_data_roles[] = { + [TYPEC_DEVICE] = "device", + [TYPEC_HOST] = "host", +}; + +static ssize_t +preferred_role_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t size) +{ + struct typec_port *port = to_typec_port(dev); + int role; + int ret; + + if (port->cap->type != TYPEC_PORT_DRP) { + dev_dbg(dev, "Preferred role only supported with DRP ports\n"); + return -EOPNOTSUPP; + } + + if (!port->cap->try_role) { + dev_dbg(dev, "Setting preferred role not supported\n"); + return -EOPNOTSUPP; + } + + role = sysfs_match_string(typec_roles, buf); + if (role < 0) { + if (sysfs_streq(buf, "none")) + role = TYPEC_NO_PREFERRED_ROLE; + else + return -EINVAL; + } + + ret = port->cap->try_role(port->cap, role); + if (ret) + return ret; + + port->prefer_role = role; + return size; +} + +static ssize_t +preferred_role_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct typec_port *port = to_typec_port(dev); + + if (port->cap->type != TYPEC_PORT_DRP) + return 0; + + if (port->prefer_role < 0) + return 0; + + return sprintf(buf, "%s\n", typec_roles[port->prefer_role]); +} +static DEVICE_ATTR_RW(preferred_role); + +static ssize_t data_role_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + struct typec_port *port = to_typec_port(dev); + int ret; + + if (port->cap->type != TYPEC_PORT_DRP) { + dev_dbg(dev, "data role swap only supported with DRP ports\n"); + return -EOPNOTSUPP; + } + + if (!port->cap->dr_set) { + dev_dbg(dev, "data role swapping not supported\n"); + return -EOPNOTSUPP; + } + + ret = sysfs_match_string(typec_data_roles, buf); + if (ret < 0) + return ret; + + ret = port->cap->dr_set(port->cap, ret); + if (ret) + return ret; + + return size; +} + +static ssize_t data_role_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct typec_port *port = to_typec_port(dev); + + if (port->cap->type == TYPEC_PORT_DRP) + return sprintf(buf, "%s\n", port->data_role == TYPEC_HOST ? 
+ "[host] device" : "host [device]"); + + return sprintf(buf, "[%s]\n", typec_data_roles[port->data_role]); +} +static DEVICE_ATTR_RW(data_role); + +static ssize_t power_role_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + struct typec_port *port = to_typec_port(dev); + int ret = size; + + if (!port->cap->pd_revision) { + dev_dbg(dev, "USB Power Delivery not supported\n"); + return -EOPNOTSUPP; + } + + if (!port->cap->pr_set) { + dev_dbg(dev, "power role swapping not supported\n"); + return -EOPNOTSUPP; + } + + if (port->pwr_opmode != TYPEC_PWR_MODE_PD) { + dev_dbg(dev, "partner unable to swap power role\n"); + return -EIO; + } + + ret = sysfs_match_string(typec_roles, buf); + if (ret < 0) + return ret; + + ret = port->cap->pr_set(port->cap, ret); + if (ret) + return ret; + + return size; +} + +static ssize_t power_role_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct typec_port *port = to_typec_port(dev); + + if (port->cap->type == TYPEC_PORT_DRP) + return sprintf(buf, "%s\n", port->pwr_role == TYPEC_SOURCE ? + "[source] sink" : "source [sink]"); + + return sprintf(buf, "[%s]\n", typec_roles[port->pwr_role]); +} +static DEVICE_ATTR_RW(power_role); + +static const char * const typec_pwr_opmodes[] = { + [TYPEC_PWR_MODE_USB] = "default", + [TYPEC_PWR_MODE_1_5A] = "1.5A", + [TYPEC_PWR_MODE_3_0A] = "3.0A", + [TYPEC_PWR_MODE_PD] = "usb_power_delivery", +}; + +static ssize_t power_operation_mode_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct typec_port *port = to_typec_port(dev); + + return sprintf(buf, "%s\n", typec_pwr_opmodes[port->pwr_opmode]); +} +static DEVICE_ATTR_RO(power_operation_mode); + +static ssize_t vconn_source_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + struct typec_port *port = to_typec_port(dev); + bool source; + int ret; + + if (!port->cap->pd_revision) { + dev_dbg(dev, "VCONN swap depends on USB Power Delivery\n"); + return -EOPNOTSUPP; + } + + if (!port->cap->vconn_set) { + dev_dbg(dev, "VCONN swapping not supported\n"); + return -EOPNOTSUPP; + } + + ret = kstrtobool(buf, &source); + if (ret) + return ret; + + ret = port->cap->vconn_set(port->cap, (enum typec_role)source); + if (ret) + return ret; + + return size; +} + +static ssize_t vconn_source_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct typec_port *port = to_typec_port(dev); + + return sprintf(buf, "%s\n", + port->vconn_role == TYPEC_SOURCE ? 
"yes" : "no"); +} +static DEVICE_ATTR_RW(vconn_source); + +static ssize_t supported_accessory_modes_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct typec_port *port = to_typec_port(dev); + ssize_t ret = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(port->cap->accessory); i++) { + if (port->cap->accessory[i]) + ret += sprintf(buf + ret, "%s ", + typec_accessory_modes[port->cap->accessory[i]]); + } + + if (!ret) + return sprintf(buf, "none\n"); + + buf[ret - 1] = '\n'; + + return ret; +} +static DEVICE_ATTR_RO(supported_accessory_modes); + +static ssize_t usb_typec_revision_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct typec_port *port = to_typec_port(dev); + u16 rev = port->cap->revision; + + return sprintf(buf, "%d.%d\n", (rev >> 8) & 0xff, (rev >> 4) & 0xf); +} +static DEVICE_ATTR_RO(usb_typec_revision); + +static ssize_t usb_power_delivery_revision_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct typec_port *p = to_typec_port(dev); + + return sprintf(buf, "%d\n", (p->cap->pd_revision >> 8) & 0xff); +} +static DEVICE_ATTR_RO(usb_power_delivery_revision); + +static struct attribute *typec_attrs[] = { + &dev_attr_data_role.attr, + &dev_attr_power_operation_mode.attr, + &dev_attr_power_role.attr, + &dev_attr_preferred_role.attr, + &dev_attr_supported_accessory_modes.attr, + &dev_attr_usb_power_delivery_revision.attr, + &dev_attr_usb_typec_revision.attr, + &dev_attr_vconn_source.attr, + NULL, +}; +ATTRIBUTE_GROUPS(typec); + +static int typec_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + int ret; + + ret = add_uevent_var(env, "TYPEC_PORT=%s", dev_name(dev)); + if (ret) + dev_err(dev, "failed to add uevent TYPEC_PORT\n"); + + return ret; +} + +static void typec_release(struct device *dev) +{ + struct typec_port *port = to_typec_port(dev); + + ida_simple_remove(&typec_index_ida, port->id); + kfree(port); +} + +static const struct device_type typec_port_dev_type = { + .name = "typec_port", + .groups = typec_groups, + .uevent = typec_uevent, + .release = typec_release, +}; + +/* --------------------------------------- */ +/* Driver callbacks to report role updates */ + +/** + * typec_set_data_role - Report data role change + * @port: The USB Type-C Port where the role was changed + * @role: The new data role + * + * This routine is used by the port drivers to report data role changes. + */ +void typec_set_data_role(struct typec_port *port, enum typec_data_role role) +{ + if (port->data_role == role) + return; + + port->data_role = role; + sysfs_notify(&port->dev.kobj, NULL, "data_role"); + kobject_uevent(&port->dev.kobj, KOBJ_CHANGE); +} +EXPORT_SYMBOL_GPL(typec_set_data_role); + +/** + * typec_set_pwr_role - Report power role change + * @port: The USB Type-C Port where the role was changed + * @role: The new data role + * + * This routine is used by the port drivers to report power role changes. + */ +void typec_set_pwr_role(struct typec_port *port, enum typec_role role) +{ + if (port->pwr_role == role) + return; + + port->pwr_role = role; + sysfs_notify(&port->dev.kobj, NULL, "power_role"); + kobject_uevent(&port->dev.kobj, KOBJ_CHANGE); +} +EXPORT_SYMBOL_GPL(typec_set_pwr_role); + +/** + * typec_set_pwr_role - Report VCONN source change + * @port: The USB Type-C Port which VCONN role changed + * @role: Source when @port is sourcing VCONN, or Sink when it's not + * + * This routine is used by the port drivers to report if the VCONN source is + * changes. 
+ */ +void typec_set_vconn_role(struct typec_port *port, enum typec_role role) +{ + if (port->vconn_role == role) + return; + + port->vconn_role = role; + sysfs_notify(&port->dev.kobj, NULL, "vconn_source"); + kobject_uevent(&port->dev.kobj, KOBJ_CHANGE); +} +EXPORT_SYMBOL_GPL(typec_set_vconn_role); + +/** + * typec_set_pwr_opmode - Report changed power operation mode + * @port: The USB Type-C Port where the mode was changed + * @opmode: New power operation mode + * + * This routine is used by the port drivers to report changed power operation + * mode in @port. The modes are USB (default), 1.5A, 3.0A as defined in USB + * Type-C specification, and "USB Power Delivery" when the power levels are + * negotiated with methods defined in USB Power Delivery specification. + */ +void typec_set_pwr_opmode(struct typec_port *port, + enum typec_pwr_opmode opmode) +{ + if (port->pwr_opmode == opmode) + return; + + port->pwr_opmode = opmode; + sysfs_notify(&port->dev.kobj, NULL, "power_operation_mode"); + kobject_uevent(&port->dev.kobj, KOBJ_CHANGE); +} +EXPORT_SYMBOL_GPL(typec_set_pwr_opmode); + +/* --------------------------------------- */ + +/** + * typec_port_register_altmode - Register USB Type-C Port Alternate Mode + * @port: USB Type-C Port that supports the alternate mode + * @desc: Description of the alternate mode + * + * This routine is used to register an alternate mode that @port is capable of + * supporting. + * + * Returns handle to the alternate mode on success or NULL on failure. + */ +struct typec_altmode * +typec_port_register_altmode(struct typec_port *port, + struct typec_altmode_desc *desc) +{ + return typec_register_altmode(&port->dev, desc); +} +EXPORT_SYMBOL_GPL(typec_port_register_altmode); + +/** + * typec_register_port - Register a USB Type-C Port + * @parent: Parent device + * @cap: Description of the port + * + * Registers a device for USB Type-C Port described in @cap. + * + * Returns handle to the port on success or NULL on failure. + */ +struct typec_port *typec_register_port(struct device *parent, + const struct typec_capability *cap) +{ + struct typec_port *port; + int role; + int ret; + int id; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return NULL; + + id = ida_simple_get(&typec_index_ida, 0, 0, GFP_KERNEL); + if (id < 0) { + kfree(port); + return NULL; + } + + if (cap->type == TYPEC_PORT_DFP) + role = TYPEC_SOURCE; + else if (cap->type == TYPEC_PORT_UFP) + role = TYPEC_SINK; + else + role = cap->prefer_role; + + if (role == TYPEC_SOURCE) { + port->data_role = TYPEC_HOST; + port->pwr_role = TYPEC_SOURCE; + port->vconn_role = TYPEC_SOURCE; + } else { + port->data_role = TYPEC_DEVICE; + port->pwr_role = TYPEC_SINK; + port->vconn_role = TYPEC_SINK; + } + + port->id = id; + port->cap = cap; + port->prefer_role = cap->prefer_role; + + port->dev.class = typec_class; + port->dev.parent = parent; + port->dev.fwnode = cap->fwnode; + port->dev.type = &typec_port_dev_type; + dev_set_name(&port->dev, "port%d", id); + + ret = device_register(&port->dev); + if (ret) { + dev_err(parent, "failed to register port (%d)\n", ret); + put_device(&port->dev); + return NULL; + } + + return port; +} +EXPORT_SYMBOL_GPL(typec_register_port); + +/** + * typec_unregister_port - Unregister a USB Type-C Port + * @port: The port to be unregistered + * + * Unregister device created with typec_register_port(). 
+ */ +void typec_unregister_port(struct typec_port *port) +{ + if (port) + device_unregister(&port->dev); +} +EXPORT_SYMBOL_GPL(typec_unregister_port); + +static int __init typec_init(void) +{ + typec_class = class_create(THIS_MODULE, "typec"); + return PTR_ERR_OR_ZERO(typec_class); +} +subsys_initcall(typec_init); + +static void __exit typec_exit(void) +{ + class_destroy(typec_class); + ida_destroy(&typec_index_ida); +} +module_exit(typec_exit); + +MODULE_AUTHOR("Heikki Krogerus "); +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("USB Type-C Connector Class"); diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h new file mode 100644 index 000000000000..ec78204964ab --- /dev/null +++ b/include/linux/usb/typec.h @@ -0,0 +1,243 @@ + +#ifndef __LINUX_USB_TYPEC_H +#define __LINUX_USB_TYPEC_H + +#include + +/* XXX: Once we have a header for USB Power Delivery, this belongs there */ +#define ALTMODE_MAX_MODES 6 + +/* USB Type-C Specification releases */ +#define USB_TYPEC_REV_1_0 0x100 /* 1.0 */ +#define USB_TYPEC_REV_1_1 0x110 /* 1.1 */ +#define USB_TYPEC_REV_1_2 0x120 /* 1.2 */ + +struct typec_altmode; +struct typec_partner; +struct typec_cable; +struct typec_plug; +struct typec_port; + +struct fwnode_handle; + +enum typec_port_type { + TYPEC_PORT_DFP, + TYPEC_PORT_UFP, + TYPEC_PORT_DRP, +}; + +enum typec_plug_type { + USB_PLUG_NONE, + USB_PLUG_TYPE_A, + USB_PLUG_TYPE_B, + USB_PLUG_TYPE_C, + USB_PLUG_CAPTIVE, +}; + +enum typec_data_role { + TYPEC_DEVICE, + TYPEC_HOST, +}; + +enum typec_role { + TYPEC_SINK, + TYPEC_SOURCE, +}; + +enum typec_pwr_opmode { + TYPEC_PWR_MODE_USB, + TYPEC_PWR_MODE_1_5A, + TYPEC_PWR_MODE_3_0A, + TYPEC_PWR_MODE_PD, +}; + +enum typec_accessory { + TYPEC_ACCESSORY_NONE, + TYPEC_ACCESSORY_AUDIO, + TYPEC_ACCESSORY_DEBUG, +}; + +#define TYPEC_MAX_ACCESSORY 3 + +/* + * struct usb_pd_identity - USB Power Delivery identity data + * @id_header: ID Header VDO + * @cert_stat: Cert Stat VDO + * @product: Product VDO + * + * USB power delivery Discover Identity command response data. + * + * REVISIT: This is USB Power Delivery specific information, so this structure + * probable belongs to USB Power Delivery header file once we have them. + */ +struct usb_pd_identity { + u32 id_header; + u32 cert_stat; + u32 product; +}; + +int typec_partner_set_identity(struct typec_partner *partner); +int typec_cable_set_identity(struct typec_cable *cable); + +/* + * struct typec_mode_desc - Individual Mode of an Alternate Mode + * @index: Index of the Mode within the SVID + * @vdo: VDO returned by Discover Modes USB PD command + * @desc: Optional human readable description of the mode + * @roles: Only for ports. DRP if the mode is available in both roles + * + * Description of a mode of an Alternate Mode which a connector, cable plug or + * partner supports. Every mode will have it's own sysfs group. The details are + * the VDO returned by discover modes command, description for the mode and + * active flag telling has the mode being entered or not. + */ +struct typec_mode_desc { + int index; + u32 vdo; + char *desc; + /* Only used with ports */ + enum typec_port_type roles; +}; + +/* + * struct typec_altmode_desc - USB Type-C Alternate Mode Descriptor + * @svid: Standard or Vendor ID + * @n_modes: Number of modes + * @modes: Array of modes supported by the Alternate Mode + * + * Representation of an Alternate Mode that has SVID assigned by USB-IF. The + * array of modes will list the modes of a particular SVID that are supported by + * a connector, partner of a cable plug. 
+ */ +struct typec_altmode_desc { + u16 svid; + int n_modes; + struct typec_mode_desc modes[ALTMODE_MAX_MODES]; +}; + +struct typec_altmode +*typec_partner_register_altmode(struct typec_partner *partner, + struct typec_altmode_desc *desc); +struct typec_altmode +*typec_plug_register_altmode(struct typec_plug *plug, + struct typec_altmode_desc *desc); +struct typec_altmode +*typec_port_register_altmode(struct typec_port *port, + struct typec_altmode_desc *desc); +void typec_unregister_altmode(struct typec_altmode *altmode); + +struct typec_port *typec_altmode2port(struct typec_altmode *alt); + +void typec_altmode_update_active(struct typec_altmode *alt, int mode, + bool active); + +enum typec_plug_index { + TYPEC_PLUG_SOP_P, + TYPEC_PLUG_SOP_PP, +}; + +/* + * struct typec_plug_desc - USB Type-C Cable Plug Descriptor + * @index: SOP Prime for the plug connected to DFP and SOP Double Prime for the + * plug connected to UFP + * + * Represents USB Type-C Cable Plug. + */ +struct typec_plug_desc { + enum typec_plug_index index; +}; + +/* + * struct typec_cable_desc - USB Type-C Cable Descriptor + * @type: The plug type from USB PD Cable VDO + * @active: Is the cable active or passive + * @identity: Result of Discover Identity command + * + * Represents USB Type-C Cable attached to USB Type-C port. + */ +struct typec_cable_desc { + enum typec_plug_type type; + unsigned int active:1; + struct usb_pd_identity *identity; +}; + +/* + * struct typec_partner_desc - USB Type-C Partner Descriptor + * @usb_pd: USB Power Delivery support + * @accessory: Audio, Debug or none. + * @identity: Discover Identity command data + * + * Details about a partner that is attached to USB Type-C port. If @identity + * member exists when partner is registered, a directory named "identity" is + * created to sysfs for the partner device. + */ +struct typec_partner_desc { + unsigned int usb_pd:1; + enum typec_accessory accessory; + struct usb_pd_identity *identity; +}; + +/* + * struct typec_capability - USB Type-C Port Capabilities + * @role: DFP (Host-only), UFP (Device-only) or DRP (Dual Role) + * @revision: USB Type-C Specification release. Binary coded decimal + * @pd_revision: USB Power Delivery Specification revision if supported + * @prefer_role: Initial role preference + * @accessory: Supported Accessory Modes + * @fwnode: Optional fwnode of the port + * @try_role: Set data role preference for DRP port + * @dr_set: Set Data Role + * @pr_set: Set Power Role + * @vconn_set: Set VCONN Role + * @activate_mode: Enter/exit given Alternate Mode + * + * Static capabilities of a single USB Type-C port. + */ +struct typec_capability { + enum typec_port_type type; + u16 revision; /* 0120H = "1.2" */ + u16 pd_revision; /* 0300H = "3.0" */ + int prefer_role; + enum typec_accessory accessory[TYPEC_MAX_ACCESSORY]; + + struct fwnode_handle *fwnode; + + int (*try_role)(const struct typec_capability *, + int role); + + int (*dr_set)(const struct typec_capability *, + enum typec_data_role); + int (*pr_set)(const struct typec_capability *, + enum typec_role); + int (*vconn_set)(const struct typec_capability *, + enum typec_role); + + int (*activate_mode)(const struct typec_capability *, + int mode, int activate); +}; + +/* Specific to try_role(). Indicates the user want's to clear the preference. 
*/ +#define TYPEC_NO_PREFERRED_ROLE (-1) + +struct typec_port *typec_register_port(struct device *parent, + const struct typec_capability *cap); +void typec_unregister_port(struct typec_port *port); + +struct typec_partner *typec_register_partner(struct typec_port *port, + struct typec_partner_desc *desc); +void typec_unregister_partner(struct typec_partner *partner); + +struct typec_cable *typec_register_cable(struct typec_port *port, + struct typec_cable_desc *desc); +void typec_unregister_cable(struct typec_cable *cable); + +struct typec_plug *typec_register_plug(struct typec_cable *cable, + struct typec_plug_desc *desc); +void typec_unregister_plug(struct typec_plug *plug); + +void typec_set_data_role(struct typec_port *port, enum typec_data_role role); +void typec_set_pwr_role(struct typec_port *port, enum typec_role role); +void typec_set_vconn_role(struct typec_port *port, enum typec_role role); +void typec_set_pwr_opmode(struct typec_port *port, enum typec_pwr_opmode mode); + +#endif /* __LINUX_USB_TYPEC_H */ -- cgit v1.2.3 From 66a359390e7e34f9a4c489467234b107b3d76169 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 17 Mar 2017 11:35:30 +0100 Subject: USB: core: add helpers to retrieve endpoints Many USB drivers iterate over the available endpoints to find required endpoints of a specific type and direction. Typically the endpoints are required for proper function and a missing endpoint should abort probe. To facilitate code reuse, add a helper to retrieve common endpoints (bulk or interrupt, in or out) and four wrappers to find a single endpoint. Note that the helpers are marked as __must_check to serve as a reminder to always verify that all expected endpoints are indeed present. This also means that any optional endpoints, typically need to be looked up through separate calls. Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/usb.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/usb.h | 35 +++++++++++++++++++++ 2 files changed, 118 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index 4cd6e0e4b66d..5d65504770f5 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -75,6 +75,89 @@ MODULE_PARM_DESC(autosuspend, "default autosuspend delay"); #endif +/** + * usb_find_common_endpoints() -- look up common endpoint descriptors + * @alt: alternate setting to search + * @bulk_in: pointer to descriptor pointer, or NULL + * @bulk_out: pointer to descriptor pointer, or NULL + * @int_in: pointer to descriptor pointer, or NULL + * @int_out: pointer to descriptor pointer, or NULL + * + * Search the alternate setting's endpoint descriptors for the first bulk-in, + * bulk-out, interrupt-in and interrupt-out endpoints and return them in the + * provided pointers (unless they are NULL). + * + * If a requested endpoint is not found, the corresponding pointer is set to + * NULL. + * + * Return: Zero if all requested descriptors were found, or -ENXIO otherwise. 
+ */ +int usb_find_common_endpoints(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **bulk_in, + struct usb_endpoint_descriptor **bulk_out, + struct usb_endpoint_descriptor **int_in, + struct usb_endpoint_descriptor **int_out) +{ + struct usb_endpoint_descriptor *epd; + int i; + + if (bulk_in) + *bulk_in = NULL; + if (bulk_out) + *bulk_out = NULL; + if (int_in) + *int_in = NULL; + if (int_out) + *int_out = NULL; + + for (i = 0; i < alt->desc.bNumEndpoints; ++i) { + epd = &alt->endpoint[i].desc; + + switch (usb_endpoint_type(epd)) { + case USB_ENDPOINT_XFER_BULK: + if (usb_endpoint_dir_in(epd)) { + if (bulk_in && !*bulk_in) { + *bulk_in = epd; + break; + } + } else { + if (bulk_out && !*bulk_out) { + *bulk_out = epd; + break; + } + } + + continue; + case USB_ENDPOINT_XFER_INT: + if (usb_endpoint_dir_in(epd)) { + if (int_in && !*int_in) { + *int_in = epd; + break; + } + } else { + if (int_out && !*int_out) { + *int_out = epd; + break; + } + } + + continue; + default: + continue; + } + + if ((!bulk_in || *bulk_in) && + (!bulk_out || *bulk_out) && + (!int_in || *int_in) && + (!int_out || *int_out)) { + return 0; + } + } + + return -ENXIO; +} +EXPORT_SYMBOL_GPL(usb_find_common_endpoints); + /** * usb_find_alt_setting() - Given a configuration, find the alternate setting * for the given interface. diff --git a/include/linux/usb.h b/include/linux/usb.h index 148752640693..7041cc950737 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -99,6 +99,41 @@ enum usb_interface_condition { USB_INTERFACE_UNBINDING, }; +int __must_check +usb_find_common_endpoints(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **bulk_in, + struct usb_endpoint_descriptor **bulk_out, + struct usb_endpoint_descriptor **int_in, + struct usb_endpoint_descriptor **int_out); + +static inline int __must_check +usb_find_bulk_in_endpoint(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **bulk_in) +{ + return usb_find_common_endpoints(alt, bulk_in, NULL, NULL, NULL); +} + +static inline int __must_check +usb_find_bulk_out_endpoint(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **bulk_out) +{ + return usb_find_common_endpoints(alt, NULL, bulk_out, NULL, NULL); +} + +static inline int __must_check +usb_find_int_in_endpoint(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **int_in) +{ + return usb_find_common_endpoints(alt, NULL, NULL, int_in, NULL); +} + +static inline int __must_check +usb_find_int_out_endpoint(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **int_out) +{ + return usb_find_common_endpoints(alt, NULL, NULL, NULL, int_out); +} + /** * struct usb_interface - what usb device drivers talk to * @altsetting: array of interface structures, one for each alternate -- cgit v1.2.3 From 279daf4e053470f22c9421a4ab05f8e5a9e9eeec Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 17 Mar 2017 11:35:31 +0100 Subject: USB: core: add helpers to retrieve endpoints in reverse order Several drivers have implemented their endpoint look-up loops in such a way that they have picked the last endpoint descriptor of the specified type should more than one such descriptor exist. To avoid any regressions, add corresponding helpers to lookup endpoints by searching the endpoint descriptors in reverse order. 
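As an illustration of the intended use, a probe routine that requires one
bulk-in and one bulk-out endpoint might look roughly like the sketch below;
my_probe() is a hypothetical driver function, not code from any real driver:

	static int my_probe(struct usb_interface *intf,
			    const struct usb_device_id *id)
	{
		struct usb_host_interface *alt = intf->cur_altsetting;
		struct usb_endpoint_descriptor *bulk_in;
		struct usb_endpoint_descriptor *bulk_out;
		int ret;

		/* Abort probe unless both required endpoints are present. */
		ret = usb_find_common_endpoints(alt, &bulk_in, &bulk_out,
						NULL, NULL);
		if (ret)
			return ret;

		/*
		 * bulk_in and bulk_out now point at the first matching
		 * endpoint descriptors of the alternate setting.
		 */
		return 0;
	}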
Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/usb.c | 112 +++++++++++++++++++++++++++++++++---------------- include/linux/usb.h | 35 ++++++++++++++++ 2 files changed, 111 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index 5d65504770f5..1ec9d248781e 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -74,6 +74,48 @@ MODULE_PARM_DESC(autosuspend, "default autosuspend delay"); #define usb_autosuspend_delay 0 #endif +static bool match_endpoint(struct usb_endpoint_descriptor *epd, + struct usb_endpoint_descriptor **bulk_in, + struct usb_endpoint_descriptor **bulk_out, + struct usb_endpoint_descriptor **int_in, + struct usb_endpoint_descriptor **int_out) +{ + switch (usb_endpoint_type(epd)) { + case USB_ENDPOINT_XFER_BULK: + if (usb_endpoint_dir_in(epd)) { + if (bulk_in && !*bulk_in) { + *bulk_in = epd; + break; + } + } else { + if (bulk_out && !*bulk_out) { + *bulk_out = epd; + break; + } + } + + return false; + case USB_ENDPOINT_XFER_INT: + if (usb_endpoint_dir_in(epd)) { + if (int_in && !*int_in) { + *int_in = epd; + break; + } + } else { + if (int_out && !*int_out) { + *int_out = epd; + break; + } + } + + return false; + default: + return false; + } + + return (!bulk_in || *bulk_in) && (!bulk_out || *bulk_out) && + (!int_in || *int_in) && (!int_out || *int_out); +} /** * usb_find_common_endpoints() -- look up common endpoint descriptors @@ -113,50 +155,48 @@ int usb_find_common_endpoints(struct usb_host_interface *alt, for (i = 0; i < alt->desc.bNumEndpoints; ++i) { epd = &alt->endpoint[i].desc; - switch (usb_endpoint_type(epd)) { - case USB_ENDPOINT_XFER_BULK: - if (usb_endpoint_dir_in(epd)) { - if (bulk_in && !*bulk_in) { - *bulk_in = epd; - break; - } - } else { - if (bulk_out && !*bulk_out) { - *bulk_out = epd; - break; - } - } + if (match_endpoint(epd, bulk_in, bulk_out, int_in, int_out)) + return 0; + } - continue; - case USB_ENDPOINT_XFER_INT: - if (usb_endpoint_dir_in(epd)) { - if (int_in && !*int_in) { - *int_in = epd; - break; - } - } else { - if (int_out && !*int_out) { - *int_out = epd; - break; - } - } + return -ENXIO; +} +EXPORT_SYMBOL_GPL(usb_find_common_endpoints); - continue; - default: - continue; - } +/** + * usb_find_common_endpoints_reverse() -- look up common endpoint descriptors + * + * Same as usb_find_common_endpoints(), but the endpoint descriptors are + * searched in reverse order (see usb_find_common_endpoints() for details). 
+ */ +int usb_find_common_endpoints_reverse(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **bulk_in, + struct usb_endpoint_descriptor **bulk_out, + struct usb_endpoint_descriptor **int_in, + struct usb_endpoint_descriptor **int_out) +{ + struct usb_endpoint_descriptor *epd; + int i; + + if (bulk_in) + *bulk_in = NULL; + if (bulk_out) + *bulk_out = NULL; + if (int_in) + *int_in = NULL; + if (int_out) + *int_out = NULL; + + for (i = alt->desc.bNumEndpoints - 1; i >= 0; --i) { + epd = &alt->endpoint[i].desc; - if ((!bulk_in || *bulk_in) && - (!bulk_out || *bulk_out) && - (!int_in || *int_in) && - (!int_out || *int_out)) { + if (match_endpoint(epd, bulk_in, bulk_out, int_in, int_out)) return 0; - } } return -ENXIO; } -EXPORT_SYMBOL_GPL(usb_find_common_endpoints); +EXPORT_SYMBOL_GPL(usb_find_common_endpoints_reverse); /** * usb_find_alt_setting() - Given a configuration, find the alternate setting diff --git a/include/linux/usb.h b/include/linux/usb.h index 7041cc950737..226557362d36 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -106,6 +106,13 @@ usb_find_common_endpoints(struct usb_host_interface *alt, struct usb_endpoint_descriptor **int_in, struct usb_endpoint_descriptor **int_out); +int __must_check +usb_find_common_endpoints_reverse(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **bulk_in, + struct usb_endpoint_descriptor **bulk_out, + struct usb_endpoint_descriptor **int_in, + struct usb_endpoint_descriptor **int_out); + static inline int __must_check usb_find_bulk_in_endpoint(struct usb_host_interface *alt, struct usb_endpoint_descriptor **bulk_in) @@ -134,6 +141,34 @@ usb_find_int_out_endpoint(struct usb_host_interface *alt, return usb_find_common_endpoints(alt, NULL, NULL, NULL, int_out); } +static inline int __must_check +usb_find_last_bulk_in_endpoint(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **bulk_in) +{ + return usb_find_common_endpoints_reverse(alt, bulk_in, NULL, NULL, NULL); +} + +static inline int __must_check +usb_find_last_bulk_out_endpoint(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **bulk_out) +{ + return usb_find_common_endpoints_reverse(alt, NULL, bulk_out, NULL, NULL); +} + +static inline int __must_check +usb_find_last_int_in_endpoint(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **int_in) +{ + return usb_find_common_endpoints_reverse(alt, NULL, NULL, int_in, NULL); +} + +static inline int __must_check +usb_find_last_int_out_endpoint(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **int_out) +{ + return usb_find_common_endpoints_reverse(alt, NULL, NULL, NULL, int_out); +} + /** * struct usb_interface - what usb device drivers talk to * @altsetting: array of interface structures, one for each alternate -- cgit v1.2.3 From 41334f54a43ab00cbb294e6a08d0f57068f43025 Mon Sep 17 00:00:00 2001 From: Marc Gonzalez Date: Fri, 17 Mar 2017 15:53:19 +0100 Subject: PCI: Include pci.h for struct pci_ops definition struct pci_ecam_ops embeds a struct pci_ops. 
Explicitly request the definition for struct pci_ops, otherwise gcc might complain: include/linux/pci-ecam.h:29:19: error: field 'pci_ops' has incomplete type Signed-off-by: Marc Gonzalez Signed-off-by: Bjorn Helgaas --- include/linux/pci-ecam.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index f0d2b9451270..b8f11d783a11 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -16,6 +16,7 @@ #ifndef DRIVERS_PCI_ECAM_H #define DRIVERS_PCI_ECAM_H +#include <linux/pci.h> #include #include -- cgit v1.2.3 From 7a88fa191944589b2ed795bbed32ca6e9e2df31f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 23 Mar 2017 13:24:55 +0300 Subject: block: make nr_iovecs unsigned in bio_alloc_bioset() There isn't a bug here, but Smatch is not smart enough to know that "nr_iovecs" can't be negative so it complains about underflows. Really, it's slightly cleaner to make this parameter unsigned. Signed-off-by: Dan Carpenter Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 3 ++- include/linux/bio.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index e75878f8b14a..6194a8cf2aab 100644 --- a/block/bio.c +++ b/block/bio.c @@ -427,7 +427,8 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * RETURNS: * Pointer to new bio on success, NULL on failure. */ -struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) +struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, + struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; unsigned front_pad; diff --git a/include/linux/bio.h b/include/linux/bio.h index 8e521194f6fc..4931756d86d9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -383,7 +383,7 @@ extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int); extern void bioset_free(struct bio_set *); extern mempool_t *biovec_create_pool(int pool_entries); -extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); +extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *); extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); -- cgit v1.2.3 From 0695bd99c0d22bef4d9d4c72cf537b722ba98531 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Mon, 6 Feb 2017 22:12:04 +0100 Subject: clockevents: Make clockevents_config() static A clockevent device's rate should be configured before or at registration and changed afterwards through clockevents_update_freq() only. For the configuration at registration, we already have clockevents_config_and_register(). Right now, there are no clockevents_config() users outside of the clockevents core. To mitigate the risk of drivers erroneously reconfiguring their rates through clockevents_config() *after* device registration, make clockevents_config() static.
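To sketch the intended pattern (my_ced and the rate variables are invented for
illustration, and clockevents_update_freq() is assumed here as the helper for
post-registration rate changes):

	/* At probe: configure the rate and register in one step. */
	clockevents_config_and_register(&my_ced, rate_hz,
					min_delta_ticks, max_delta_ticks);

	/* Later, e.g. when the input clock changes: */
	clockevents_update_freq(&my_ced, new_rate_hz);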
Signed-off-by: Nicolai Stange Signed-off-by: John Stultz --- include/linux/clockchips.h | 1 - kernel/time/clockevents.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 5d3053c34fb3..eef1569e5cd0 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -182,7 +182,6 @@ extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *e extern void clockevents_register_device(struct clock_event_device *dev); extern int clockevents_unbind_device(struct clock_event_device *ced, int cpu); -extern void clockevents_config(struct clock_event_device *dev, u32 freq); extern void clockevents_config_and_register(struct clock_event_device *dev, u32 freq, unsigned long min_delta, unsigned long max_delta); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 97ac0951f164..4237e0744e26 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -468,7 +468,7 @@ void clockevents_register_device(struct clock_event_device *dev) } EXPORT_SYMBOL_GPL(clockevents_register_device); -void clockevents_config(struct clock_event_device *dev, u32 freq) +static void clockevents_config(struct clock_event_device *dev, u32 freq) { u64 sec; -- cgit v1.2.3 From 83e007a0c6a3f4bfdf8f3f8d0fc266cda189b3d6 Mon Sep 17 00:00:00 2001 From: Carlo Caione Date: Fri, 3 Mar 2017 16:17:58 +0100 Subject: firmware: meson-sm: Check for buffer output size After the data is read by the secure monitor driver, it is copied into the output buffer checking only the size of the bounce buffer but not the size of the output buffer. Fix this in the secure monitor driver by slightly changing the API. Also fix the efuse driver, the only driver using this API, in the same patch so as not to break bisectability.
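With the changed API a caller now passes the size of its own buffer as well; a
rough sketch (the buffer size and the offset variable are invented for
illustration):

	u8 buf[16];
	int len;

	/* The new bsize argument bounds how much may be copied back. */
	len = meson_sm_call_read(buf, sizeof(buf), SM_EFUSE_READ,
				 offset, sizeof(buf), 0, 0, 0);
	if (len < 0)
		return len;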
Signed-off-by: Carlo Caione Acked-by: Srinivas Kandagatla # for nvmem Acked-by: Mark Rutland Signed-off-by: Kevin Hilman --- drivers/firmware/meson/meson_sm.c | 10 +++++++--- drivers/nvmem/meson-efuse.c | 2 +- include/linux/firmware/meson/meson_sm.h | 4 ++-- 3 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/meson/meson_sm.c b/drivers/firmware/meson/meson_sm.c index b0d254930ed3..5f30a5774e57 100644 --- a/drivers/firmware/meson/meson_sm.c +++ b/drivers/firmware/meson/meson_sm.c @@ -127,6 +127,7 @@ EXPORT_SYMBOL(meson_sm_call); * meson_sm_call_read - retrieve data from secure-monitor * * @buffer: Buffer to store the retrieved data + * @bsize: Size of the buffer * @cmd_index: Index of the SMC32 function ID * @arg0: SMC32 Argument 0 * @arg1: SMC32 Argument 1 @@ -136,8 +137,8 @@ EXPORT_SYMBOL(meson_sm_call); * * Return: size of read data on success, a negative value on error */ -int meson_sm_call_read(void *buffer, unsigned int cmd_index, u32 arg0, - u32 arg1, u32 arg2, u32 arg3, u32 arg4) +int meson_sm_call_read(void *buffer, unsigned int bsize, unsigned int cmd_index, + u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4) { u32 size; @@ -147,10 +148,13 @@ int meson_sm_call_read(void *buffer, unsigned int cmd_index, u32 arg0, if (!fw.chip->cmd_shmem_out_base) return -EINVAL; + if (bsize > fw.chip->shmem_size) + return -EINVAL; + if (meson_sm_call(cmd_index, &size, arg0, arg1, arg2, arg3, arg4) < 0) return -EINVAL; - if (!size || size > fw.chip->shmem_size) + if (!size || size > bsize) return -EINVAL; if (buffer) diff --git a/drivers/nvmem/meson-efuse.c b/drivers/nvmem/meson-efuse.c index f207c3b10482..70bfc9839bb2 100644 --- a/drivers/nvmem/meson-efuse.c +++ b/drivers/nvmem/meson-efuse.c @@ -27,7 +27,7 @@ static int meson_efuse_read(void *context, unsigned int offset, u8 *buf = val; int ret; - ret = meson_sm_call_read(buf, SM_EFUSE_READ, offset, + ret = meson_sm_call_read(buf, bytes, SM_EFUSE_READ, offset, bytes, 0, 0, 0); if (ret < 0) return ret; diff --git a/include/linux/firmware/meson/meson_sm.h b/include/linux/firmware/meson/meson_sm.h index 8e953c6f394a..37a5eaea69dd 100644 --- a/include/linux/firmware/meson/meson_sm.h +++ b/include/linux/firmware/meson/meson_sm.h @@ -25,7 +25,7 @@ int meson_sm_call(unsigned int cmd_index, u32 *ret, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4); int meson_sm_call_write(void *buffer, unsigned int b_size, unsigned int cmd_index, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4); -int meson_sm_call_read(void *buffer, unsigned int cmd_index, u32 arg0, u32 arg1, - u32 arg2, u32 arg3, u32 arg4); +int meson_sm_call_read(void *buffer, unsigned int bsize, unsigned int cmd_index, + u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4); #endif /* _MESON_SM_FW_H_ */ -- cgit v1.2.3
From 91b8270f2a4d1d9b268de90451cdca63a70052d6 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 22 Mar 2017 17:27:34 -0700 Subject: Add a helper function to get socket cookie in eBPF Retrieve the socket cookie generated by sock_gen_cookie() from a sk_buff with a known socket. Generates a new cookie if one was not yet set. If the socket pointer inside sk_buff is NULL, 0 is returned. The helper function could be useful in monitoring per-socket networking traffic statistics and in providing a unique socket identifier per namespace. Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Willem de Bruijn Signed-off-by: Chenbo Feng Signed-off-by: David S. Miller
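To illustrate the per-socket accounting use case, a socket filter could key a hash map by the cookie. The following is a sketch in the style of samples/bpf; the map layout, the SEC() annotations and the bpf_helpers.h conveniences are assumptions of the example, not part of this patch:

#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"	/* assumed samples/bpf-style helper macros */

struct bpf_map_def SEC("maps") bytes_per_socket = {
	.type		= BPF_MAP_TYPE_HASH,
	.key_size	= sizeof(__u64),	/* socket cookie */
	.value_size	= sizeof(__u64),	/* bytes seen */
	.max_entries	= 1024,
};

SEC("socket")
int count_socket_bytes(struct __sk_buff *skb)
{
	__u64 cookie = bpf_get_socket_cookie(skb);
	__u64 *bytes, init = skb->len;

	if (!cookie)	/* skb->sk was NULL, nothing to account against */
		return 0;

	bytes = bpf_map_lookup_elem(&bytes_per_socket, &cookie);
	if (bytes)
		__sync_fetch_and_add(bytes, skb->len);
	else
		bpf_map_update_elem(&bytes_per_socket, &cookie, &init,
				    BPF_NOEXIST);

	/* monitoring only: keep no payload for the attached socket */
	return 0;
}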
--- include/linux/sock_diag.h | 1 + include/uapi/linux/bpf.h | 9 ++++++++- net/core/filter.c | 17 +++++++++++++++++ net/core/sock_diag.c | 2 +- tools/include/uapi/linux/bpf.h | 3 ++- 5 files changed, 29 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h index a0596ca0e80a..a2f8109bb215 100644 --- a/include/linux/sock_diag.h +++ b/include/linux/sock_diag.h @@ -24,6 +24,7 @@ void sock_diag_unregister(const struct sock_diag_handler *h); void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); +u64 sock_gen_cookie(struct sock *sk); int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie); void sock_diag_save_cookie(struct sock *sk, __u32 *cookie); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ce6f029ac368..cdfc5595fbc1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -459,6 +459,12 @@ union bpf_attr { * Return: * > 0 length of the string including the trailing NUL on success * < 0 error + * + * u64 bpf_get_socket_cookie(skb) + * Get the cookie for the socket stored inside sk_buff. + * @skb: pointer to skb + * Return: 8-byte non-decreasing number on success or 0 if the socket + * field is missing inside sk_buff */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -506,7 +512,8 @@ union bpf_attr { FN(get_numa_node_id), \ FN(skb_change_head), \ FN(xdp_adjust_head), \ - FN(probe_read_str), + FN(probe_read_str), \ + FN(get_socket_cookie), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index c7f0ccd1c0d3..35b0f97c3fdf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -26,6 +26,7 @@ #include #include #include +#include <linux/sock_diag.h> #include #include #include @@ -2606,6 +2607,18 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) +{ + return skb->sk ?
sock_gen_cookie(skb->sk) : 0; +} + +static const struct bpf_func_proto bpf_get_socket_cookie_proto = { + .func = bpf_get_socket_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -2640,6 +2653,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_proto; default: return bpf_base_func_proto(func_id); } @@ -2699,6 +2714,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_get_smp_processor_id_proto; case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_proto; default: return bpf_base_func_proto(func_id); } diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 8d11ee75a100..fb9d0e2fd148 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -19,7 +19,7 @@ static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); static DEFINE_MUTEX(sock_diag_table_mutex); static struct workqueue_struct *broadcast_wq; -static u64 sock_gen_cookie(struct sock *sk) +u64 sock_gen_cookie(struct sock *sk) { while (1) { u64 res = atomic64_read(&sk->sk_cookie); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ce6f029ac368..a3851859e5f3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -506,7 +506,8 @@ union bpf_attr { FN(get_numa_node_id), \ FN(skb_change_head), \ FN(xdp_adjust_head), \ - FN(probe_read_str), + FN(probe_read_str), \ + FN(get_socket_cookie), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 380639c7cc786ec663e43eb3896ccf9172a46900 Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 31 Aug 2016 08:49:44 +0100 Subject: gpio: add generic single-register fixed-direction GPIO driver Add a simple, generic, single register fixed-direction GPIO driver. This is able to support a single register with a mixture of inputs and outputs. This is different from gpio-mmio and gpio-74xx-mmio: * gpio-mmio doesn't allow a fixed direction, it assumes there is always a direction register. * gpio-74xx-mmio only supports all-in or all-out setups * gpio-74xx-mmio is DT only, this needs to support legacy too * they don't double-read when getting the GPIO value, as required by some implementations that this driver supports * we need to always do 32-bit reads, which bgpio doesn't guarantee * the current output state may not be readable from the hardware register - reading may reflect input status but not output status. Signed-off-by: Russell King Signed-off-by: Linus Walleij --- drivers/gpio/Kconfig | 6 ++ drivers/gpio/Makefile | 1 + drivers/gpio/gpio-reg.c | 164 ++++++++++++++++++++++++++++++++++++++++++ include/linux/gpio/gpio-reg.h | 12 ++++ 4 files changed, 183 insertions(+) create mode 100644 drivers/gpio/gpio-reg.c create mode 100644 include/linux/gpio/gpio-reg.h (limited to 'include/linux') diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index d6e3cfd54848..63ceed246b6f 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -380,6 +380,12 @@ config GPIO_RCAR help Say yes here to support GPIO on Renesas R-Car SoCs. +config GPIO_REG + bool + help + A 32-bit single register GPIO fixed in/out implementation. 
This + can be used to represent any register as a set of GPIO signals. + config GPIO_SPEAR_SPICS bool "ST SPEAr13xx SPI Chip Select as GPIO support" depends on PLAT_SPEAR diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile index bd995dc2a84a..095598e856ca 100644 --- a/drivers/gpio/Makefile +++ b/drivers/gpio/Makefile @@ -98,6 +98,7 @@ obj-$(CONFIG_GPIO_PXA) += gpio-pxa.o obj-$(CONFIG_GPIO_RC5T583) += gpio-rc5t583.o obj-$(CONFIG_GPIO_RDC321X) += gpio-rdc321x.o obj-$(CONFIG_GPIO_RCAR) += gpio-rcar.o +obj-$(CONFIG_GPIO_REG) += gpio-reg.o obj-$(CONFIG_ARCH_SA1100) += gpio-sa1100.o obj-$(CONFIG_GPIO_SCH) += gpio-sch.o obj-$(CONFIG_GPIO_SCH311X) += gpio-sch311x.o diff --git a/drivers/gpio/gpio-reg.c b/drivers/gpio/gpio-reg.c new file mode 100644 index 000000000000..209f73695a47 --- /dev/null +++ b/drivers/gpio/gpio-reg.c @@ -0,0 +1,164 @@ +/* + * gpio-reg: single register individually fixed-direction GPIOs + * + * Copyright (C) 2016 Russell King + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + */ +#include +#include +#include +#include +#include + +struct gpio_reg { + struct gpio_chip gc; + spinlock_t lock; + u32 direction; + u32 out; + void __iomem *reg; +}; + +#define to_gpio_reg(x) container_of(x, struct gpio_reg, gc) + +static int gpio_reg_get_direction(struct gpio_chip *gc, unsigned offset) +{ + struct gpio_reg *r = to_gpio_reg(gc); + + return r->direction & BIT(offset) ? 1 : 0; +} + +static int gpio_reg_direction_output(struct gpio_chip *gc, unsigned offset, + int value) +{ + struct gpio_reg *r = to_gpio_reg(gc); + + if (r->direction & BIT(offset)) + return -ENOTSUPP; + + gc->set(gc, offset, value); + return 0; +} + +static int gpio_reg_direction_input(struct gpio_chip *gc, unsigned offset) +{ + struct gpio_reg *r = to_gpio_reg(gc); + + return r->direction & BIT(offset) ? 0 : -ENOTSUPP; +} + +static void gpio_reg_set(struct gpio_chip *gc, unsigned offset, int value) +{ + struct gpio_reg *r = to_gpio_reg(gc); + unsigned long flags; + u32 val, mask = BIT(offset); + + spin_lock_irqsave(&r->lock, flags); + val = r->out; + if (value) + val |= mask; + else + val &= ~mask; + r->out = val; + writel_relaxed(val, r->reg); + spin_unlock_irqrestore(&r->lock, flags); +} + +static int gpio_reg_get(struct gpio_chip *gc, unsigned offset) +{ + struct gpio_reg *r = to_gpio_reg(gc); + u32 val, mask = BIT(offset); + + if (r->direction & mask) { + /* + * double-read the value, some registers latch after the + * first read. 
+ */ + readl_relaxed(r->reg); + val = readl_relaxed(r->reg); + } else { + val = r->out; + } + return !!(val & mask); +} + +static void gpio_reg_set_multiple(struct gpio_chip *gc, unsigned long *mask, + unsigned long *bits) +{ + struct gpio_reg *r = to_gpio_reg(gc); + unsigned long flags; + + spin_lock_irqsave(&r->lock, flags); + r->out = (r->out & ~*mask) | (*bits & *mask); + writel_relaxed(r->out, r->reg); + spin_unlock_irqrestore(&r->lock, flags); +} + +/** + * gpio_reg_init - add a fixed in/out register as gpio + * @dev: optional struct device associated with this register + * @base: start gpio number, or -1 to allocate + * @num: number of GPIOs, maximum 32 + * @label: GPIO chip label + * @direction: bitmask of fixed direction, one per GPIO signal, 1 = in + * @def_out: initial GPIO output value + * @names: array of %num strings describing each GPIO signal + * + * Add a single-register GPIO device containing up to 32 GPIO signals, + * where each GPIO has a fixed input or output configuration. Only + * input GPIOs are assumed to be readable from the register, and only + * then after a double-read. Output values are assumed not to be + * readable. + */ +struct gpio_chip *gpio_reg_init(struct device *dev, void __iomem *reg, + int base, int num, const char *label, u32 direction, u32 def_out, + const char *const *names) +{ + struct gpio_reg *r; + int ret; + + if (dev) + r = devm_kzalloc(dev, sizeof(*r), GFP_KERNEL); + else + r = kzalloc(sizeof(*r), GFP_KERNEL); + + if (!r) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&r->lock); + + r->gc.label = label; + r->gc.get_direction = gpio_reg_get_direction; + r->gc.direction_input = gpio_reg_direction_input; + r->gc.direction_output = gpio_reg_direction_output; + r->gc.set = gpio_reg_set; + r->gc.get = gpio_reg_get; + r->gc.set_multiple = gpio_reg_set_multiple; + r->gc.base = base; + r->gc.ngpio = num; + r->gc.names = names; + r->direction = direction; + r->out = def_out; + r->reg = reg; + + if (dev) + ret = devm_gpiochip_add_data(dev, &r->gc, r); + else + ret = gpiochip_add_data(&r->gc, r); + + return ret ? ERR_PTR(ret) : &r->gc; +} + +int gpio_reg_resume(struct gpio_chip *gc) +{ + struct gpio_reg *r = to_gpio_reg(gc); + unsigned long flags; + + spin_lock_irqsave(&r->lock, flags); + writel_relaxed(r->out, r->reg); + spin_unlock_irqrestore(&r->lock, flags); + + return 0; +} diff --git a/include/linux/gpio/gpio-reg.h b/include/linux/gpio/gpio-reg.h new file mode 100644 index 000000000000..0352bec7319a --- /dev/null +++ b/include/linux/gpio/gpio-reg.h @@ -0,0 +1,12 @@ +#ifndef GPIO_REG_H +#define GPIO_REG_H + +struct device; + +struct gpio_chip *gpio_reg_init(struct device *dev, void __iomem *reg, + int base, int num, const char *label, u32 direction, u32 def_out, + const char *const *names); + +int gpio_reg_resume(struct gpio_chip *gc); + +#endif -- cgit v1.2.3 From 0e3cb6ee386f384a9131f0c7db52a0a961d2ded9 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 2 Sep 2016 12:05:56 +0100 Subject: gpio: gpio-reg: add irq mapping for gpio-reg users Add support for mapping gpio-reg gpios to interrupts. This may be a non-linear mapping - some gpios in the register may not even have corresponding interrupts associated with them, so we need to pass an array. 
Signed-off-by: Russell King Signed-off-by: Linus Walleij --- drivers/gpio/gpio-reg.c | 25 +++++++++++++++++++++++-- include/linux/gpio/gpio-reg.h | 3 ++- 2 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpio-reg.c b/drivers/gpio/gpio-reg.c index 209f73695a47..e85903eddc68 100644 --- a/drivers/gpio/gpio-reg.c +++ b/drivers/gpio/gpio-reg.c @@ -19,6 +19,8 @@ struct gpio_reg { u32 direction; u32 out; void __iomem *reg; + struct irq_domain *irqdomain; + const int *irqs; }; #define to_gpio_reg(x) container_of(x, struct gpio_reg, gc) @@ -96,6 +98,17 @@ static void gpio_reg_set_multiple(struct gpio_chip *gc, unsigned long *mask, spin_unlock_irqrestore(&r->lock, flags); } +static int gpio_reg_to_irq(struct gpio_chip *gc, unsigned offset) +{ + struct gpio_reg *r = to_gpio_reg(gc); + int irq = r->irqs[offset]; + + if (irq >= 0 && r->irqdomain) + irq = irq_find_mapping(r->irqdomain, irq); + + return irq; +} + /** * gpio_reg_init - add a fixed in/out register as gpio * @dev: optional struct device associated with this register @@ -104,7 +117,12 @@ static void gpio_reg_set_multiple(struct gpio_chip *gc, unsigned long *mask, * @label: GPIO chip label * @direction: bitmask of fixed direction, one per GPIO signal, 1 = in * @def_out: initial GPIO output value - * @names: array of %num strings describing each GPIO signal + * @names: array of %num strings describing each GPIO signal or %NULL + * @irqdom: irq domain or %NULL + * @irqs: array of %num ints describing the interrupt mapping for each + * GPIO signal, or %NULL. If @irqdom is %NULL, then this + * describes the Linux interrupt number, otherwise it describes + * the hardware interrupt number in the specified irq domain. * * Add a single-register GPIO device containing up to 32 GPIO signals, * where each GPIO has a fixed input or output configuration. 
Only @@ -114,7 +132,7 @@ static void gpio_reg_set_multiple(struct gpio_chip *gc, unsigned long *mask, */ struct gpio_chip *gpio_reg_init(struct device *dev, void __iomem *reg, int base, int num, const char *label, u32 direction, u32 def_out, - const char *const *names) + const char *const *names, struct irq_domain *irqdom, const int *irqs) { struct gpio_reg *r; int ret; @@ -136,12 +154,15 @@ struct gpio_chip *gpio_reg_init(struct device *dev, void __iomem *reg, r->gc.set = gpio_reg_set; r->gc.get = gpio_reg_get; r->gc.set_multiple = gpio_reg_set_multiple; + if (irqs) + r->gc.to_irq = gpio_reg_to_irq; r->gc.base = base; r->gc.ngpio = num; r->gc.names = names; r->direction = direction; r->out = def_out; r->reg = reg; + r->irqs = irqs; if (dev) ret = devm_gpiochip_add_data(dev, &r->gc, r); diff --git a/include/linux/gpio/gpio-reg.h b/include/linux/gpio/gpio-reg.h index 0352bec7319a..90e0b9060e6d 100644 --- a/include/linux/gpio/gpio-reg.h +++ b/include/linux/gpio/gpio-reg.h @@ -2,10 +2,11 @@ #define GPIO_REG_H struct device; +struct irq_domain; struct gpio_chip *gpio_reg_init(struct device *dev, void __iomem *reg, int base, int num, const char *label, u32 direction, u32 def_out, - const char *const *names); + const char *const *names, struct irq_domain *irqdom, const int *irqs); int gpio_reg_resume(struct gpio_chip *gc); -- cgit v1.2.3 From ccebcf3f224a44ec8e9c5bfca9d8e5d29298a5a8 Mon Sep 17 00:00:00 2001 From: Gary R Hook Date: Wed, 15 Mar 2017 13:20:43 -0500 Subject: crypto: ccp - Add SHA-2 384- and 512-bit support Incorporate 384-bit and 512-bit hashing for a version 5 CCP device Signed-off-by: Gary R Hook Signed-off-by: Herbert Xu --- drivers/crypto/ccp/ccp-crypto-sha.c | 22 ++++++++++++ drivers/crypto/ccp/ccp-crypto.h | 8 +++-- drivers/crypto/ccp/ccp-ops.c | 72 +++++++++++++++++++++++++++++++++++++ include/linux/ccp.h | 2 ++ 4 files changed, 101 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/crypto/ccp/ccp-crypto-sha.c b/drivers/crypto/ccp/ccp-crypto-sha.c index 84a652be4274..6b46eea94932 100644 --- a/drivers/crypto/ccp/ccp-crypto-sha.c +++ b/drivers/crypto/ccp/ccp-crypto-sha.c @@ -146,6 +146,12 @@ static int ccp_do_sha_update(struct ahash_request *req, unsigned int nbytes, case CCP_SHA_TYPE_256: rctx->cmd.u.sha.ctx_len = SHA256_DIGEST_SIZE; break; + case CCP_SHA_TYPE_384: + rctx->cmd.u.sha.ctx_len = SHA384_DIGEST_SIZE; + break; + case CCP_SHA_TYPE_512: + rctx->cmd.u.sha.ctx_len = SHA512_DIGEST_SIZE; + break; default: /* Should never get here */ break; @@ -393,6 +399,22 @@ static struct ccp_sha_def sha_algs[] = { .digest_size = SHA256_DIGEST_SIZE, .block_size = SHA256_BLOCK_SIZE, }, + { + .version = CCP_VERSION(5, 0), + .name = "sha384", + .drv_name = "sha384-ccp", + .type = CCP_SHA_TYPE_384, + .digest_size = SHA384_DIGEST_SIZE, + .block_size = SHA384_BLOCK_SIZE, + }, + { + .version = CCP_VERSION(5, 0), + .name = "sha512", + .drv_name = "sha512-ccp", + .type = CCP_SHA_TYPE_512, + .digest_size = SHA512_DIGEST_SIZE, + .block_size = SHA512_BLOCK_SIZE, + }, }; static int ccp_register_hmac_alg(struct list_head *head, diff --git a/drivers/crypto/ccp/ccp-crypto.h b/drivers/crypto/ccp/ccp-crypto.h index 8335b32e815e..95cce2764139 100644 --- a/drivers/crypto/ccp/ccp-crypto.h +++ b/drivers/crypto/ccp/ccp-crypto.h @@ -137,9 +137,11 @@ struct ccp_aes_cmac_exp_ctx { u8 buf[AES_BLOCK_SIZE]; }; -/***** SHA related defines *****/ -#define MAX_SHA_CONTEXT_SIZE SHA256_DIGEST_SIZE -#define MAX_SHA_BLOCK_SIZE SHA256_BLOCK_SIZE +/* SHA-related defines + * These values must be 
large enough to accommodate any variant + */ +#define MAX_SHA_CONTEXT_SIZE SHA512_DIGEST_SIZE +#define MAX_SHA_BLOCK_SIZE SHA512_BLOCK_SIZE struct ccp_sha_ctx { struct scatterlist opad_sg; diff --git a/drivers/crypto/ccp/ccp-ops.c b/drivers/crypto/ccp/ccp-ops.c index f1396c3aedac..0d820802cd92 100644 --- a/drivers/crypto/ccp/ccp-ops.c +++ b/drivers/crypto/ccp/ccp-ops.c @@ -41,6 +41,20 @@ static const __be32 ccp_sha256_init[SHA256_DIGEST_SIZE / sizeof(__be32)] = { cpu_to_be32(SHA256_H6), cpu_to_be32(SHA256_H7), }; +static const __be64 ccp_sha384_init[SHA512_DIGEST_SIZE / sizeof(__be64)] = { + cpu_to_be64(SHA384_H0), cpu_to_be64(SHA384_H1), + cpu_to_be64(SHA384_H2), cpu_to_be64(SHA384_H3), + cpu_to_be64(SHA384_H4), cpu_to_be64(SHA384_H5), + cpu_to_be64(SHA384_H6), cpu_to_be64(SHA384_H7), +}; + +static const __be64 ccp_sha512_init[SHA512_DIGEST_SIZE / sizeof(__be64)] = { + cpu_to_be64(SHA512_H0), cpu_to_be64(SHA512_H1), + cpu_to_be64(SHA512_H2), cpu_to_be64(SHA512_H3), + cpu_to_be64(SHA512_H4), cpu_to_be64(SHA512_H5), + cpu_to_be64(SHA512_H6), cpu_to_be64(SHA512_H7), +}; + #define CCP_NEW_JOBID(ccp) ((ccp->vdata->version == CCP_VERSION(3, 0)) ? \ ccp_gen_jobid(ccp) : 0) @@ -955,6 +969,18 @@ static int ccp_run_sha_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd) return -EINVAL; block_size = SHA256_BLOCK_SIZE; break; + case CCP_SHA_TYPE_384: + if (cmd_q->ccp->vdata->version < CCP_VERSION(4, 0) + || sha->ctx_len < SHA384_DIGEST_SIZE) + return -EINVAL; + block_size = SHA384_BLOCK_SIZE; + break; + case CCP_SHA_TYPE_512: + if (cmd_q->ccp->vdata->version < CCP_VERSION(4, 0) + || sha->ctx_len < SHA512_DIGEST_SIZE) + return -EINVAL; + block_size = SHA512_BLOCK_SIZE; + break; default: return -EINVAL; } @@ -1042,6 +1068,21 @@ static int ccp_run_sha_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd) sb_count = 1; ooffset = ioffset = 0; break; + case CCP_SHA_TYPE_384: + digest_size = SHA384_DIGEST_SIZE; + init = (void *) ccp_sha384_init; + ctx_size = SHA512_DIGEST_SIZE; + sb_count = 2; + ioffset = 0; + ooffset = 2 * CCP_SB_BYTES - SHA384_DIGEST_SIZE; + break; + case CCP_SHA_TYPE_512: + digest_size = SHA512_DIGEST_SIZE; + init = (void *) ccp_sha512_init; + ctx_size = SHA512_DIGEST_SIZE; + sb_count = 2; + ooffset = ioffset = 0; + break; default: ret = -EINVAL; goto e_data; @@ -1060,6 +1101,11 @@ static int ccp_run_sha_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd) op.u.sha.type = sha->type; op.u.sha.msg_bits = sha->msg_bits; + /* For SHA1/224/256 the context fits in a single (32-byte) SB entry; + * SHA384/512 require 2 adjacent SB slots, with the right half in the + * first slot, and the left half in the second. Each portion must then + * be in little endian format: use the 256-bit byte swap option. 
+ */ ret = ccp_init_dm_workarea(&ctx, cmd_q, sb_count * CCP_SB_BYTES, DMA_BIDIRECTIONAL); if (ret) @@ -1071,6 +1117,13 @@ static int ccp_run_sha_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd) case CCP_SHA_TYPE_256: memcpy(ctx.address + ioffset, init, ctx_size); break; + case CCP_SHA_TYPE_384: + case CCP_SHA_TYPE_512: + memcpy(ctx.address + ctx_size / 2, init, + ctx_size / 2); + memcpy(ctx.address, init + ctx_size / 2, + ctx_size / 2); + break; default: ret = -EINVAL; goto e_ctx; @@ -1137,6 +1190,15 @@ static int ccp_run_sha_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd) sha->ctx, 0, digest_size); break; + case CCP_SHA_TYPE_384: + case CCP_SHA_TYPE_512: + ccp_get_dm_area(&ctx, 0, + sha->ctx, LSB_ITEM_SIZE - ooffset, + LSB_ITEM_SIZE); + ccp_get_dm_area(&ctx, LSB_ITEM_SIZE + ooffset, + sha->ctx, 0, + LSB_ITEM_SIZE - ooffset); + break; default: ret = -EINVAL; goto e_ctx; @@ -1174,6 +1236,16 @@ static int ccp_run_sha_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd) ctx.address + ooffset, digest_size); break; + case CCP_SHA_TYPE_384: + case CCP_SHA_TYPE_512: + memcpy(hmac_buf + block_size, + ctx.address + LSB_ITEM_SIZE + ooffset, + LSB_ITEM_SIZE); + memcpy(hmac_buf + block_size + + (LSB_ITEM_SIZE - ooffset), + ctx.address, + LSB_ITEM_SIZE); + break; default: ret = -EINVAL; goto e_ctx; diff --git a/include/linux/ccp.h b/include/linux/ccp.h index c71dd8fa5764..90a1fbe84219 100644 --- a/include/linux/ccp.h +++ b/include/linux/ccp.h @@ -249,6 +249,8 @@ enum ccp_sha_type { CCP_SHA_TYPE_1 = 1, CCP_SHA_TYPE_224, CCP_SHA_TYPE_256, + CCP_SHA_TYPE_384, + CCP_SHA_TYPE_512, CCP_SHA_TYPE__LAST, }; -- cgit v1.2.3 From 990672d48515ce09c76fcf1ceccee48b0dd1942b Mon Sep 17 00:00:00 2001 From: Gary R Hook Date: Wed, 15 Mar 2017 13:20:52 -0500 Subject: crypto: ccp - Enable 3DES function on v5 CCPs Wire up support for Triple DES in ECB mode. Signed-off-by: Gary R Hook Signed-off-by: Herbert Xu --- drivers/crypto/ccp/Makefile | 1 + drivers/crypto/ccp/ccp-crypto-des3.c | 254 +++++++++++++++++++++++++++++++++++ drivers/crypto/ccp/ccp-crypto-main.c | 10 ++ drivers/crypto/ccp/ccp-crypto.h | 22 +++ drivers/crypto/ccp/ccp-dev-v3.c | 1 + drivers/crypto/ccp/ccp-dev-v5.c | 54 ++++++++ drivers/crypto/ccp/ccp-dev.h | 14 +- drivers/crypto/ccp/ccp-ops.c | 198 +++++++++++++++++++++++++++ include/linux/ccp.h | 57 +++++++- 9 files changed, 608 insertions(+), 3 deletions(-) create mode 100644 drivers/crypto/ccp/ccp-crypto-des3.c (limited to 'include/linux') diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile index 346ceb8f17bd..d2044b793aa8 100644 --- a/drivers/crypto/ccp/Makefile +++ b/drivers/crypto/ccp/Makefile @@ -12,4 +12,5 @@ ccp-crypto-objs := ccp-crypto-main.o \ ccp-crypto-aes.o \ ccp-crypto-aes-cmac.o \ ccp-crypto-aes-xts.o \ + ccp-crypto-des3.o \ ccp-crypto-sha.o diff --git a/drivers/crypto/ccp/ccp-crypto-des3.c b/drivers/crypto/ccp/ccp-crypto-des3.c new file mode 100644 index 000000000000..5af7347ae03c --- /dev/null +++ b/drivers/crypto/ccp/ccp-crypto-des3.c @@ -0,0 +1,254 @@ +/* + * AMD Cryptographic Coprocessor (CCP) DES3 crypto API support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Gary R Hook + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ccp-crypto.h" + +static int ccp_des3_complete(struct crypto_async_request *async_req, int ret) +{ + struct ablkcipher_request *req = ablkcipher_request_cast(async_req); + struct ccp_ctx *ctx = crypto_tfm_ctx(req->base.tfm); + struct ccp_des3_req_ctx *rctx = ablkcipher_request_ctx(req); + + if (ret) + return ret; + + if (ctx->u.des3.mode != CCP_DES3_MODE_ECB) + memcpy(req->info, rctx->iv, DES3_EDE_BLOCK_SIZE); + + return 0; +} + +static int ccp_des3_setkey(struct crypto_ablkcipher *tfm, const u8 *key, + unsigned int key_len) +{ + struct ccp_ctx *ctx = crypto_tfm_ctx(crypto_ablkcipher_tfm(tfm)); + struct ccp_crypto_ablkcipher_alg *alg = + ccp_crypto_ablkcipher_alg(crypto_ablkcipher_tfm(tfm)); + u32 *flags = &tfm->base.crt_flags; + + + /* From des_generic.c: + * + * RFC2451: + * If the first two or last two independent 64-bit keys are + * equal (k1 == k2 or k2 == k3), then the DES3 operation is simply the + * same as DES. Implementers MUST reject keys that exhibit this + * property. + */ + const u32 *K = (const u32 *)key; + + if (unlikely(!((K[0] ^ K[2]) | (K[1] ^ K[3])) || + !((K[2] ^ K[4]) | (K[3] ^ K[5]))) && + (*flags & CRYPTO_TFM_REQ_WEAK_KEY)) { + *flags |= CRYPTO_TFM_RES_WEAK_KEY; + return -EINVAL; + } + + /* It's not clear that there is any support for a keysize of 112. + * If needed, the caller should make K1 == K3 + */ + ctx->u.des3.type = CCP_DES3_TYPE_168; + ctx->u.des3.mode = alg->mode; + ctx->u.des3.key_len = key_len; + + memcpy(ctx->u.des3.key, key, key_len); + sg_init_one(&ctx->u.des3.key_sg, ctx->u.des3.key, key_len); + + return 0; +} + +static int ccp_des3_crypt(struct ablkcipher_request *req, bool encrypt) +{ + struct ccp_ctx *ctx = crypto_tfm_ctx(req->base.tfm); + struct ccp_des3_req_ctx *rctx = ablkcipher_request_ctx(req); + struct scatterlist *iv_sg = NULL; + unsigned int iv_len = 0; + int ret; + + if (!ctx->u.des3.key_len) + return -EINVAL; + + if (((ctx->u.des3.mode == CCP_DES3_MODE_ECB) || + (ctx->u.des3.mode == CCP_DES3_MODE_CBC)) && + (req->nbytes & (DES3_EDE_BLOCK_SIZE - 1))) + return -EINVAL; + + if (ctx->u.des3.mode != CCP_DES3_MODE_ECB) { + if (!req->info) + return -EINVAL; + + memcpy(rctx->iv, req->info, DES3_EDE_BLOCK_SIZE); + iv_sg = &rctx->iv_sg; + iv_len = DES3_EDE_BLOCK_SIZE; + sg_init_one(iv_sg, rctx->iv, iv_len); + } + + memset(&rctx->cmd, 0, sizeof(rctx->cmd)); + INIT_LIST_HEAD(&rctx->cmd.entry); + rctx->cmd.engine = CCP_ENGINE_DES3; + rctx->cmd.u.des3.type = ctx->u.des3.type; + rctx->cmd.u.des3.mode = ctx->u.des3.mode; + rctx->cmd.u.des3.action = (encrypt) + ? 
CCP_DES3_ACTION_ENCRYPT + : CCP_DES3_ACTION_DECRYPT; + rctx->cmd.u.des3.key = &ctx->u.des3.key_sg; + rctx->cmd.u.des3.key_len = ctx->u.des3.key_len; + rctx->cmd.u.des3.iv = iv_sg; + rctx->cmd.u.des3.iv_len = iv_len; + rctx->cmd.u.des3.src = req->src; + rctx->cmd.u.des3.src_len = req->nbytes; + rctx->cmd.u.des3.dst = req->dst; + + ret = ccp_crypto_enqueue_request(&req->base, &rctx->cmd); + + return ret; +} + +static int ccp_des3_encrypt(struct ablkcipher_request *req) +{ + return ccp_des3_crypt(req, true); +} + +static int ccp_des3_decrypt(struct ablkcipher_request *req) +{ + return ccp_des3_crypt(req, false); +} + +static int ccp_des3_cra_init(struct crypto_tfm *tfm) +{ + struct ccp_ctx *ctx = crypto_tfm_ctx(tfm); + + ctx->complete = ccp_des3_complete; + ctx->u.des3.key_len = 0; + + tfm->crt_ablkcipher.reqsize = sizeof(struct ccp_des3_req_ctx); + + return 0; +} + +static void ccp_des3_cra_exit(struct crypto_tfm *tfm) +{ +} + +static struct crypto_alg ccp_des3_defaults = { + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | + CRYPTO_ALG_ASYNC | + CRYPTO_ALG_KERN_DRIVER_ONLY | + CRYPTO_ALG_NEED_FALLBACK, + .cra_blocksize = DES3_EDE_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct ccp_ctx), + .cra_priority = CCP_CRA_PRIORITY, + .cra_type = &crypto_ablkcipher_type, + .cra_init = ccp_des3_cra_init, + .cra_exit = ccp_des3_cra_exit, + .cra_module = THIS_MODULE, + .cra_ablkcipher = { + .setkey = ccp_des3_setkey, + .encrypt = ccp_des3_encrypt, + .decrypt = ccp_des3_decrypt, + .min_keysize = DES3_EDE_KEY_SIZE, + .max_keysize = DES3_EDE_KEY_SIZE, + }, +}; + +struct ccp_des3_def { + enum ccp_des3_mode mode; + unsigned int version; + const char *name; + const char *driver_name; + unsigned int blocksize; + unsigned int ivsize; + struct crypto_alg *alg_defaults; +}; + +static struct ccp_des3_def des3_algs[] = { + { + .mode = CCP_DES3_MODE_ECB, + .version = CCP_VERSION(5, 0), + .name = "ecb(des3_ede)", + .driver_name = "ecb-des3-ccp", + .blocksize = DES3_EDE_BLOCK_SIZE, + .ivsize = 0, + .alg_defaults = &ccp_des3_defaults, + }, + { + .mode = CCP_DES3_MODE_CBC, + .version = CCP_VERSION(5, 0), + .name = "cbc(des3_ede)", + .driver_name = "cbc-des3-ccp", + .blocksize = DES3_EDE_BLOCK_SIZE, + .ivsize = DES3_EDE_BLOCK_SIZE, + .alg_defaults = &ccp_des3_defaults, + }, +}; + +static int ccp_register_des3_alg(struct list_head *head, + const struct ccp_des3_def *def) +{ + struct ccp_crypto_ablkcipher_alg *ccp_alg; + struct crypto_alg *alg; + int ret; + + ccp_alg = kzalloc(sizeof(*ccp_alg), GFP_KERNEL); + if (!ccp_alg) + return -ENOMEM; + + INIT_LIST_HEAD(&ccp_alg->entry); + + ccp_alg->mode = def->mode; + + /* Copy the defaults and override as necessary */ + alg = &ccp_alg->alg; + *alg = *def->alg_defaults; + snprintf(alg->cra_name, CRYPTO_MAX_ALG_NAME, "%s", def->name); + snprintf(alg->cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s", + def->driver_name); + alg->cra_blocksize = def->blocksize; + alg->cra_ablkcipher.ivsize = def->ivsize; + + ret = crypto_register_alg(alg); + if (ret) { + pr_err("%s ablkcipher algorithm registration error (%d)\n", + alg->cra_name, ret); + kfree(ccp_alg); + return ret; + } + + list_add(&ccp_alg->entry, head); + + return 0; +} + +int ccp_register_des3_algs(struct list_head *head) +{ + int i, ret; + unsigned int ccpversion = ccp_version(); + + for (i = 0; i < ARRAY_SIZE(des3_algs); i++) { + if (des3_algs[i].version > ccpversion) + continue; + ret = ccp_register_des3_alg(head, &des3_algs[i]); + if (ret) + return ret; + } + + return 0; +} diff --git a/drivers/crypto/ccp/ccp-crypto-main.c 
b/drivers/crypto/ccp/ccp-crypto-main.c index e0380e59c361..3f1e36d7a8bf 100644 --- a/drivers/crypto/ccp/ccp-crypto-main.c +++ b/drivers/crypto/ccp/ccp-crypto-main.c @@ -33,6 +33,10 @@ static unsigned int sha_disable; module_param(sha_disable, uint, 0444); MODULE_PARM_DESC(sha_disable, "Disable use of SHA - any non-zero value"); +static unsigned int des3_disable; +module_param(des3_disable, uint, 0444); +MODULE_PARM_DESC(des3_disable, "Disable use of 3DES - any non-zero value"); + /* List heads for the supported algorithms */ static LIST_HEAD(hash_algs); static LIST_HEAD(cipher_algs); @@ -337,6 +341,12 @@ static int ccp_register_algs(void) return ret; } + if (!des3_disable) { + ret = ccp_register_des3_algs(&cipher_algs); + if (ret) + return ret; + } + if (!sha_disable) { ret = ccp_register_sha_algs(&hash_algs); if (ret) diff --git a/drivers/crypto/ccp/ccp-crypto.h b/drivers/crypto/ccp/ccp-crypto.h index 95cce2764139..8c8bd3ff9f48 100644 --- a/drivers/crypto/ccp/ccp-crypto.h +++ b/drivers/crypto/ccp/ccp-crypto.h @@ -23,6 +23,8 @@ #include #include +#define CCP_LOG_LEVEL KERN_INFO + #define CCP_CRA_PRIORITY 300 struct ccp_crypto_ablkcipher_alg { @@ -137,6 +139,24 @@ struct ccp_aes_cmac_exp_ctx { u8 buf[AES_BLOCK_SIZE]; }; +/***** 3DES related defines *****/ +struct ccp_des3_ctx { + enum ccp_engine engine; + enum ccp_des3_type type; + enum ccp_des3_mode mode; + + struct scatterlist key_sg; + unsigned int key_len; + u8 key[AES_MAX_KEY_SIZE]; +}; + +struct ccp_des3_req_ctx { + struct scatterlist iv_sg; + u8 iv[AES_BLOCK_SIZE]; + + struct ccp_cmd cmd; +}; + /* SHA-related defines * These values must be large enough to accommodate any variant */ @@ -201,6 +221,7 @@ struct ccp_ctx { union { struct ccp_aes_ctx aes; struct ccp_sha_ctx sha; + struct ccp_des3_ctx des3; } u; }; @@ -213,5 +234,6 @@ int ccp_register_aes_algs(struct list_head *head); int ccp_register_aes_cmac_algs(struct list_head *head); int ccp_register_aes_xts_algs(struct list_head *head); int ccp_register_sha_algs(struct list_head *head); +int ccp_register_des3_algs(struct list_head *head); #endif diff --git a/drivers/crypto/ccp/ccp-dev-v3.c b/drivers/crypto/ccp/ccp-dev-v3.c index 7bc09989e18a..a3689a66cb1d 100644 --- a/drivers/crypto/ccp/ccp-dev-v3.c +++ b/drivers/crypto/ccp/ccp-dev-v3.c @@ -553,6 +553,7 @@ static irqreturn_t ccp_irq_handler(int irq, void *data) static const struct ccp_actions ccp3_actions = { .aes = ccp_perform_aes, .xts_aes = ccp_perform_xts_aes, + .des3 = NULL, .sha = ccp_perform_sha, .rsa = ccp_perform_rsa, .passthru = ccp_perform_passthru, diff --git a/drivers/crypto/ccp/ccp-dev-v5.c b/drivers/crypto/ccp/ccp-dev-v5.c index 41cc853f8569..fc5666eb59f2 100644 --- a/drivers/crypto/ccp/ccp-dev-v5.c +++ b/drivers/crypto/ccp/ccp-dev-v5.c @@ -107,6 +107,12 @@ union ccp_function { u16 rsvd:5; u16 type:2; } aes_xts; + struct { + u16 size:7; + u16 encrypt:1; + u16 mode:5; + u16 type:2; + } des3; struct { u16 rsvd1:10; u16 type:4; @@ -139,6 +145,10 @@ union ccp_function { #define CCP_AES_TYPE(p) ((p)->aes.type) #define CCP_XTS_SIZE(p) ((p)->aes_xts.size) #define CCP_XTS_ENCRYPT(p) ((p)->aes_xts.encrypt) +#define CCP_DES3_SIZE(p) ((p)->des3.size) +#define CCP_DES3_ENCRYPT(p) ((p)->des3.encrypt) +#define CCP_DES3_MODE(p) ((p)->des3.mode) +#define CCP_DES3_TYPE(p) ((p)->des3.type) #define CCP_SHA_TYPE(p) ((p)->sha.type) #define CCP_RSA_SIZE(p) ((p)->rsa.size) #define CCP_PT_BYTESWAP(p) ((p)->pt.byteswap) @@ -388,6 +398,47 @@ static int ccp5_perform_sha(struct ccp_op *op) return ccp5_do_cmd(&desc, op->cmd_q); } +static int 
ccp5_perform_des3(struct ccp_op *op) +{ + struct ccp5_desc desc; + union ccp_function function; + u32 key_addr = op->sb_key * LSB_ITEM_SIZE; + + /* Zero out all the fields of the command desc */ + memset(&desc, 0, sizeof(struct ccp5_desc)); + + CCP5_CMD_ENGINE(&desc) = CCP_ENGINE_DES3; + + CCP5_CMD_SOC(&desc) = op->soc; + CCP5_CMD_IOC(&desc) = 1; + CCP5_CMD_INIT(&desc) = op->init; + CCP5_CMD_EOM(&desc) = op->eom; + CCP5_CMD_PROT(&desc) = 0; + + function.raw = 0; + CCP_DES3_ENCRYPT(&function) = op->u.des3.action; + CCP_DES3_MODE(&function) = op->u.des3.mode; + CCP_DES3_TYPE(&function) = op->u.des3.type; + CCP5_CMD_FUNCTION(&desc) = cpu_to_le32(function.raw); + + CCP5_CMD_LEN(&desc) = cpu_to_le32(op->src.u.dma.length); + + CCP5_CMD_SRC_LO(&desc) = cpu_to_le32(ccp_addr_lo(&op->src.u.dma)); + CCP5_CMD_SRC_HI(&desc) = cpu_to_le32(ccp_addr_hi(&op->src.u.dma)); + CCP5_CMD_SRC_MEM(&desc) = cpu_to_le32(CCP_MEMTYPE_SYSTEM); + + CCP5_CMD_DST_LO(&desc) = cpu_to_le32(ccp_addr_lo(&op->dst.u.dma)); + CCP5_CMD_DST_HI(&desc) = cpu_to_le32(ccp_addr_hi(&op->dst.u.dma)); + CCP5_CMD_DST_MEM(&desc) = cpu_to_le32(CCP_MEMTYPE_SYSTEM); + + CCP5_CMD_KEY_LO(&desc) = cpu_to_le32(lower_32_bits(key_addr)); + CCP5_CMD_KEY_HI(&desc) = 0; + CCP5_CMD_KEY_MEM(&desc) = cpu_to_le32(CCP_MEMTYPE_SB); + CCP5_CMD_LSB_ID(&desc) = cpu_to_le32(op->sb_ctx); + + return ccp5_do_cmd(&desc, op->cmd_q); +} + static int ccp5_perform_rsa(struct ccp_op *op) { struct ccp5_desc desc; @@ -435,6 +486,7 @@ static int ccp5_perform_passthru(struct ccp_op *op) struct ccp_dma_info *saddr = &op->src.u.dma; struct ccp_dma_info *daddr = &op->dst.u.dma; + memset(&desc, 0, Q_DESC_SIZE); CCP5_CMD_ENGINE(&desc) = CCP_ENGINE_PASSTHRU; @@ -729,6 +781,7 @@ static int ccp5_init(struct ccp_device *ccp) dev_dbg(dev, "queue #%u available\n", i); } + if (ccp->cmd_q_count == 0) { dev_notice(dev, "no command queues available\n"); ret = -EIO; @@ -994,6 +1047,7 @@ static const struct ccp_actions ccp5_actions = { .aes = ccp5_perform_aes, .xts_aes = ccp5_perform_xts_aes, .sha = ccp5_perform_sha, + .des3 = ccp5_perform_des3, .rsa = ccp5_perform_rsa, .passthru = ccp5_perform_passthru, .ecc = ccp5_perform_ecc, diff --git a/drivers/crypto/ccp/ccp-dev.h b/drivers/crypto/ccp/ccp-dev.h index 2b5c01fade05..754e9c2f6ee3 100644 --- a/drivers/crypto/ccp/ccp-dev.h +++ b/drivers/crypto/ccp/ccp-dev.h @@ -190,6 +190,9 @@ #define CCP_XTS_AES_KEY_SB_COUNT 1 #define CCP_XTS_AES_CTX_SB_COUNT 1 +#define CCP_DES3_KEY_SB_COUNT 1 +#define CCP_DES3_CTX_SB_COUNT 1 + #define CCP_SHA_SB_COUNT 1 #define CCP_RSA_MAX_WIDTH 4096 @@ -475,6 +478,12 @@ struct ccp_xts_aes_op { enum ccp_xts_aes_unit_size unit_size; }; +struct ccp_des3_op { + enum ccp_des3_type type; + enum ccp_des3_mode mode; + enum ccp_des3_action action; +}; + struct ccp_sha_op { enum ccp_sha_type type; u64 msg_bits; @@ -512,6 +521,7 @@ struct ccp_op { union { struct ccp_aes_op aes; struct ccp_xts_aes_op xts; + struct ccp_des3_op des3; struct ccp_sha_op sha; struct ccp_rsa_op rsa; struct ccp_passthru_op passthru; @@ -620,13 +630,13 @@ void ccp_dmaengine_unregister(struct ccp_device *ccp); struct ccp_actions { int (*aes)(struct ccp_op *); int (*xts_aes)(struct ccp_op *); + int (*des3)(struct ccp_op *); int (*sha)(struct ccp_op *); int (*rsa)(struct ccp_op *); int (*passthru)(struct ccp_op *); int (*ecc)(struct ccp_op *); u32 (*sballoc)(struct ccp_cmd_queue *, unsigned int); - void (*sbfree)(struct ccp_cmd_queue *, unsigned int, - unsigned int); + void (*sbfree)(struct ccp_cmd_queue *, unsigned int, unsigned int); unsigned int 
(*get_free_slots)(struct ccp_cmd_queue *); int (*init)(struct ccp_device *); void (*destroy)(struct ccp_device *); diff --git a/drivers/crypto/ccp/ccp-ops.c b/drivers/crypto/ccp/ccp-ops.c index 0d820802cd92..0de961a22ab4 100644 --- a/drivers/crypto/ccp/ccp-ops.c +++ b/drivers/crypto/ccp/ccp-ops.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "ccp-dev.h" @@ -939,6 +940,200 @@ e_key: return ret; } +static int ccp_run_des3_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd) +{ + struct ccp_des3_engine *des3 = &cmd->u.des3; + + struct ccp_dm_workarea key, ctx; + struct ccp_data src, dst; + struct ccp_op op; + unsigned int dm_offset; + unsigned int len_singlekey; + bool in_place = false; + int ret; + + /* Error checks */ + if (!cmd_q->ccp->vdata->perform->des3) + return -EINVAL; + + if (des3->key_len != DES3_EDE_KEY_SIZE) + return -EINVAL; + + if (((des3->mode == CCP_DES3_MODE_ECB) || + (des3->mode == CCP_DES3_MODE_CBC)) && + (des3->src_len & (DES3_EDE_BLOCK_SIZE - 1))) + return -EINVAL; + + if (!des3->key || !des3->src || !des3->dst) + return -EINVAL; + + if (des3->mode != CCP_DES3_MODE_ECB) { + if (des3->iv_len != DES3_EDE_BLOCK_SIZE) + return -EINVAL; + + if (!des3->iv) + return -EINVAL; + } + + ret = -EIO; + /* Zero out all the fields of the command desc */ + memset(&op, 0, sizeof(op)); + + /* Set up the Function field */ + op.cmd_q = cmd_q; + op.jobid = CCP_NEW_JOBID(cmd_q->ccp); + op.sb_key = cmd_q->sb_key; + + op.init = (des3->mode == CCP_DES3_MODE_ECB) ? 0 : 1; + op.u.des3.type = des3->type; + op.u.des3.mode = des3->mode; + op.u.des3.action = des3->action; + + /* + * All supported key sizes fit in a single (32-byte) KSB entry and + * (like AES) must be in little endian format. Use the 256-bit byte + * swap passthru option to convert from big endian to little endian. + */ + ret = ccp_init_dm_workarea(&key, cmd_q, + CCP_DES3_KEY_SB_COUNT * CCP_SB_BYTES, + DMA_TO_DEVICE); + if (ret) + return ret; + + /* + * The contents of the key triplet are in the reverse order of what + * is required by the engine. Copy the 3 pieces individually to put + * them where they belong. + */ + dm_offset = CCP_SB_BYTES - des3->key_len; /* Basic offset */ + + len_singlekey = des3->key_len / 3; + ccp_set_dm_area(&key, dm_offset + 2 * len_singlekey, + des3->key, 0, len_singlekey); + ccp_set_dm_area(&key, dm_offset + len_singlekey, + des3->key, len_singlekey, len_singlekey); + ccp_set_dm_area(&key, dm_offset, + des3->key, 2 * len_singlekey, len_singlekey); + + /* Copy the key to the SB */ + ret = ccp_copy_to_sb(cmd_q, &key, op.jobid, op.sb_key, + CCP_PASSTHRU_BYTESWAP_256BIT); + if (ret) { + cmd->engine_error = cmd_q->cmd_error; + goto e_key; + } + + /* + * The DES3 context fits in a single (32-byte) KSB entry and + * must be in little endian format. Use the 256-bit byte swap + * passthru option to convert from big endian to little endian. 
+ */ + if (des3->mode != CCP_DES3_MODE_ECB) { + u32 load_mode; + + op.sb_ctx = cmd_q->sb_ctx; + + ret = ccp_init_dm_workarea(&ctx, cmd_q, + CCP_DES3_CTX_SB_COUNT * CCP_SB_BYTES, + DMA_BIDIRECTIONAL); + if (ret) + goto e_key; + + /* Load the context into the LSB */ + dm_offset = CCP_SB_BYTES - des3->iv_len; + ccp_set_dm_area(&ctx, dm_offset, des3->iv, 0, des3->iv_len); + + if (cmd_q->ccp->vdata->version == CCP_VERSION(3, 0)) + load_mode = CCP_PASSTHRU_BYTESWAP_NOOP; + else + load_mode = CCP_PASSTHRU_BYTESWAP_256BIT; + ret = ccp_copy_to_sb(cmd_q, &ctx, op.jobid, op.sb_ctx, + load_mode); + if (ret) { + cmd->engine_error = cmd_q->cmd_error; + goto e_ctx; + } + } + + /* + * Prepare the input and output data workareas. For in-place + * operations we need to set the dma direction to BIDIRECTIONAL + * and copy the src workarea to the dst workarea. + */ + if (sg_virt(des3->src) == sg_virt(des3->dst)) + in_place = true; + + ret = ccp_init_data(&src, cmd_q, des3->src, des3->src_len, + DES3_EDE_BLOCK_SIZE, + in_place ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE); + if (ret) + goto e_ctx; + + if (in_place) + dst = src; + else { + ret = ccp_init_data(&dst, cmd_q, des3->dst, des3->src_len, + DES3_EDE_BLOCK_SIZE, DMA_FROM_DEVICE); + if (ret) + goto e_src; + } + + /* Send data to the CCP DES3 engine */ + while (src.sg_wa.bytes_left) { + ccp_prepare_data(&src, &dst, &op, DES3_EDE_BLOCK_SIZE, true); + if (!src.sg_wa.bytes_left) { + op.eom = 1; + + /* Since we don't retrieve the context in ECB mode + * we have to wait for the operation to complete + * on the last piece of data + */ + op.soc = 0; + } + + ret = cmd_q->ccp->vdata->perform->des3(&op); + if (ret) { + cmd->engine_error = cmd_q->cmd_error; + goto e_dst; + } + + ccp_process_data(&src, &dst, &op); + } + + if (des3->mode != CCP_DES3_MODE_ECB) { + /* Retrieve the context and make BE */ + ret = ccp_copy_from_sb(cmd_q, &ctx, op.jobid, op.sb_ctx, + CCP_PASSTHRU_BYTESWAP_256BIT); + if (ret) { + cmd->engine_error = cmd_q->cmd_error; + goto e_dst; + } + + /* ...but we only need the last DES3_EDE_BLOCK_SIZE bytes */ + if (cmd_q->ccp->vdata->version == CCP_VERSION(3, 0)) + dm_offset = CCP_SB_BYTES - des3->iv_len; + else + dm_offset = 0; + ccp_get_dm_area(&ctx, dm_offset, des3->iv, 0, + DES3_EDE_BLOCK_SIZE); + } +e_dst: + if (!in_place) + ccp_free_data(&dst, cmd_q); + +e_src: + ccp_free_data(&src, cmd_q); + +e_ctx: + if (des3->mode != CCP_DES3_MODE_ECB) + ccp_dm_free(&ctx); + +e_key: + ccp_dm_free(&key); + + return ret; +} + static int ccp_run_sha_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd) { struct ccp_sha_engine *sha = &cmd->u.sha; @@ -1903,6 +2098,9 @@ int ccp_run_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd) case CCP_ENGINE_XTS_AES_128: ret = ccp_run_xts_aes_cmd(cmd_q, cmd); break; + case CCP_ENGINE_DES3: + ret = ccp_run_des3_cmd(cmd_q, cmd); + break; case CCP_ENGINE_SHA: ret = ccp_run_sha_cmd(cmd_q, cmd); break; diff --git a/include/linux/ccp.h b/include/linux/ccp.h index 90a1fbe84219..fa0261748920 100644 --- a/include/linux/ccp.h +++ b/include/linux/ccp.h @@ -292,6 +292,60 @@ struct ccp_sha_engine { * final sha cmd */ }; +/***** 3DES engine *****/ +enum ccp_des3_mode { + CCP_DES3_MODE_ECB = 0, + CCP_DES3_MODE_CBC, + CCP_DES3_MODE_CFB, + CCP_DES3_MODE__LAST, +}; + +enum ccp_des3_type { + CCP_DES3_TYPE_168 = 1, + CCP_DES3_TYPE__LAST, + }; + +enum ccp_des3_action { + CCP_DES3_ACTION_DECRYPT = 0, + CCP_DES3_ACTION_ENCRYPT, + CCP_DES3_ACTION__LAST, +}; + +/** + * struct ccp_des3_engine - CCP 3DES operation + * @type: Type of 3DES operation + * @mode:
cipher mode + * @action: 3DES operation (decrypt/encrypt) + * @key: key to be used for this 3DES operation + * @key_len: length of key (in bytes) + * @iv: IV to be used for this 3DES operation + * @iv_len: length in bytes of iv + * @src: input data to be used for this operation + * @src_len: length of input data used for this operation (in bytes) + * @dst: output data produced by this operation + * + * Variables required to be set when calling ccp_enqueue_cmd(): + * - type, mode, action, key, key_len, src, dst, src_len + * - iv, iv_len for any mode other than ECB + * + * The iv variable is used as both input and output. On completion of the + * 3DES operation the new IV overwrites the old IV. + */ +struct ccp_des3_engine { + enum ccp_des3_type type; + enum ccp_des3_mode mode; + enum ccp_des3_action action; + + struct scatterlist *key; + u32 key_len; /* In bytes */ + + struct scatterlist *iv; + u32 iv_len; /* In bytes */ + + struct scatterlist *src, *dst; + u64 src_len; /* In bytes */ +}; + /***** RSA engine *****/ /** * struct ccp_rsa_engine - CCP RSA operation @@ -541,7 +595,7 @@ struct ccp_ecc_engine { enum ccp_engine { CCP_ENGINE_AES = 0, CCP_ENGINE_XTS_AES_128, - CCP_ENGINE_RSVD1, + CCP_ENGINE_DES3, CCP_ENGINE_SHA, CCP_ENGINE_RSA, CCP_ENGINE_PASSTHRU, @@ -589,6 +643,7 @@ struct ccp_cmd { union { struct ccp_aes_engine aes; struct ccp_xts_aes_engine xts; + struct ccp_des3_engine des3; struct ccp_sha_engine sha; struct ccp_rsa_engine rsa; struct ccp_passthru_engine passthru; -- cgit v1.2.3
From 36cf515b9bbe298e1ce7384620f0d4ec45ad3328 Mon Sep 17 00:00:00 2001 From: Gary R Hook Date: Wed, 15 Mar 2017 13:21:01 -0500 Subject: crypto: ccp - Enable support for AES GCM on v5 CCPs A version 5 device provides the primitive commands required for AES GCM. This patch adds support for en/decryption. Signed-off-by: Gary R Hook Signed-off-by: Herbert Xu --- drivers/crypto/ccp/Makefile | 1 + drivers/crypto/ccp/ccp-crypto-aes-galois.c | 252 +++++++++++++++++++++++++++++ drivers/crypto/ccp/ccp-crypto-main.c | 12 ++ drivers/crypto/ccp/ccp-crypto.h | 14 ++ drivers/crypto/ccp/ccp-ops.c | 252 +++++++++++++++++++++++++++++ include/linux/ccp.h | 9 ++ 6 files changed, 540 insertions(+) create mode 100644 drivers/crypto/ccp/ccp-crypto-aes-galois.c (limited to 'include/linux') diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile index d2044b793aa8..60919a3ec53b 100644 --- a/drivers/crypto/ccp/Makefile +++ b/drivers/crypto/ccp/Makefile @@ -12,5 +12,6 @@ ccp-crypto-objs := ccp-crypto-main.o \ ccp-crypto-aes.o \ ccp-crypto-aes-cmac.o \ ccp-crypto-aes-xts.o \ + ccp-crypto-aes-galois.o \ ccp-crypto-des3.o \ ccp-crypto-sha.o diff --git a/drivers/crypto/ccp/ccp-crypto-aes-galois.c b/drivers/crypto/ccp/ccp-crypto-aes-galois.c new file mode 100644 index 000000000000..38ee6f348ea9 --- /dev/null +++ b/drivers/crypto/ccp/ccp-crypto-aes-galois.c @@ -0,0 +1,252 @@ +/* + * AMD Cryptographic Coprocessor (CCP) AES GCM crypto API support + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. + * + * Author: Gary R Hook + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ccp-crypto.h" + +#define AES_GCM_IVSIZE 12 + +static int ccp_aes_gcm_complete(struct crypto_async_request *async_req, int ret) +{ + return ret; +} + +static int ccp_aes_gcm_setkey(struct crypto_aead *tfm, const u8 *key, + unsigned int key_len) +{ + struct ccp_ctx *ctx = crypto_aead_ctx(tfm); + + switch (key_len) { + case AES_KEYSIZE_128: + ctx->u.aes.type = CCP_AES_TYPE_128; + break; + case AES_KEYSIZE_192: + ctx->u.aes.type = CCP_AES_TYPE_192; + break; + case AES_KEYSIZE_256: + ctx->u.aes.type = CCP_AES_TYPE_256; + break; + default: + crypto_aead_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + ctx->u.aes.mode = CCP_AES_MODE_GCM; + ctx->u.aes.key_len = key_len; + + memcpy(ctx->u.aes.key, key, key_len); + sg_init_one(&ctx->u.aes.key_sg, ctx->u.aes.key, key_len); + + return 0; +} + +static int ccp_aes_gcm_setauthsize(struct crypto_aead *tfm, + unsigned int authsize) +{ + return 0; +} + +static int ccp_aes_gcm_crypt(struct aead_request *req, bool encrypt) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct ccp_ctx *ctx = crypto_aead_ctx(tfm); + struct ccp_aes_req_ctx *rctx = aead_request_ctx(req); + struct scatterlist *iv_sg = NULL; + unsigned int iv_len = 0; + int i; + int ret = 0; + + if (!ctx->u.aes.key_len) + return -EINVAL; + + if (ctx->u.aes.mode != CCP_AES_MODE_GCM) + return -EINVAL; + + if (!req->iv) + return -EINVAL; + + /* + * 5 parts: + * plaintext/ciphertext input + * AAD + * key + * IV + * Destination+tag buffer + */ + + /* Prepare the IV: 12 bytes + an integer (counter) */ + memcpy(rctx->iv, req->iv, AES_GCM_IVSIZE); + for (i = 0; i < 3; i++) + rctx->iv[i + AES_GCM_IVSIZE] = 0; + rctx->iv[AES_BLOCK_SIZE - 1] = 1; + + /* Set up a scatterlist for the IV */ + iv_sg = &rctx->iv_sg; + iv_len = AES_BLOCK_SIZE; + sg_init_one(iv_sg, rctx->iv, iv_len); + + /* The AAD + plaintext are concatenated in the src buffer */ + memset(&rctx->cmd, 0, sizeof(rctx->cmd)); + INIT_LIST_HEAD(&rctx->cmd.entry); + rctx->cmd.engine = CCP_ENGINE_AES; + rctx->cmd.u.aes.type = ctx->u.aes.type; + rctx->cmd.u.aes.mode = ctx->u.aes.mode; + rctx->cmd.u.aes.action = encrypt; + rctx->cmd.u.aes.key = &ctx->u.aes.key_sg; + rctx->cmd.u.aes.key_len = ctx->u.aes.key_len; + rctx->cmd.u.aes.iv = iv_sg; + rctx->cmd.u.aes.iv_len = iv_len; + rctx->cmd.u.aes.src = req->src; + rctx->cmd.u.aes.src_len = req->cryptlen; + rctx->cmd.u.aes.aad_len = req->assoclen; + + /* The cipher text + the tag are in the dst buffer */ + rctx->cmd.u.aes.dst = req->dst; + + ret = ccp_crypto_enqueue_request(&req->base, &rctx->cmd); + + return ret; +} + +static int ccp_aes_gcm_encrypt(struct aead_request *req) +{ + return ccp_aes_gcm_crypt(req, CCP_AES_ACTION_ENCRYPT); +} + +static int ccp_aes_gcm_decrypt(struct aead_request *req) +{ + return ccp_aes_gcm_crypt(req, CCP_AES_ACTION_DECRYPT); +} + +static int ccp_aes_gcm_cra_init(struct crypto_aead *tfm) +{ + struct ccp_ctx *ctx = crypto_aead_ctx(tfm); + + ctx->complete = ccp_aes_gcm_complete; + ctx->u.aes.key_len = 0; + + crypto_aead_set_reqsize(tfm, sizeof(struct ccp_aes_req_ctx)); + + return 0; +} + +static void ccp_aes_gcm_cra_exit(struct crypto_tfm *tfm) +{ +} + +static struct aead_alg ccp_aes_gcm_defaults = { + .setkey = ccp_aes_gcm_setkey, + .setauthsize = ccp_aes_gcm_setauthsize, + .encrypt = ccp_aes_gcm_encrypt, + .decrypt = ccp_aes_gcm_decrypt, + .init = ccp_aes_gcm_cra_init, + .ivsize = AES_GCM_IVSIZE, + .maxauthsize = 
AES_BLOCK_SIZE, + .base = { + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | + CRYPTO_ALG_ASYNC | + CRYPTO_ALG_KERN_DRIVER_ONLY | + CRYPTO_ALG_NEED_FALLBACK, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct ccp_ctx), + .cra_priority = CCP_CRA_PRIORITY, + .cra_type = &crypto_ablkcipher_type, + .cra_exit = ccp_aes_gcm_cra_exit, + .cra_module = THIS_MODULE, + }, +}; + +struct ccp_aes_aead_def { + enum ccp_aes_mode mode; + unsigned int version; + const char *name; + const char *driver_name; + unsigned int blocksize; + unsigned int ivsize; + struct aead_alg *alg_defaults; +}; + +static struct ccp_aes_aead_def aes_aead_algs[] = { + { + .mode = CCP_AES_MODE_GHASH, + .version = CCP_VERSION(5, 0), + .name = "gcm(aes)", + .driver_name = "gcm-aes-ccp", + .blocksize = 1, + .ivsize = AES_BLOCK_SIZE, + .alg_defaults = &ccp_aes_gcm_defaults, + }, +}; + +static int ccp_register_aes_aead(struct list_head *head, + const struct ccp_aes_aead_def *def) +{ + struct ccp_crypto_aead *ccp_aead; + struct aead_alg *alg; + int ret; + + ccp_aead = kzalloc(sizeof(*ccp_aead), GFP_KERNEL); + if (!ccp_aead) + return -ENOMEM; + + INIT_LIST_HEAD(&ccp_aead->entry); + + ccp_aead->mode = def->mode; + + /* Copy the defaults and override as necessary */ + alg = &ccp_aead->alg; + *alg = *def->alg_defaults; + snprintf(alg->base.cra_name, CRYPTO_MAX_ALG_NAME, "%s", def->name); + snprintf(alg->base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s", + def->driver_name); + alg->base.cra_blocksize = def->blocksize; + alg->base.cra_ablkcipher.ivsize = def->ivsize; + + ret = crypto_register_aead(alg); + if (ret) { + pr_err("%s ablkcipher algorithm registration error (%d)\n", + alg->base.cra_name, ret); + kfree(ccp_aead); + return ret; + } + + list_add(&ccp_aead->entry, head); + + return 0; +} + +int ccp_register_aes_aeads(struct list_head *head) +{ + int i, ret; + unsigned int ccpversion = ccp_version(); + + for (i = 0; i < ARRAY_SIZE(aes_aead_algs); i++) { + if (aes_aead_algs[i].version > ccpversion) + continue; + ret = ccp_register_aes_aead(head, &aes_aead_algs[i]); + if (ret) + return ret; + } + + return 0; +} diff --git a/drivers/crypto/ccp/ccp-crypto-main.c b/drivers/crypto/ccp/ccp-crypto-main.c index 3f1e36d7a8bf..8dccbddabef1 100644 --- a/drivers/crypto/ccp/ccp-crypto-main.c +++ b/drivers/crypto/ccp/ccp-crypto-main.c @@ -40,6 +40,7 @@ MODULE_PARM_DESC(des3_disable, "Disable use of 3DES - any non-zero value"); /* List heads for the supported algorithms */ static LIST_HEAD(hash_algs); static LIST_HEAD(cipher_algs); +static LIST_HEAD(aead_algs); /* For any tfm, requests for that tfm must be returned on the order * received. 
With multiple queues available, the CCP can process more @@ -339,6 +340,10 @@ static int ccp_register_algs(void) ret = ccp_register_aes_xts_algs(&cipher_algs); if (ret) return ret; + + ret = ccp_register_aes_aeads(&aead_algs); + if (ret) + return ret; } if (!des3_disable) { @@ -360,6 +365,7 @@ static void ccp_unregister_algs(void) { struct ccp_crypto_ahash_alg *ahash_alg, *ahash_tmp; struct ccp_crypto_ablkcipher_alg *ablk_alg, *ablk_tmp; + struct ccp_crypto_aead *aead_alg, *aead_tmp; list_for_each_entry_safe(ahash_alg, ahash_tmp, &hash_algs, entry) { crypto_unregister_ahash(&ahash_alg->alg); @@ -372,6 +378,12 @@ static void ccp_unregister_algs(void) list_del(&ablk_alg->entry); kfree(ablk_alg); } + + list_for_each_entry_safe(aead_alg, aead_tmp, &aead_algs, entry) { + crypto_unregister_aead(&aead_alg->alg); + list_del(&aead_alg->entry); + kfree(aead_alg); + } } static int ccp_crypto_init(void) diff --git a/drivers/crypto/ccp/ccp-crypto.h b/drivers/crypto/ccp/ccp-crypto.h index 8c8bd3ff9f48..dd5bf15f06e5 100644 --- a/drivers/crypto/ccp/ccp-crypto.h +++ b/drivers/crypto/ccp/ccp-crypto.h @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include #include @@ -35,6 +37,14 @@ struct ccp_crypto_ablkcipher_alg { struct crypto_alg alg; }; +struct ccp_crypto_aead { + struct list_head entry; + + u32 mode; + + struct aead_alg alg; +}; + struct ccp_crypto_ahash_alg { struct list_head entry; @@ -97,6 +107,9 @@ struct ccp_aes_req_ctx { struct scatterlist iv_sg; u8 iv[AES_BLOCK_SIZE]; + struct scatterlist tag_sg; + u8 tag[AES_BLOCK_SIZE]; + /* Fields used for RFC3686 requests */ u8 *rfc3686_info; u8 rfc3686_iv[AES_BLOCK_SIZE]; @@ -233,6 +246,7 @@ struct scatterlist *ccp_crypto_sg_table_add(struct sg_table *table, int ccp_register_aes_algs(struct list_head *head); int ccp_register_aes_cmac_algs(struct list_head *head); int ccp_register_aes_xts_algs(struct list_head *head); +int ccp_register_aes_aeads(struct list_head *head); int ccp_register_sha_algs(struct list_head *head); int ccp_register_des3_algs(struct list_head *head); diff --git a/drivers/crypto/ccp/ccp-ops.c b/drivers/crypto/ccp/ccp-ops.c index 0de961a22ab4..c0dfdacbdff5 100644 --- a/drivers/crypto/ccp/ccp-ops.c +++ b/drivers/crypto/ccp/ccp-ops.c @@ -601,6 +601,255 @@ e_key: return ret; } +static int ccp_run_aes_gcm_cmd(struct ccp_cmd_queue *cmd_q, + struct ccp_cmd *cmd) +{ + struct ccp_aes_engine *aes = &cmd->u.aes; + struct ccp_dm_workarea key, ctx, final_wa, tag; + struct ccp_data src, dst; + struct ccp_data aad; + struct ccp_op op; + + unsigned long long *final; + unsigned int dm_offset; + unsigned int ilen; + bool in_place = true; /* Default value */ + int ret; + + struct scatterlist *p_inp, sg_inp[2]; + struct scatterlist *p_tag, sg_tag[2]; + struct scatterlist *p_outp, sg_outp[2]; + struct scatterlist *p_aad; + + if (!aes->iv) + return -EINVAL; + + if (!((aes->key_len == AES_KEYSIZE_128) || + (aes->key_len == AES_KEYSIZE_192) || + (aes->key_len == AES_KEYSIZE_256))) + return -EINVAL; + + if (!aes->key) /* Gotta have a key SGL */ + return -EINVAL; + + /* First, decompose the source buffer into AAD & PT, + * and the destination buffer into AAD, CT & tag, or + * the input into CT & tag. + * It is expected that the input and output SGs will + * be valid, even if the AAD and input lengths are 0. 
+ */ + p_aad = aes->src; + p_inp = scatterwalk_ffwd(sg_inp, aes->src, aes->aad_len); + p_outp = scatterwalk_ffwd(sg_outp, aes->dst, aes->aad_len); + if (aes->action == CCP_AES_ACTION_ENCRYPT) { + ilen = aes->src_len; + p_tag = scatterwalk_ffwd(sg_tag, p_outp, ilen); + } else { + /* Input length for decryption includes tag */ + ilen = aes->src_len - AES_BLOCK_SIZE; + p_tag = scatterwalk_ffwd(sg_tag, p_inp, ilen); + } + + memset(&op, 0, sizeof(op)); + op.cmd_q = cmd_q; + op.jobid = CCP_NEW_JOBID(cmd_q->ccp); + op.sb_key = cmd_q->sb_key; /* Pre-allocated */ + op.sb_ctx = cmd_q->sb_ctx; /* Pre-allocated */ + op.init = 1; + op.u.aes.type = aes->type; + + /* Copy the key to the LSB */ + ret = ccp_init_dm_workarea(&key, cmd_q, + CCP_AES_CTX_SB_COUNT * CCP_SB_BYTES, + DMA_TO_DEVICE); + if (ret) + return ret; + + dm_offset = CCP_SB_BYTES - aes->key_len; + ccp_set_dm_area(&key, dm_offset, aes->key, 0, aes->key_len); + ret = ccp_copy_to_sb(cmd_q, &key, op.jobid, op.sb_key, + CCP_PASSTHRU_BYTESWAP_256BIT); + if (ret) { + cmd->engine_error = cmd_q->cmd_error; + goto e_key; + } + + /* Copy the context (IV) to the LSB. + * There is an assumption here that the IV is 96 bits in length, plus + * a nonce of 32 bits. If no IV is present, use a zeroed buffer. + */ + ret = ccp_init_dm_workarea(&ctx, cmd_q, + CCP_AES_CTX_SB_COUNT * CCP_SB_BYTES, + DMA_BIDIRECTIONAL); + if (ret) + goto e_key; + + dm_offset = CCP_AES_CTX_SB_COUNT * CCP_SB_BYTES - aes->iv_len; + ccp_set_dm_area(&ctx, dm_offset, aes->iv, 0, aes->iv_len); + + ret = ccp_copy_to_sb(cmd_q, &ctx, op.jobid, op.sb_ctx, + CCP_PASSTHRU_BYTESWAP_256BIT); + if (ret) { + cmd->engine_error = cmd_q->cmd_error; + goto e_ctx; + } + + op.init = 1; + if (aes->aad_len > 0) { + /* Step 1: Run a GHASH over the Additional Authenticated Data */ + ret = ccp_init_data(&aad, cmd_q, p_aad, aes->aad_len, + AES_BLOCK_SIZE, + DMA_TO_DEVICE); + if (ret) + goto e_ctx; + + op.u.aes.mode = CCP_AES_MODE_GHASH; + op.u.aes.action = CCP_AES_GHASHAAD; + + while (aad.sg_wa.bytes_left) { + ccp_prepare_data(&aad, NULL, &op, AES_BLOCK_SIZE, true); + + ret = cmd_q->ccp->vdata->perform->aes(&op); + if (ret) { + cmd->engine_error = cmd_q->cmd_error; + goto e_aad; + } + + ccp_process_data(&aad, NULL, &op); + op.init = 0; + } + } + + op.u.aes.mode = CCP_AES_MODE_GCTR; + op.u.aes.action = aes->action; + + if (ilen > 0) { + /* Step 2: Run a GCTR over the plaintext */ + in_place = (sg_virt(p_inp) == sg_virt(p_outp)) ? true : false; + + ret = ccp_init_data(&src, cmd_q, p_inp, ilen, + AES_BLOCK_SIZE, + in_place ? 
DMA_BIDIRECTIONAL + : DMA_TO_DEVICE); + if (ret) + goto e_ctx; + + if (in_place) { + dst = src; + } else { + ret = ccp_init_data(&dst, cmd_q, p_outp, ilen, + AES_BLOCK_SIZE, DMA_FROM_DEVICE); + if (ret) + goto e_src; + } + + op.soc = 0; + op.eom = 0; + op.init = 1; + while (src.sg_wa.bytes_left) { + ccp_prepare_data(&src, &dst, &op, AES_BLOCK_SIZE, true); + if (!src.sg_wa.bytes_left) { + unsigned int nbytes = aes->src_len + % AES_BLOCK_SIZE; + + if (nbytes) { + op.eom = 1; + op.u.aes.size = (nbytes * 8) - 1; + } + } + + ret = cmd_q->ccp->vdata->perform->aes(&op); + if (ret) { + cmd->engine_error = cmd_q->cmd_error; + goto e_dst; + } + + ccp_process_data(&src, &dst, &op); + op.init = 0; + } + } + + /* Step 3: Update the IV portion of the context with the original IV */ + ret = ccp_copy_from_sb(cmd_q, &ctx, op.jobid, op.sb_ctx, + CCP_PASSTHRU_BYTESWAP_256BIT); + if (ret) { + cmd->engine_error = cmd_q->cmd_error; + goto e_dst; + } + + ccp_set_dm_area(&ctx, dm_offset, aes->iv, 0, aes->iv_len); + + ret = ccp_copy_to_sb(cmd_q, &ctx, op.jobid, op.sb_ctx, + CCP_PASSTHRU_BYTESWAP_256BIT); + if (ret) { + cmd->engine_error = cmd_q->cmd_error; + goto e_dst; + } + + /* Step 4: Concatenate the lengths of the AAD and source, and + * hash that 16 byte buffer. + */ + ret = ccp_init_dm_workarea(&final_wa, cmd_q, AES_BLOCK_SIZE, + DMA_BIDIRECTIONAL); + if (ret) + goto e_dst; + final = (unsigned long long *) final_wa.address; + final[0] = cpu_to_be64(aes->aad_len * 8); + final[1] = cpu_to_be64(ilen * 8); + + op.u.aes.mode = CCP_AES_MODE_GHASH; + op.u.aes.action = CCP_AES_GHASHFINAL; + op.src.type = CCP_MEMTYPE_SYSTEM; + op.src.u.dma.address = final_wa.dma.address; + op.src.u.dma.length = AES_BLOCK_SIZE; + op.dst.type = CCP_MEMTYPE_SYSTEM; + op.dst.u.dma.address = final_wa.dma.address; + op.dst.u.dma.length = AES_BLOCK_SIZE; + op.eom = 1; + op.u.aes.size = 0; + ret = cmd_q->ccp->vdata->perform->aes(&op); + if (ret) + goto e_dst; + + if (aes->action == CCP_AES_ACTION_ENCRYPT) { + /* Put the ciphered tag after the ciphertext. */ + ccp_get_dm_area(&final_wa, 0, p_tag, 0, AES_BLOCK_SIZE); + } else { + /* Does this ciphered tag match the input? 
*/ + ret = ccp_init_dm_workarea(&tag, cmd_q, AES_BLOCK_SIZE, + DMA_BIDIRECTIONAL); + if (ret) + goto e_tag; + ccp_set_dm_area(&tag, 0, p_tag, 0, AES_BLOCK_SIZE); + + ret = memcmp(tag.address, final_wa.address, AES_BLOCK_SIZE); + ccp_dm_free(&tag); + } + +e_tag: + ccp_dm_free(&final_wa); + +e_dst: + if (aes->src_len && !in_place) + ccp_free_data(&dst, cmd_q); + +e_src: + if (aes->src_len) + ccp_free_data(&src, cmd_q); + +e_aad: + if (aes->aad_len) + ccp_free_data(&aad, cmd_q); + +e_ctx: + ccp_dm_free(&ctx); + +e_key: + ccp_dm_free(&key); + + return ret; +} + static int ccp_run_aes_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd) { struct ccp_aes_engine *aes = &cmd->u.aes; @@ -614,6 +863,9 @@ static int ccp_run_aes_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd) if (aes->mode == CCP_AES_MODE_CMAC) return ccp_run_aes_cmac_cmd(cmd_q, cmd); + if (aes->mode == CCP_AES_MODE_GCM) + return ccp_run_aes_gcm_cmd(cmd_q, cmd); + if (!((aes->key_len == AES_KEYSIZE_128) || (aes->key_len == AES_KEYSIZE_192) || (aes->key_len == AES_KEYSIZE_256))) diff --git a/include/linux/ccp.h b/include/linux/ccp.h index fa0261748920..dbe8aa0f2940 100644 --- a/include/linux/ccp.h +++ b/include/linux/ccp.h @@ -123,6 +123,10 @@ enum ccp_aes_mode { CCP_AES_MODE_CFB, CCP_AES_MODE_CTR, CCP_AES_MODE_CMAC, + CCP_AES_MODE_GHASH, + CCP_AES_MODE_GCTR, + CCP_AES_MODE_GCM, + CCP_AES_MODE_GMAC, CCP_AES_MODE__LAST, }; @@ -137,6 +141,9 @@ enum ccp_aes_action { CCP_AES_ACTION_ENCRYPT, CCP_AES_ACTION__LAST, }; +/* Overloaded field */ +#define CCP_AES_GHASHAAD CCP_AES_ACTION_DECRYPT +#define CCP_AES_GHASHFINAL CCP_AES_ACTION_ENCRYPT /** * struct ccp_aes_engine - CCP AES operation @@ -181,6 +188,8 @@ struct ccp_aes_engine { struct scatterlist *cmac_key; /* K1/K2 cmac key required for * final cmac cmd */ u32 cmac_key_len; /* In bytes */ + + u32 aad_len; /* In bytes */ }; /***** XTS-AES engine *****/ -- cgit v1.2.3 From 3c7eb3cc8360736123a3139a1ec727d746de3252 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 16 Mar 2017 15:18:57 +0100 Subject: md5: remove from lib and only live in crypto The md5_transform function is no longer used any where in the tree, except for the crypto api's actual implementation of md5, so we can drop the function from lib and put it as a static function of the crypto file, where it belongs. There should be no new users of md5_transform, anyway, since there are more modern ways of doing what it once achieved. Signed-off-by: Jason A. 
Donenfeld Reviewed-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/md5.c | 95 +++++++++++++++++++++++++++++++++++++++++++++- include/linux/cryptohash.h | 5 --- lib/Makefile | 2 +- lib/md5.c | 95 ---------------------------------------------- 4 files changed, 95 insertions(+), 102 deletions(-) delete mode 100644 lib/md5.c (limited to 'include/linux') diff --git a/crypto/md5.c b/crypto/md5.c index 2355a7c25c45..f7ae1a48225b 100644 --- a/crypto/md5.c +++ b/crypto/md5.c @@ -21,9 +21,11 @@ #include #include #include -#include #include +#define MD5_DIGEST_WORDS 4 +#define MD5_MESSAGE_BYTES 64 + const u8 md5_zero_message_hash[MD5_DIGEST_SIZE] = { 0xd4, 0x1d, 0x8c, 0xd9, 0x8f, 0x00, 0xb2, 0x04, 0xe9, 0x80, 0x09, 0x98, 0xec, 0xf8, 0x42, 0x7e, @@ -47,6 +49,97 @@ static inline void cpu_to_le32_array(u32 *buf, unsigned int words) } } +#define F1(x, y, z) (z ^ (x & (y ^ z))) +#define F2(x, y, z) F1(z, x, y) +#define F3(x, y, z) (x ^ y ^ z) +#define F4(x, y, z) (y ^ (x | ~z)) + +#define MD5STEP(f, w, x, y, z, in, s) \ + (w += f(x, y, z) + in, w = (w<<s | w>>(32-s)) + x) + +static void md5_transform(__u32 *hash, __u32 const *in) +{ + u32 a, b, c, d; + + a = hash[0]; + b = hash[1]; + c = hash[2]; + d = hash[3]; + + MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); + MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); + MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); + MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); + MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); + MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); + MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); + MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); + MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); + MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); + MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); + MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); + MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); + MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); + MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); + MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); + + MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); + MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); + MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); + MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); + MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); + MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); + MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); + MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); + MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); + MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); + MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); + MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); + MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); + MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); + MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); + MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); + + MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); + MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); + MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); + MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23); + MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); + MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); + MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); + MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); + MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); + MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); + MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); + MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); + MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); + MD5STEP(F3, d, a, b, c, in[12] +
0xe6db99e5, 11); + MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); + MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); + + MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); + MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); + MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); + MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); + MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); + MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); + MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); + MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); + MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); + MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); + MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); + MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); + MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); + MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); + MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); + MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); + + hash[0] += a; + hash[1] += b; + hash[2] += c; + hash[3] += d; +} + static inline void md5_transform_helper(struct md5_state *ctx) { le32_to_cpu_array(ctx->block, sizeof(ctx->block) / sizeof(u32)); diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h index 3252799832cf..df4d3e943d28 100644 --- a/include/linux/cryptohash.h +++ b/include/linux/cryptohash.h @@ -10,9 +10,4 @@ void sha_init(__u32 *buf); void sha_transform(__u32 *digest, const char *data, __u32 *W); -#define MD5_DIGEST_WORDS 4 -#define MD5_MESSAGE_BYTES 64 - -void md5_transform(__u32 *hash, __u32 const *in); - #endif diff --git a/lib/Makefile b/lib/Makefile index 320ac46a8725..acbc16bed9af 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -19,7 +19,7 @@ KCOV_INSTRUMENT_dynamic_debug.o := n lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o \ - sha1.o chacha20.o md5.o irq_regs.o argv_split.o \ + sha1.o chacha20.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o siphash.o \ diff --git a/lib/md5.c b/lib/md5.c deleted file mode 100644 index bb0cd01d356d..000000000000 --- a/lib/md5.c +++ /dev/null @@ -1,95 +0,0 @@ -#include -#include -#include - -#define F1(x, y, z) (z ^ (x & (y ^ z))) -#define F2(x, y, z) F1(z, x, y) -#define F3(x, y, z) (x ^ y ^ z) -#define F4(x, y, z) (y ^ (x | ~z)) - -#define MD5STEP(f, w, x, y, z, in, s) \ - (w += f(x, y, z) + in, w = (w<<s | w>>(32-s)) + x) - -void md5_transform(__u32 *hash, __u32 const *in) -{ - u32 a, b, c, d; - - a = hash[0]; - b = hash[1]; - c = hash[2]; - d = hash[3]; - - MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); - MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); - MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); - MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); - MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); - MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); - MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); - MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); - MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); - MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); - MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); - MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); - MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); - MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); - MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); - MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); - - MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); - MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); -
MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); - MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); - MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); - MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); - MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); - MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); - MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); - MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); - MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); - MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); - MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); - MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); - MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); - MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); - - MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); - MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); - MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); - MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23); - MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); - MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); - MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); - MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); - MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); - MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); - MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); - MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); - MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); - MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); - MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); - MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); - - MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); - MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); - MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); - MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); - MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); - MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); - MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); - MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); - MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); - MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); - MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); - MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); - MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); - MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); - MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); - MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); - - hash[0] += a; - hash[1] += b; - hash[2] += c; - hash[3] += d; -} -EXPORT_SYMBOL(md5_transform); -- cgit v1.2.3 From 0ba42a599fbf59a55c1ffedb980be3726c734433 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Tue, 7 Mar 2017 20:48:02 +0900 Subject: treewide: Fix typo in xml/driver-api/basics.xml This patch fixes spelling typos found in Documentation/output/xml/driver-api/basics.xml. The xml file is generated from comments in the source, so the typos had to be fixed in the comments.
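The dependency runs from source to generated docs: the XML under Documentation/output/ is produced by extracting kernel-doc comments from the C sources, so a misspelling in a comment reappears verbatim in basics.xml and the only durable fix is in the comment itself. A minimal kernel-doc block of the kind being fixed is sketched below; the struct and field names are invented for illustration and are not part of this patch.

/**
 * struct example_stats - snapshot of example counters
 * @hits:   number of lookups that were satisfied
 * @misses: number of lookups that were not
 *
 * The documentation build copies this comment verbatim, so a typo
 * such as "snaphsot" in the summary line above would propagate
 * straight into the generated XML.
 */
struct example_stats {
	unsigned long hits;
	unsigned long misses;
};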
Signed-off-by: Masanari Iida Signed-off-by: Jiri Kosina --- include/linux/sched.h | 2 +- kernel/sched/fair.c | 2 +- kernel/time/hrtimer.c | 2 +- kernel/time/timer.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d67eee84fd43..5ce85e861901 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -186,7 +186,7 @@ extern long io_schedule_timeout(long timeout); extern void io_schedule(void); /** - * struct prev_cputime - snaphsot of system and user cputime + * struct prev_cputime - snapshot of system and user cputime * @utime: time spent in user mode * @stime: time spent in system mode * @lock: protects the above two fields diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dea138964b91..3ae75f5a69fc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7597,7 +7597,7 @@ next_group: /** * check_asym_packing - Check to see if the group is packed into the - * sched doman. + * sched domain. * * This is primarily intended to used at the sibling level. Some * cores like POWER7 prefer to use lower numbered SMT threads. In the diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index ec08f527d7ee..a7098f0737ed 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -987,7 +987,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); * Returns: * 0 when the timer was not active * 1 when the timer was active - * -1 when the timer is currently excuting the callback function and + * -1 when the timer is currently executing the callback function and * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 1dc0256bfb6e..17610d2ddab1 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1120,7 +1120,7 @@ void add_timer_on(struct timer_list *timer, int cpu) EXPORT_SYMBOL_GPL(add_timer_on); /** - * del_timer - deactive a timer. + * del_timer - deactivate a timer. * @timer: the timer to be deactivated * * del_timer() deactivates a timer - this works on both active and inactive -- cgit v1.2.3 From 8f48cfabac57977338f5c828ed3e12fc34373c7d Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 24 Mar 2017 22:13:35 +0800 Subject: cgroup: drop duplicate header nsproxy.h Drop duplicate header nsproxy.h from linux/cgroup.h. Signed-off-by: Geliang Tang Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 44129793c7b8..34b4a298e52e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From e725c731e3bb1e892e7b564c945b121cb41d1087 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 3 Mar 2017 13:37:33 -0500 Subject: tracing: Split tracing initialization into two for early initialization Create an early_trace_init() function that will initialize the buffers and allow for earlier use of trace_printk(). This will also allow for future work to have function tracing start earlier at boot up.
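As a rough illustration of what the earlier init point enables: once early_trace_init() has run, right after mm_init() in start_kernel(), boot code can already log through trace_printk(). The hook below is a hypothetical sketch, not part of this patch; only trace_printk() and the __init annotation are existing interfaces.

#include <linux/init.h>
#include <linux/kernel.h>	/* trace_printk() */

/* Hypothetical early-boot probe: with early_trace_init() called
 * before this point in start_kernel(), the trace ring buffer
 * already exists, so the message below is actually recorded
 * instead of depending on tracing being set up much later.
 */
static void __init early_probe_example(void)
{
	trace_printk("early boot: allocators and trace buffers are up\n");
}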
Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 2 ++ init/main.c | 5 ++++- kernel/trace/trace.c | 6 +++++- 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 3633e8beff39..569db5589851 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -42,8 +42,10 @@ /* Main tracing buffer and events set up */ #ifdef CONFIG_TRACING void trace_init(void); +void early_trace_init(void); #else static inline void trace_init(void) { } +static inline void early_trace_init(void) { } #endif struct module; diff --git a/init/main.c b/init/main.c index f9c9d9948203..81a49e8d54cc 100644 --- a/init/main.c +++ b/init/main.c @@ -545,6 +545,9 @@ asmlinkage __visible void __init start_kernel(void) trap_init(); mm_init(); + /* trace_printk can be enabled here */ + early_trace_init(); + /* * Set up the scheduler prior starting any interrupts (such as the * timer interrupt). Full topology setup happens at smp_init() @@ -570,7 +573,7 @@ asmlinkage __visible void __init start_kernel(void) rcu_init(); - /* trace_printk() and trace points may be used after this */ + /* Trace events are available after this */ trace_init(); context_tracking_init(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f35109514a01..6757561d9617 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7999,7 +7999,7 @@ out: return ret; } -void __init trace_init(void) +void __init early_trace_init(void) { if (tracepoint_printk) { tracepoint_print_iter = @@ -8010,6 +8010,10 @@ void __init trace_init(void) static_key_enable(&tracepoint_printk_key.key); } tracer_alloc_buffers(); +} + +void __init trace_init(void) +{ trace_event_init(); } -- cgit v1.2.3 From 6f8802852f7e58a12177a86179803b9efaad98e2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 17 Mar 2017 00:12:29 +0800 Subject: block: introduce bio_copy_data_partial Turns out we can use bio_copy_data in raid1's write behind, and we can make alloc_behind_pages() more clean/efficient, but we need to partial version of bio_copy_data(). Signed-off-by: Ming Lei Reviewed-by: Jens Axboe Signed-off-by: Shaohua Li --- block/bio.c | 60 +++++++++++++++++++++++++++++++++++++++++------------ include/linux/bio.h | 2 ++ 2 files changed, 49 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index e75878f8b14a..1ccff0dace89 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1025,19 +1025,8 @@ int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) } EXPORT_SYMBOL(bio_alloc_pages); -/** - * bio_copy_data - copy contents of data buffers from one chain of bios to - * another - * @src: source bio list - * @dst: destination bio list - * - * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats - * @src and @dst as linked lists of bios. - * - * Stops when it reaches the end of either @src or @dst - that is, copies - * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). 
- */ -void bio_copy_data(struct bio *dst, struct bio *src) +static void __bio_copy_data(struct bio *dst, struct bio *src, + int offset, int size) { struct bvec_iter src_iter, dst_iter; struct bio_vec src_bv, dst_bv; @@ -1047,6 +1036,12 @@ void bio_copy_data(struct bio *dst, struct bio *src) src_iter = src->bi_iter; dst_iter = dst->bi_iter; + /* for supporting partial copy */ + if (offset || size != src->bi_iter.bi_size) { + bio_advance_iter(src, &src_iter, offset); + src_iter.bi_size = size; + } + while (1) { if (!src_iter.bi_size) { src = src->bi_next; @@ -1083,8 +1078,47 @@ void bio_copy_data(struct bio *dst, struct bio *src) bio_advance_iter(dst, &dst_iter, bytes); } } + +/** + * bio_copy_data - copy contents of data buffers from one chain of bios to + * another + * @src: source bio list + * @dst: destination bio list + * + * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats + * @src and @dst as linked lists of bios. + * + * Stops when it reaches the end of either @src or @dst - that is, copies + * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). + */ +void bio_copy_data(struct bio *dst, struct bio *src) +{ + __bio_copy_data(dst, src, 0, src->bi_iter.bi_size); +} EXPORT_SYMBOL(bio_copy_data); +/** + * bio_copy_data_partial - copy partial contents of data buffers from one + * chain of bios to another + * @dst: destination bio list + * @src: source bio list + * @offset: starting copy from the offset + * @size: how many bytes to copy + * + * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats + * @src and @dst as linked lists of bios. + * + * Stops when it reaches the end of either @src or @dst - that is, copies + * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). + */ +void bio_copy_data_partial(struct bio *dst, struct bio *src, + int offset, int size) +{ + __bio_copy_data(dst, src, offset, size); + +} +EXPORT_SYMBOL(bio_copy_data_partial); + struct bio_map_data { int is_our_pages; struct iov_iter iter; diff --git a/include/linux/bio.h b/include/linux/bio.h index 8e521194f6fc..42b62a0288b0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -468,6 +468,8 @@ static inline void bio_flush_dcache_pages(struct bio *bi) #endif extern void bio_copy_data(struct bio *dst, struct bio *src); +extern void bio_copy_data_partial(struct bio *dst, struct bio *src, + int offset, int size); extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); extern void bio_free_pages(struct bio *bio); -- cgit v1.2.3 From 264b88c9e5c86c92ca1d67689779362760baf651 Mon Sep 17 00:00:00 2001 From: Harald Geyer Date: Thu, 23 Feb 2017 17:06:52 +0000 Subject: regulator: core: Add new notification for enabling of regulator This is useful for devices, which need some time to start up, to help the drivers track how long the supply has been up already. Ie whether it can safely talk to the HW or needs to wait. 
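A consumer driver might use the new event to timestamp power-up and then wait out the part's start-up time before its first register access; a minimal sketch follows. struct my_chip, my_chip_supply_event() and MY_CHIP_STARTUP_MS are invented for illustration; regulator_register_notifier() and REGULATOR_EVENT_ENABLE are the real interfaces.

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/notifier.h>
#include <linux/regulator/consumer.h>

#define MY_CHIP_STARTUP_MS	25	/* assumed start-up time of the part */

struct my_chip {
	struct regulator *supply;
	struct notifier_block nb;
	unsigned long enabled_at;	/* jiffies when the supply was switched on */
};

static int my_chip_supply_event(struct notifier_block *nb,
				unsigned long event, void *data)
{
	struct my_chip *chip = container_of(nb, struct my_chip, nb);

	if (event & REGULATOR_EVENT_ENABLE)
		chip->enabled_at = jiffies;	/* powered, but not necessarily ready */
	return NOTIFY_OK;
}

/* At probe time the driver hooks up the notifier:
 *	chip->nb.notifier_call = my_chip_supply_event;
 *	regulator_register_notifier(chip->supply, &chip->nb);
 * and before touching the hardware it can then wait until
 * time_after(jiffies, chip->enabled_at +
 *	      msecs_to_jiffies(MY_CHIP_STARTUP_MS)) holds.
 */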
Signed-off-by: Harald Geyer Signed-off-by: Mark Brown --- drivers/regulator/core.c | 2 ++ include/linux/regulator/consumer.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 04baac9a165b..6b9bb1b00226 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -2162,6 +2162,8 @@ static int _regulator_enable(struct regulator_dev *rdev) if (ret < 0) return ret; + _notifier_call_chain(rdev, REGULATOR_EVENT_ENABLE, + NULL); } else if (ret < 0) { rdev_err(rdev, "is_enabled() failed: %d\n", ret); return ret; diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index ea0fffa5faeb..df176d7c2b87 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -119,6 +119,7 @@ struct regmap; #define REGULATOR_EVENT_ABORT_VOLTAGE_CHANGE 0x200 #define REGULATOR_EVENT_PRE_DISABLE 0x400 #define REGULATOR_EVENT_ABORT_DISABLE 0x800 +#define REGULATOR_EVENT_ENABLE 0x1000 /* * Regulator errors that can be queried using regulator_get_error_flags -- cgit v1.2.3 From e6e14f63d744cede856ba5d517d6b266c9cfbf41 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 23 Mar 2017 10:01:17 -0700 Subject: of_mdio: Correct check against CONFIG_OF CONFIG_OF_MDIO is actually what triggers the build of drivers/of/of_mdio.c, so providing inline stubs when CONFIG_OF_MDIO=y should be based on that symbol as well. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/of_mdio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h index a58cca8bcb29..ba35ba520487 100644 --- a/include/linux/of_mdio.h +++ b/include/linux/of_mdio.h @@ -12,7 +12,7 @@ #include #include -#ifdef CONFIG_OF +#if IS_ENABLED(CONFIG_OF_MDIO) extern int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np); extern struct phy_device *of_phy_find_device(struct device_node *phy_np); extern struct phy_device *of_phy_connect(struct net_device *dev, @@ -32,7 +32,7 @@ extern int of_phy_register_fixed_link(struct device_node *np); extern void of_phy_deregister_fixed_link(struct device_node *np); extern bool of_phy_is_fixed_link(struct device_node *np); -#else /* CONFIG_OF */ +#else /* CONFIG_OF_MDIO */ static inline int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np) { /* -- cgit v1.2.3 From 90eff9096c01ba90cdae504a6b95ee87fe2556a3 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 23 Mar 2017 10:01:19 -0700 Subject: net: phy: Allow splitting MDIO bus/device support from PHYs Introduce a new configuration symbol: MDIO_DEVICE which allows building the MDIO devices and bus code, without pulling in the entire Ethernet PHY library and devices code. PHYLIB nows select MDIO_DEVICE and the relevant Makefile files are updated to reflect that. When MDIO_DEVICE (MDIO bus/device only) is selected, but not PHYLIB, we have mdio-bus.ko as a loadable module, and it does not have a module_exit() function because the safety of removing a bus class is unclear. When both MDIO_DEVICE and PHYLIB are enabled, we need to assemble everything into a common loadable module: libphy.ko because of nasty circular dependencies between phy.c, phy_device.c and mdio_bus.c which are really tough to untangle. Signed-off-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/Makefile | 2 +- drivers/net/phy/Kconfig | 60 +++++++++++++++++++++++----------------- drivers/net/phy/Makefile | 13 +++++++-- drivers/net/phy/mdio-boardinfo.c | 1 + drivers/net/phy/mdio_bus.c | 9 ++++++ include/linux/phy.h | 21 ++++++++++++-- 6 files changed, 76 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 98ed4d96987c..55f75aea283c 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -18,7 +18,7 @@ obj-$(CONFIG_MII) += mii.o obj-$(CONFIG_MDIO) += mdio.o obj-$(CONFIG_NET) += Space.o loopback.o obj-$(CONFIG_NETCONSOLE) += netconsole.o -obj-$(CONFIG_PHYLIB) += phy/ +obj-$(CONFIG_MDIO_DEVICE) += phy/ obj-$(CONFIG_RIONET) += rionet.o obj-$(CONFIG_NET_TEAM) += team/ obj-$(CONFIG_TUN) += tun.o diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 7ab4b14a43b7..60ffc9da6a28 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -2,33 +2,12 @@ # PHY Layer Configuration # -menuconfig PHYLIB - tristate "PHY Device support and infrastructure" - depends on NETDEVICES +menuconfig MDIO_DEVICE + tristate "MDIO bus device drivers" help - Ethernet controllers are usually attached to PHY - devices. This option provides infrastructure for - managing PHY devices. - -if PHYLIB - -config SWPHY - bool - -config LED_TRIGGER_PHY - bool "Support LED triggers for tracking link state" - depends on LEDS_TRIGGERS - ---help--- - Adds support for a set of LED trigger events per-PHY. Link - state change will trigger the events, for consumption by an - LED class driver. There are triggers for each link speed currently - supported by the phy, and are of the form: - :: - - Where speed is in the form: - Mbps or Gbps + MDIO devices and driver infrastructure code. -comment "MDIO bus device drivers" +if MDIO_DEVICE config MDIO_BCM_IPROC tristate "Broadcom iProc MDIO bus controller" @@ -49,6 +28,7 @@ config MDIO_BCM_UNIMAC config MDIO_BITBANG tristate "Bitbanged MDIO buses" + depends on !(MDIO_DEVICE=y && PHYLIB=m) help This module implements the MDIO bus protocol in software, for use by low level drivers that export the ability to @@ -160,6 +140,36 @@ config MDIO_XGENE This module provides a driver for the MDIO busses found in the APM X-Gene SoC's. +endif + +menuconfig PHYLIB + tristate "PHY Device support and infrastructure" + depends on NETDEVICES + select MDIO_DEVICE + help + Ethernet controllers are usually attached to PHY + devices. This option provides infrastructure for + managing PHY devices. + +if PHYLIB + +config SWPHY + bool + +config LED_TRIGGER_PHY + bool "Support LED triggers for tracking link state" + depends on LEDS_TRIGGERS + ---help--- + Adds support for a set of LED trigger events per-PHY. Link + state change will trigger the events, for consumption by an + LED class driver. 
There are triggers for each link speed currently + supported by the phy, and are of the form: + :: + + Where speed is in the form: + Mbps or Gbps + + comment "MII PHY device drivers" config AMD_PHY diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index 82d915614646..0e1ec0438c23 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -1,7 +1,16 @@ # Makefile for Linux PHY drivers and MDIO bus drivers -libphy-y := phy.o phy_device.o mdio_bus.o mdio_device.o \ - mdio-boardinfo.o phy-core.o +libphy-y := phy.o phy-core.o phy_device.o +mdio-bus-y += mdio_bus.o mdio_device.o mdio-boardinfo.o + +# PHYLIB implies MDIO_DEVICE, in that case, we have a bunch of circular +# dependencies that does not make it possible to split mdio-bus objects into a +# dedicated loadable module, so we bundle them all together into libphy.ko +ifdef CONFIG_PHYLIB +libphy-y += $(mdio-bus-y) +else +obj-$(CONFIG_MDIO_DEVICE) += mdio-bus.o +endif libphy-$(CONFIG_SWPHY) += swphy.o libphy-$(CONFIG_LED_TRIGGER_PHY) += phy_led_triggers.o diff --git a/drivers/net/phy/mdio-boardinfo.c b/drivers/net/phy/mdio-boardinfo.c index 6b988f77da08..61941e29daae 100644 --- a/drivers/net/phy/mdio-boardinfo.c +++ b/drivers/net/phy/mdio-boardinfo.c @@ -84,3 +84,4 @@ int mdiobus_register_board_info(const struct mdio_board_info *info, return 0; } +EXPORT_SYMBOL(mdiobus_register_board_info); diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index fa7d51f14869..46b468eb6e12 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -648,9 +648,18 @@ int __init mdio_bus_init(void) return ret; } +EXPORT_SYMBOL_GPL(mdio_bus_init); +#if IS_ENABLED(CONFIG_PHYLIB) void mdio_bus_exit(void) { class_unregister(&mdio_bus_class); bus_unregister(&mdio_bus_type); } +EXPORT_SYMBOL_GPL(mdio_bus_exit); +#else +module_init(mdio_bus_init); +/* no module_exit, intentional */ +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MDIO bus/device layer"); +#endif diff --git a/include/linux/phy.h b/include/linux/phy.h index 2efca6b39fba..624cecf69c28 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -745,8 +745,24 @@ int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val); struct phy_device *phy_device_create(struct mii_bus *bus, int addr, int phy_id, bool is_c45, struct phy_c45_device_ids *c45_ids); +#if IS_ENABLED(CONFIG_PHYLIB) struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45); int phy_device_register(struct phy_device *phy); +void phy_device_free(struct phy_device *phydev); +#else +static inline +struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45) +{ + return NULL; +} + +static inline int phy_device_register(struct phy_device *phy) +{ + return 0; +} + +static inline void phy_device_free(struct phy_device *phydev) { } +#endif /* CONFIG_PHYLIB */ void phy_device_remove(struct phy_device *phydev); int phy_init_hw(struct phy_device *phydev); int phy_suspend(struct phy_device *phydev); @@ -827,7 +843,6 @@ int phy_ethtool_ksettings_set(struct phy_device *phydev, int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd); int phy_start_interrupts(struct phy_device *phydev); void phy_print_status(struct phy_device *phydev); -void phy_device_free(struct phy_device *phydev); int phy_set_max_speed(struct phy_device *phydev, u32 max_speed); int phy_register_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask, @@ -854,8 +869,10 @@ int phy_ethtool_set_link_ksettings(struct net_device *ndev, const struct 
ethtool_link_ksettings *cmd); int phy_ethtool_nway_reset(struct net_device *ndev); +#if IS_ENABLED(CONFIG_PHYLIB) int __init mdio_bus_init(void); void mdio_bus_exit(void); +#endif extern struct bus_type mdio_bus_type; @@ -866,7 +883,7 @@ struct mdio_board_info { const void *platform_data; }; -#if IS_ENABLED(CONFIG_PHYLIB) +#if IS_ENABLED(CONFIG_MDIO_DEVICE) int mdiobus_register_board_info(const struct mdio_board_info *info, unsigned int n); #else -- cgit v1.2.3 From 42c269c88dc146982a54a8267f71abc99f12852a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 3 Mar 2017 16:15:39 -0500 Subject: ftrace: Allow for function tracing to record init functions on boot up Adding a hook into free_reserve_area() that informs ftrace that boot up init text is being free, lets ftrace safely remove those init functions from its records, which keeps ftrace from trying to modify text that no longer exists. Note, this still does not allow for tracing .init text of modules, as modules require different work for freeing its init code. Link: http://lkml.kernel.org/r/1488502497.7212.24.camel@linux.intel.com Cc: linux-mm@kvack.org Cc: Vlastimil Babka Cc: Mel Gorman Cc: Peter Zijlstra Requested-by: Todd Brandt Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 5 +++++ include/linux/init.h | 4 +++- kernel/trace/ftrace.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 4 ++++ scripts/recordmcount.c | 1 + scripts/recordmcount.pl | 1 + 6 files changed, 58 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 569db5589851..0276a2c487e6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -146,6 +146,10 @@ struct ftrace_ops_hash { struct ftrace_hash *filter_hash; struct mutex regex_lock; }; + +void ftrace_free_mem(void *start, void *end); +#else +static inline void ftrace_free_mem(void *start, void *end) { } #endif /* @@ -262,6 +266,7 @@ static inline int ftrace_nr_registered_ops(void) } static inline void clear_ftrace_function(void) { } static inline void ftrace_kill(void) { } +static inline void ftrace_free_mem(void *start, void *end) { } #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_STACK_TRACER diff --git a/include/linux/init.h b/include/linux/init.h index 79af0962fd52..94769d687cf0 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -39,7 +39,7 @@ /* These are for everybody (although not all archs will actually discard it in modules) */ -#define __init __section(.init.text) __cold notrace __latent_entropy +#define __init __section(.init.text) __cold __inittrace __latent_entropy #define __initdata __section(.init.data) #define __initconst __section(.init.rodata) #define __exitdata __section(.exit.data) @@ -68,8 +68,10 @@ #ifdef MODULE #define __exitused +#define __inittrace notrace #else #define __exitused __used +#define __inittrace #endif #define __exit __section(.exit.text) __exitused __cold notrace diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b9691ee8f6c1..0556a202c055 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5262,6 +5262,50 @@ void ftrace_module_init(struct module *mod) } #endif /* CONFIG_MODULES */ +void ftrace_free_mem(void *start_ptr, void *end_ptr) +{ + unsigned long start = (unsigned long)start_ptr; + unsigned long end = (unsigned long)end_ptr; + struct ftrace_page **last_pg = &ftrace_pages_start; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + struct dyn_ftrace key; + int order; + + key.ip = start; + 
key.flags = end; /* overload flags, as it is unsigned long */ + + mutex_lock(&ftrace_lock); + + for (pg = ftrace_pages_start; pg; last_pg = &pg->next, pg = *last_pg) { + if (end < pg->records[0].ip || + start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) + continue; + again: + rec = bsearch(&key, pg->records, pg->index, + sizeof(struct dyn_ftrace), + ftrace_cmp_recs); + if (!rec) + continue; + pg->index--; + if (!pg->index) { + *last_pg = pg->next; + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + kfree(pg); + pg = container_of(last_pg, struct ftrace_page, next); + if (!(*last_pg)) + ftrace_pages = pg; + continue; + } + memmove(rec, rec + 1, + (pg->index - (rec - pg->records)) * sizeof(*rec)); + /* More than one function may be in this block */ + goto again; + } + mutex_unlock(&ftrace_lock); +} + void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6cbde310abed..eee82bfb7cd8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -6605,6 +6606,9 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) void *pos; unsigned long pages = 0; + /* This may be .init text, inform ftrace to remove it */ + ftrace_free_mem(start, end); + start = (void *)PAGE_ALIGN((unsigned long)start); end = (void *)((unsigned long)end & PAGE_MASK); for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index aeb34223167c..16e086dcc567 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -412,6 +412,7 @@ static int is_mcounted_section_name(char const *const txtname) { return strcmp(".text", txtname) == 0 || + strcmp(".init.text", txtname) == 0 || strcmp(".ref.text", txtname) == 0 || strcmp(".sched.text", txtname) == 0 || strcmp(".spinlock.text", txtname) == 0 || diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 0b6002b36f20..1633c3e6c0b9 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -130,6 +130,7 @@ if ($inputfile =~ m,kernel/trace/ftrace\.o$,) { # Acceptable sections to record. my %text_sections = ( ".text" => 1, + ".init.text" => 1, ".ref.text" => 1, ".sched.text" => 1, ".spinlock.text" => 1, -- cgit v1.2.3 From af0009fc16a45d091f896794e97a6457f9a7eddf Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 16 Mar 2017 11:01:06 -0400 Subject: tracing: Move trace_handle_return() out of line Currently trace_handle_return() looks like this: static inline enum print_line_t trace_handle_return(struct trace_seq *s) { return trace_seq_has_overflowed(s) ? TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; } Where trace_seq_overflowed(s) is: static inline bool trace_seq_has_overflowed(struct trace_seq *s) { return s->full || seq_buf_has_overflowed(&s->seq); } And seq_buf_has_overflowed(&s->seq) is: static inline bool seq_buf_has_overflowed(struct seq_buf *s) { return s->len > s->size; } Making trace_handle_return() into: return (s->full || (s->seq->len > s->seq->size)) ? TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; One would think this is not an issue to keep as an inline. But because this is used in the TRACE_EVENT() macro, it is extended for every tracepoint in the system. Taking a look at a single tracepoint x86_irq_vector (was the first one I randomly chosen). 
As trace_handle_return is used in the TRACE_EVENT() macro of trace_raw_output_##call() we disassemble trace_raw_output_x86_irq_vector and do a diff: - is the original + is the out-of-line code I removed identical lines that were different just due to different addresses. --- /tmp/irq-vec-orig 2017-03-16 09:12:48.569384851 -0400 +++ /tmp/irq-vec-ool 2017-03-16 09:13:39.378153385 -0400 @@ -6,27 +6,23 @@ 53 push %rbx 48 89 fb mov %rdi,%rbx 4c 8b a7 c0 20 00 00 mov 0x20c0(%rdi),%r12 e8 f7 72 13 00 callq ffffffff81155c80 83 f8 01 cmp $0x1,%eax 74 05 je ffffffff8101e993 5b pop %rbx 41 5c pop %r12 5d pop %rbp c3 retq 41 8b 54 24 08 mov 0x8(%r12),%edx - 48 8d bb 98 10 00 00 lea 0x1098(%rbx),%rdi + 48 81 c3 98 10 00 00 add $0x1098,%rbx - 48 c7 c6 7b 8a a0 81 mov $0xffffffff81a08a7b,%rsi + 48 c7 c6 ab 8a a0 81 mov $0xffffffff81a08aab,%rsi - e8 c5 85 13 00 callq ffffffff81156f70 === here's the start of the main difference === + 48 89 df mov %rbx,%rdi + e8 62 7e 13 00 callq ffffffff81156810 - 8b 93 b8 20 00 00 mov 0x20b8(%rbx),%edx - 31 c0 xor %eax,%eax - 85 d2 test %edx,%edx - 75 11 jne ffffffff8101e9c8 - 48 8b 83 a8 20 00 00 mov 0x20a8(%rbx),%rax - 48 39 83 a0 20 00 00 cmp %rax,0x20a0(%rbx) - 0f 93 c0 setae %al + 48 89 df mov %rbx,%rdi + e8 4a c5 12 00 callq ffffffff8114af00 5b pop %rbx - 0f b6 c0 movzbl %al,%eax === end === 41 5c pop %r12 5d pop %rbp c3 retq If you notice, the original has 22 bytes of text more than the out of line version. As this is for every TRACE_EVENT() defined in the system, this can become quite large. text data bss dec hex filename 8690305 5450490 1298432 15439227 eb957b vmlinux-orig 8681725 5450490 1298432 15430647 eb73f7 vmlinux-handle This change has a total of 8580 bytes in savings. $ objdump -dr /tmp/vmlinux-orig | grep '^[0-9a-f]* Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 11 +---------- kernel/trace/trace.c | 12 ++++++++++++ 2 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 0af63c4381b9..a556805eff8a 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -138,16 +138,7 @@ enum print_line_t { TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ }; -/* - * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq - * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function - * simplifies those functions and keeps them in sync. - */ -static inline enum print_line_t trace_handle_return(struct trace_seq *s) -{ - return trace_seq_has_overflowed(s) ? - TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; -} +enum print_line_t trace_handle_return(struct trace_seq *s); void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4fa8e8f3c765..b5d4b80f2d45 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1998,6 +1998,18 @@ void tracing_record_cmdline(struct task_struct *tsk) __this_cpu_write(trace_cmdline_save, false); } +/* + * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq + * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function + * simplifies those functions and keeps them in sync. + */ +enum print_line_t trace_handle_return(struct trace_seq *s) +{ + return trace_seq_has_overflowed(s) ? 
+ TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; +} +EXPORT_SYMBOL_GPL(trace_handle_return); + void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, int pc) -- cgit v1.2.3 From aff2615763f206f897146e0ee1ddae8e22055ae3 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sat, 25 Mar 2017 00:52:05 +0300 Subject: net/mlx5e: Single bfreg (UAR) for all mlx5e SQs and netdevs One is sufficient since Blue Flame is not supported anymore. This will also come in handy for switchdev mode to save resources, since VF representors will use same single UAR as well for their own SQs. Signed-off-by: Saeed Mahameed Reviewed-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_common.c | 9 +++++++++ drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 17 +++-------------- include/linux/mlx5/driver.h | 1 + 4 files changed, 13 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 85261e9ccf4a..22e4bad03f05 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -483,7 +483,6 @@ struct mlx5e_sq { /* control path */ struct mlx5_wq_ctrl wq_ctrl; - struct mlx5_sq_bfreg bfreg; struct mlx5e_channel *channel; int tc; u32 rate_limit; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c index bd898d8deda0..20bdbe685795 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c @@ -107,10 +107,18 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev) goto err_dealloc_transport_domain; } + err = mlx5_alloc_bfreg(mdev, &res->bfreg, false, false); + if (err) { + mlx5_core_err(mdev, "alloc bfreg failed, %d\n", err); + goto err_destroy_mkey; + } + INIT_LIST_HEAD(&mdev->mlx5e_res.td.tirs_list); return 0; +err_destroy_mkey: + mlx5_core_destroy_mkey(mdev, &res->mkey); err_dealloc_transport_domain: mlx5_core_dealloc_transport_domain(mdev, res->td.tdn); err_dealloc_pd: @@ -122,6 +130,7 @@ void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev) { struct mlx5e_resources *res = &mdev->mlx5e_res; + mlx5_free_bfreg(mdev, &res->bfreg); mlx5_core_destroy_mkey(mdev, &res->mkey); mlx5_core_dealloc_transport_domain(mdev, res->td.tdn); mlx5_core_dealloc_pd(mdev, res->pdn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index f9bcbd277adb..49c1769d13b9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1016,18 +1016,14 @@ static int mlx5e_create_sq(struct mlx5e_channel *c, sq->mkey_be = c->mkey_be; sq->channel = c; sq->tc = tc; + sq->uar_map = mdev->mlx5e_res.bfreg.map; - err = mlx5_alloc_bfreg(mdev, &sq->bfreg, false, false); - if (err) - return err; - - sq->uar_map = sq->bfreg.map; param->wq.db_numa_node = cpu_to_node(c->cpu); err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) - goto err_unmap_free_uar; + return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; @@ -1053,20 +1049,13 @@ static int mlx5e_create_sq(struct mlx5e_channel *c, err_sq_wq_destroy: mlx5_wq_destroy(&sq->wq_ctrl); -err_unmap_free_uar: - mlx5_free_bfreg(mdev, &sq->bfreg); - return err; } static void mlx5e_destroy_sq(struct mlx5e_sq *sq) { - struct mlx5e_channel *c = sq->channel; - 
struct mlx5e_priv *priv = c->priv; - mlx5e_free_sq_db(sq); mlx5_wq_destroy(&sq->wq_ctrl); - mlx5_free_bfreg(priv->mdev, &sq->bfreg); } static int mlx5e_enable_sq(struct mlx5e_sq *sq, struct mlx5e_sq_param *param) @@ -1103,7 +1092,7 @@ static int mlx5e_enable_sq(struct mlx5e_sq *sq, struct mlx5e_sq_param *param) MLX5_SET(sqc, sqc, tis_lst_sz, param->type == MLX5E_SQ_ICO ? 0 : 1); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); - MLX5_SET(wq, wq, uar_page, sq->bfreg.index); + MLX5_SET(wq, wq, uar_page, mdev->mlx5e_res.bfreg.index); MLX5_SET(wq, wq, log_wq_pg_sz, sq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, sq->wq_ctrl.db.dma); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 2fcff6b4503f..f50864626230 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -728,6 +728,7 @@ struct mlx5e_resources { u32 pdn; struct mlx5_td td; struct mlx5_core_mkey mkey; + struct mlx5_sq_bfreg bfreg; }; struct mlx5_core_dev { -- cgit v1.2.3 From 869ab90f0ae0002ce6e9d3a5c75156ae8de48ffc Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 24 Mar 2017 18:03:48 -0700 Subject: block: constify struct blk_integrity_profile blk_integrity_profile's are never modified, so mark them 'const' so that they are placed in .rodata and benefit from memory protection. Signed-off-by: Eric Biggers Signed-off-by: Jens Axboe --- block/blk-integrity.c | 2 +- block/t10-pi.c | 8 ++++---- include/linux/genhd.h | 10 +++++----- include/linux/t10-pi.h | 8 ++++---- 4 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 9f0ff5ba4f84..b3622cb00fc2 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -389,7 +389,7 @@ static int blk_integrity_nop_fn(struct blk_integrity_iter *iter) return 0; } -static struct blk_integrity_profile nop_profile = { +static const struct blk_integrity_profile nop_profile = { .name = "nop", .generate_fn = blk_integrity_nop_fn, .verify_fn = blk_integrity_nop_fn, diff --git a/block/t10-pi.c b/block/t10-pi.c index 2c97912335a9..680c6d636298 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -160,28 +160,28 @@ static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) return t10_pi_verify(iter, t10_pi_ip_fn, 3); } -struct blk_integrity_profile t10_pi_type1_crc = { +const struct blk_integrity_profile t10_pi_type1_crc = { .name = "T10-DIF-TYPE1-CRC", .generate_fn = t10_pi_type1_generate_crc, .verify_fn = t10_pi_type1_verify_crc, }; EXPORT_SYMBOL(t10_pi_type1_crc); -struct blk_integrity_profile t10_pi_type1_ip = { +const struct blk_integrity_profile t10_pi_type1_ip = { .name = "T10-DIF-TYPE1-IP", .generate_fn = t10_pi_type1_generate_ip, .verify_fn = t10_pi_type1_verify_ip, }; EXPORT_SYMBOL(t10_pi_type1_ip); -struct blk_integrity_profile t10_pi_type3_crc = { +const struct blk_integrity_profile t10_pi_type3_crc = { .name = "T10-DIF-TYPE3-CRC", .generate_fn = t10_pi_type3_generate_crc, .verify_fn = t10_pi_type3_verify_crc, }; EXPORT_SYMBOL(t10_pi_type3_crc); -struct blk_integrity_profile t10_pi_type3_ip = { +const struct blk_integrity_profile t10_pi_type3_ip = { .name = "T10-DIF-TYPE3-IP", .generate_fn = t10_pi_type3_generate_ip, .verify_fn = t10_pi_type3_verify_ip, diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 76f39754e7b0..9e11082c7f9b 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -159,11 +159,11 @@ struct badblocks; #if defined(CONFIG_BLK_DEV_INTEGRITY) struct blk_integrity { - struct 
blk_integrity_profile *profile; - unsigned char flags; - unsigned char tuple_size; - unsigned char interval_exp; - unsigned char tag_size; + const struct blk_integrity_profile *profile; + unsigned char flags; + unsigned char tuple_size; + unsigned char interval_exp; + unsigned char tag_size; }; #endif /* CONFIG_BLK_DEV_INTEGRITY */ diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 9fba9dd33544..9375d23a24e7 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -34,9 +34,9 @@ struct t10_pi_tuple { }; -extern struct blk_integrity_profile t10_pi_type1_crc; -extern struct blk_integrity_profile t10_pi_type1_ip; -extern struct blk_integrity_profile t10_pi_type3_crc; -extern struct blk_integrity_profile t10_pi_type3_ip; +extern const struct blk_integrity_profile t10_pi_type1_crc; +extern const struct blk_integrity_profile t10_pi_type1_ip; +extern const struct blk_integrity_profile t10_pi_type3_crc; +extern const struct blk_integrity_profile t10_pi_type3_ip; #endif -- cgit v1.2.3 From f45958756fef552436e4a63029a168495920026e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 24 Mar 2017 10:34:43 -0700 Subject: block: remove bio_clone_bioset_partial() commit c18a1e0(block: introduce bio_clone_bioset_partial()) introduced bio_clone_bioset_partial() for raid1 write behind IO. Now the write behind is rewritten by Ming. We don't need the API any more, so revert the commit. Cc: Christoph Hellwig Reviewed-by: Jens Axboe Reviewed-by: Ming Lei Signed-off-by: Shaohua Li --- block/bio.c | 61 ++++++++++++----------------------------------------- include/linux/bio.h | 11 ++-------- 2 files changed, 15 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 1ccff0dace89..036435995c55 100644 --- a/block/bio.c +++ b/block/bio.c @@ -631,20 +631,21 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) } EXPORT_SYMBOL(bio_clone_fast); -static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, - struct bio_set *bs, int offset, - int size) +/** + * bio_clone_bioset - clone a bio + * @bio_src: bio to clone + * @gfp_mask: allocation priority + * @bs: bio_set to allocate from + * + * Clone bio. Caller will own the returned bio, but not the actual data it + * points to. Reference count of returned bio will be one. + */ +struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, + struct bio_set *bs) { struct bvec_iter iter; struct bio_vec bv; struct bio *bio; - struct bvec_iter iter_src = bio_src->bi_iter; - - /* for supporting partial clone */ - if (offset || size != bio_src->bi_iter.bi_size) { - bio_advance_iter(bio_src, &iter_src, offset); - iter_src.bi_size = size; - } /* * Pre immutable biovecs, __bio_clone() used to just do a memcpy from @@ -668,8 +669,7 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, * __bio_clone_fast() anyways. 
*/ - bio = bio_alloc_bioset(gfp_mask, __bio_segments(bio_src, - &iter_src), bs); + bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); if (!bio) return NULL; bio->bi_bdev = bio_src->bi_bdev; @@ -686,7 +686,7 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; break; default: - __bio_for_each_segment(bv, bio_src, iter, iter_src) + bio_for_each_segment(bv, bio_src, iter) bio->bi_io_vec[bio->bi_vcnt++] = bv; break; } @@ -705,43 +705,8 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, return bio; } - -/** - * bio_clone_bioset - clone a bio - * @bio_src: bio to clone - * @gfp_mask: allocation priority - * @bs: bio_set to allocate from - * - * Clone bio. Caller will own the returned bio, but not the actual data it - * points to. Reference count of returned bio will be one. - */ -struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, - struct bio_set *bs) -{ - return __bio_clone_bioset(bio_src, gfp_mask, bs, 0, - bio_src->bi_iter.bi_size); -} EXPORT_SYMBOL(bio_clone_bioset); -/** - * bio_clone_bioset_partial - clone a partial bio - * @bio_src: bio to clone - * @gfp_mask: allocation priority - * @bs: bio_set to allocate from - * @offset: cloned starting from the offset - * @size: size for the cloned bio - * - * Clone bio. Caller will own the returned bio, but not the actual data it - * points to. Reference count of returned bio will be one. - */ -struct bio *bio_clone_bioset_partial(struct bio *bio_src, gfp_t gfp_mask, - struct bio_set *bs, int offset, - int size) -{ - return __bio_clone_bioset(bio_src, gfp_mask, bs, offset, size); -} -EXPORT_SYMBOL(bio_clone_bioset_partial); - /** * bio_add_pc_page - attempt to add page to bio * @q: the target queue diff --git a/include/linux/bio.h b/include/linux/bio.h index 42b62a0288b0..fafef6343d1b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -183,7 +183,7 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len) -static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec) +static inline unsigned bio_segments(struct bio *bio) { unsigned segs = 0; struct bio_vec bv; @@ -205,17 +205,12 @@ static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec) break; } - __bio_for_each_segment(bv, bio, iter, *bvec) + bio_for_each_segment(bv, bio, iter) segs++; return segs; } -static inline unsigned bio_segments(struct bio *bio) -{ - return __bio_segments(bio, &bio->bi_iter); -} - /* * get a reference to a bio, so it won't disappear. the intended use is * something like: @@ -389,8 +384,6 @@ extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); -extern struct bio *bio_clone_bioset_partial(struct bio *, gfp_t, - struct bio_set *, int, int); extern struct bio_set *fs_bio_set; -- cgit v1.2.3 From d7ed89d5aadf09f1060cd3a9cf07df17447c7392 Mon Sep 17 00:00:00 2001 From: Song Hongyan Date: Mon, 20 Mar 2017 22:28:45 +0800 Subject: iio: hid: Add humidity sensor support Environmental humidity sensor is a hid defined sensor, it shows raw humidity measurement of air. 
More information can be found in: http://www.usb.org/developers/hidpage/HUTRR39b.pdf According to IIO ABI definition, humidityrelative data output unit is milli percent. Add the unit convert from percent to milli percent. Signed-off-by: Song Hongyan Signed-off-by: Jonathan Cameron --- .../iio/common/hid-sensors/hid-sensor-attributes.c | 2 + drivers/iio/humidity/Kconfig | 14 + drivers/iio/humidity/Makefile | 3 + drivers/iio/humidity/hid-sensor-humidity.c | 315 +++++++++++++++++++++ include/linux/hid-sensor-ids.h | 4 + 5 files changed, 338 insertions(+) create mode 100644 drivers/iio/humidity/hid-sensor-humidity.c (limited to 'include/linux') diff --git a/drivers/iio/common/hid-sensors/hid-sensor-attributes.c b/drivers/iio/common/hid-sensors/hid-sensor-attributes.c index cfb2fdc3177f..81647cc64a7b 100644 --- a/drivers/iio/common/hid-sensors/hid-sensor-attributes.c +++ b/drivers/iio/common/hid-sensors/hid-sensor-attributes.c @@ -71,6 +71,8 @@ static struct { {HID_USAGE_SENSOR_TEMPERATURE, 0, 1000, 0}, {HID_USAGE_SENSOR_TEMPERATURE, HID_USAGE_SENSOR_UNITS_DEGREES, 1000, 0}, + + {HID_USAGE_SENSOR_HUMIDITY, 0, 1000, 0}, }; static int pow_10(unsigned power) diff --git a/drivers/iio/humidity/Kconfig b/drivers/iio/humidity/Kconfig index 912477d54be2..14b9ce453d9d 100644 --- a/drivers/iio/humidity/Kconfig +++ b/drivers/iio/humidity/Kconfig @@ -36,6 +36,20 @@ config HDC100X To compile this driver as a module, choose M here: the module will be called hdc100x. +config HID_SENSOR_HUMIDITY + tristate "HID Environmental humidity sensor" + depends on HID_SENSOR_HUB + select IIO_BUFFER + select IIO_TRIGGERED_BUFFER + select HID_SENSOR_IIO_COMMON + select HID_SENSOR_IIO_TRIGGER + help + Say yes here to build support for the HID SENSOR + humidity driver + + To compile this driver as a module, choose M here: the module + will be called hid-sensor-humidity. + config HTS221 tristate "STMicroelectronics HTS221 sensor Driver" depends on (I2C || SPI) diff --git a/drivers/iio/humidity/Makefile b/drivers/iio/humidity/Makefile index a6850e47c100..be0dedeb8f3c 100644 --- a/drivers/iio/humidity/Makefile +++ b/drivers/iio/humidity/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_AM2315) += am2315.o obj-$(CONFIG_DHT11) += dht11.o obj-$(CONFIG_HDC100X) += hdc100x.o +obj-$(CONFIG_HID_SENSOR_HUMIDITY) += hid-sensor-humidity.o hts221-y := hts221_core.o \ hts221_buffer.o @@ -15,3 +16,5 @@ obj-$(CONFIG_HTS221_SPI) += hts221_spi.o obj-$(CONFIG_HTU21) += htu21.o obj-$(CONFIG_SI7005) += si7005.o obj-$(CONFIG_SI7020) += si7020.o + +ccflags-y += -I$(srctree)/drivers/iio/common/hid-sensors diff --git a/drivers/iio/humidity/hid-sensor-humidity.c b/drivers/iio/humidity/hid-sensor-humidity.c new file mode 100644 index 000000000000..6e09c1acfe51 --- /dev/null +++ b/drivers/iio/humidity/hid-sensor-humidity.c @@ -0,0 +1,315 @@ +/* + * HID Sensors Driver + * Copyright (c) 2017, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hid-sensor-trigger.h" + +struct hid_humidity_state { + struct hid_sensor_common common_attributes; + struct hid_sensor_hub_attribute_info humidity_attr; + s32 humidity_data; + int scale_pre_decml; + int scale_post_decml; + int scale_precision; + int value_offset; +}; + +/* Channel definitions */ +static const struct iio_chan_spec humidity_channels[] = { + { + .type = IIO_HUMIDITYRELATIVE, + .info_mask_separate = BIT(IIO_CHAN_INFO_RAW), + .info_mask_shared_by_type = BIT(IIO_CHAN_INFO_OFFSET) | + BIT(IIO_CHAN_INFO_SCALE) | + BIT(IIO_CHAN_INFO_SAMP_FREQ) | + BIT(IIO_CHAN_INFO_HYSTERESIS), + }, + IIO_CHAN_SOFT_TIMESTAMP(1) +}; + +/* Adjust channel real bits based on report descriptor */ +static void humidity_adjust_channel_bit_mask(struct iio_chan_spec *channels, + int channel, int size) +{ + channels[channel].scan_type.sign = 's'; + /* Real storage bits will change based on the report desc. */ + channels[channel].scan_type.realbits = size * 8; + /* Maximum size of a sample to capture is s32 */ + channels[channel].scan_type.storagebits = sizeof(s32) * 8; +} + +static int humidity_read_raw(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, + int *val, int *val2, long mask) +{ + struct hid_humidity_state *humid_st = iio_priv(indio_dev); + + switch (mask) { + case IIO_CHAN_INFO_RAW: + if (chan->type != IIO_HUMIDITYRELATIVE) + return -EINVAL; + hid_sensor_power_state(&humid_st->common_attributes, true); + *val = sensor_hub_input_attr_get_raw_value( + humid_st->common_attributes.hsdev, + HID_USAGE_SENSOR_HUMIDITY, + HID_USAGE_SENSOR_ATMOSPHERIC_HUMIDITY, + humid_st->humidity_attr.report_id, + SENSOR_HUB_SYNC); + hid_sensor_power_state(&humid_st->common_attributes, false); + + return IIO_VAL_INT; + + case IIO_CHAN_INFO_SCALE: + *val = humid_st->scale_pre_decml; + *val2 = humid_st->scale_post_decml; + + return humid_st->scale_precision; + + case IIO_CHAN_INFO_OFFSET: + *val = humid_st->value_offset; + + return IIO_VAL_INT; + + case IIO_CHAN_INFO_SAMP_FREQ: + return hid_sensor_read_samp_freq_value( + &humid_st->common_attributes, val, val2); + + case IIO_CHAN_INFO_HYSTERESIS: + return hid_sensor_read_raw_hyst_value( + &humid_st->common_attributes, val, val2); + + default: + return -EINVAL; + } +} + +static int humidity_write_raw(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, + int val, int val2, long mask) +{ + struct hid_humidity_state *humid_st = iio_priv(indio_dev); + + switch (mask) { + case IIO_CHAN_INFO_SAMP_FREQ: + return hid_sensor_write_samp_freq_value( + &humid_st->common_attributes, val, val2); + + case IIO_CHAN_INFO_HYSTERESIS: + return hid_sensor_write_raw_hyst_value( + &humid_st->common_attributes, val, val2); + + default: + return -EINVAL; + } +} + +static const struct iio_info humidity_info = { + .driver_module = THIS_MODULE, + .read_raw = &humidity_read_raw, + .write_raw = &humidity_write_raw, +}; + +/* Callback handler to send event after all samples are received and captured */ +static int humidity_proc_event(struct hid_sensor_hub_device *hsdev, + unsigned int usage_id, void *pdev) +{ + struct iio_dev *indio_dev = platform_get_drvdata(pdev); + struct hid_humidity_state *humid_st = iio_priv(indio_dev); + + if (atomic_read(&humid_st->common_attributes.data_ready)) + iio_push_to_buffers_with_timestamp(indio_dev, + &humid_st->humidity_data, + iio_get_time_ns(indio_dev)); + + return 0; +} + +/* Capture samples in local storage */ +static int 
humidity_capture_sample(struct hid_sensor_hub_device *hsdev, + unsigned int usage_id, size_t raw_len, + char *raw_data, void *pdev) +{ + struct iio_dev *indio_dev = platform_get_drvdata(pdev); + struct hid_humidity_state *humid_st = iio_priv(indio_dev); + + switch (usage_id) { + case HID_USAGE_SENSOR_ATMOSPHERIC_HUMIDITY: + humid_st->humidity_data = *(s32 *)raw_data; + + return 0; + default: + return -EINVAL; + } +} + +/* Parse report which is specific to an usage id */ +static int humidity_parse_report(struct platform_device *pdev, + struct hid_sensor_hub_device *hsdev, + struct iio_chan_spec *channels, + unsigned int usage_id, + struct hid_humidity_state *st) +{ + int ret; + + ret = sensor_hub_input_get_attribute_info(hsdev, HID_INPUT_REPORT, + usage_id, + HID_USAGE_SENSOR_ATMOSPHERIC_HUMIDITY, + &st->humidity_attr); + if (ret < 0) + return ret; + + humidity_adjust_channel_bit_mask(channels, 0, st->humidity_attr.size); + + st->scale_precision = hid_sensor_format_scale( + HID_USAGE_SENSOR_HUMIDITY, + &st->humidity_attr, + &st->scale_pre_decml, + &st->scale_post_decml); + + /* Set Sensitivity field ids, when there is no individual modifier */ + if (st->common_attributes.sensitivity.index < 0) + sensor_hub_input_get_attribute_info(hsdev, + HID_FEATURE_REPORT, usage_id, + HID_USAGE_SENSOR_DATA_MOD_CHANGE_SENSITIVITY_ABS | + HID_USAGE_SENSOR_ATMOSPHERIC_HUMIDITY, + &st->common_attributes.sensitivity); + + return ret; +} + +static struct hid_sensor_hub_callbacks humidity_callbacks = { + .send_event = &humidity_proc_event, + .capture_sample = &humidity_capture_sample, +}; + +/* Function to initialize the processing for usage id */ +static int hid_humidity_probe(struct platform_device *pdev) +{ + static const char *name = "humidity"; + struct iio_dev *indio_dev; + struct hid_humidity_state *humid_st; + struct iio_chan_spec *humid_chans; + struct hid_sensor_hub_device *hsdev = dev_get_platdata(&pdev->dev); + int ret; + + indio_dev = devm_iio_device_alloc(&pdev->dev, sizeof(*humid_st)); + if (!indio_dev) + return -ENOMEM; + + humid_st = iio_priv(indio_dev); + humid_st->common_attributes.hsdev = hsdev; + humid_st->common_attributes.pdev = pdev; + + ret = hid_sensor_parse_common_attributes(hsdev, + HID_USAGE_SENSOR_HUMIDITY, + &humid_st->common_attributes); + if (ret) + return ret; + + humid_chans = devm_kmemdup(&indio_dev->dev, humidity_channels, + sizeof(humidity_channels), GFP_KERNEL); + if (!humid_chans) + return -ENOMEM; + + ret = humidity_parse_report(pdev, hsdev, humid_chans, + HID_USAGE_SENSOR_HUMIDITY, humid_st); + if (ret) + return ret; + + indio_dev->channels = humid_chans; + indio_dev->num_channels = ARRAY_SIZE(humidity_channels); + indio_dev->dev.parent = &pdev->dev; + indio_dev->info = &humidity_info; + indio_dev->name = name; + indio_dev->modes = INDIO_DIRECT_MODE; + + ret = devm_iio_triggered_buffer_setup(&pdev->dev, indio_dev, + &iio_pollfunc_store_time, NULL, NULL); + if (ret) + return ret; + + atomic_set(&humid_st->common_attributes.data_ready, 0); + ret = hid_sensor_setup_trigger(indio_dev, name, + &humid_st->common_attributes); + if (ret) + return ret; + + platform_set_drvdata(pdev, indio_dev); + + humidity_callbacks.pdev = pdev; + ret = sensor_hub_register_callback(hsdev, HID_USAGE_SENSOR_HUMIDITY, + &humidity_callbacks); + if (ret) + goto error_remove_trigger; + + ret = iio_device_register(indio_dev); + if (ret) + goto error_remove_callback; + + return ret; + +error_remove_callback: + sensor_hub_remove_callback(hsdev, HID_USAGE_SENSOR_HUMIDITY); +error_remove_trigger: + 
hid_sensor_remove_trigger(&humid_st->common_attributes); + return ret; +} + +/* Function to deinitialize the processing for usage id */ +static int hid_humidity_remove(struct platform_device *pdev) +{ + struct hid_sensor_hub_device *hsdev = dev_get_platdata(&pdev->dev); + struct iio_dev *indio_dev = platform_get_drvdata(pdev); + struct hid_humidity_state *humid_st = iio_priv(indio_dev); + + iio_device_unregister(indio_dev); + sensor_hub_remove_callback(hsdev, HID_USAGE_SENSOR_HUMIDITY); + hid_sensor_remove_trigger(&humid_st->common_attributes); + + return 0; +} + +static const struct platform_device_id hid_humidity_ids[] = { + { + /* Format: HID-SENSOR-usage_id_in_hex_lowercase */ + .name = "HID-SENSOR-200032", + }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(platform, hid_humidity_ids); + +static struct platform_driver hid_humidity_platform_driver = { + .id_table = hid_humidity_ids, + .driver = { + .name = KBUILD_MODNAME, + .pm = &hid_sensor_pm_ops, + }, + .probe = hid_humidity_probe, + .remove = hid_humidity_remove, +}; +module_platform_driver(hid_humidity_platform_driver); + +MODULE_DESCRIPTION("HID Environmental humidity sensor"); +MODULE_AUTHOR("Song Hongyan "); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h index 46dd1f27d2f2..761f86242473 100644 --- a/include/linux/hid-sensor-ids.h +++ b/include/linux/hid-sensor-ids.h @@ -49,6 +49,10 @@ #define HID_USAGE_SENSOR_TEMPERATURE 0x200033 #define HID_USAGE_SENSOR_DATA_ENVIRONMENTAL_TEMPERATURE 0x200434 +/* humidity */ +#define HID_USAGE_SENSOR_HUMIDITY 0x200032 +#define HID_USAGE_SENSOR_ATMOSPHERIC_HUMIDITY 0x200433 + /* Gyro 3D: (200076) */ #define HID_USAGE_SENSOR_GYRO_3D 0x200076 #define HID_USAGE_SENSOR_DATA_ANGL_VELOCITY 0x200456 -- cgit v1.2.3 From 8ce371f9846ef1e8b3cc8f6865766cb5c1f17e40 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Mar 2017 12:26:55 +0100 Subject: lockdep: Fix per-cpu static objects Since commit 383776fa7527 ("locking/lockdep: Handle statically initialized PER_CPU locks properly") we try to collapse per-cpu locks into a single class by giving them all the same key. For this key we choose the canonical address of the per-cpu object, which would be the offset into the per-cpu area. This has two problems: - there is a case where we run !0 lock->key through static_obj() and expect this to pass; it doesn't for canonical pointers. - 0 is a valid canonical address. Cure both issues by redefining the canonical address as the address of the per-cpu variable on the boot CPU. Since I didn't want to rely on CPU0 being the boot-cpu, or even existing at all, track the boot CPU in a variable. 
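To make the new canonical address concrete, here is a minimal sketch (the demo per-cpu variable is illustrative; per_cpu_ptr() paired with the get_boot_cpu_id() helper added below is what the patch actually does):

	/* The canonical address of a static per-cpu lock is now the boot
	 * CPU's instance of the variable: a real, non-zero kernel address
	 * that static_obj() can recognize. */
	static DEFINE_PER_CPU(spinlock_t, demo_lock);

	static void *demo_canonical_addr(void)
	{
		return per_cpu_ptr(&demo_lock, get_boot_cpu_id());
	}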
Fixes: 383776fa7527 ("locking/lockdep: Handle statically initialized PER_CPU locks properly") Reported-by: kernel test robot Signed-off-by: Peter Zijlstra (Intel) Tested-by: Borislav Petkov Cc: Sebastian Andrzej Siewior Cc: linux-mm@kvack.org Cc: wfg@linux.intel.com Cc: kernel test robot Cc: LKP Link: http://lkml.kernel.org/r/20170320114108.kbvcsuepem45j5cr@hirez.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- include/linux/smp.h | 12 ++++++++++++ kernel/cpu.c | 6 ++++++ kernel/module.c | 6 +++++- mm/percpu.c | 5 ++++- 4 files changed, 27 insertions(+), 2 deletions(-) (limited to 'include/linux')
diff --git a/include/linux/smp.h b/include/linux/smp.h index 8e0cb7a0f836..68123c1fe549 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -120,6 +120,13 @@ extern unsigned int setup_max_cpus; extern void __init setup_nr_cpu_ids(void); extern void __init smp_init(void); +extern int __boot_cpu_id; + +static inline int get_boot_cpu_id(void) +{ + return __boot_cpu_id; +} + #else /* !SMP */ static inline void smp_send_stop(void) { } @@ -158,6 +165,11 @@ static inline void smp_init(void) { up_late_init(); } static inline void smp_init(void) { } #endif +static inline int get_boot_cpu_id(void) +{ + return 0; +} + #endif /* !SMP */ /*
diff --git a/kernel/cpu.c b/kernel/cpu.c index f7c063239fa5..a795725774c5 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1125,6 +1125,8 @@ core_initcall(cpu_hotplug_pm_sync_init); #endif /* CONFIG_PM_SLEEP_SMP */ +int __boot_cpu_id; + #endif /* CONFIG_SMP */ /* Boot processor state steps */ @@ -1815,6 +1817,10 @@ void __init boot_cpu_init(void) set_cpu_active(cpu, true); set_cpu_present(cpu, true); set_cpu_possible(cpu, true); + +#ifdef CONFIG_SMP + __boot_cpu_id = cpu; +#endif } /*
diff --git a/kernel/module.c b/kernel/module.c index 5ef618133849..6d9988031c5b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -682,8 +682,12 @@ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) void *va = (void *)addr; if (va >= start && va < start + mod->percpu_size) { - if (can_addr) + if (can_addr) { *can_addr = (unsigned long) (va - start); + *can_addr += (unsigned long) + per_cpu_ptr(mod->percpu, + get_boot_cpu_id()); + } preempt_enable(); return true; }
diff --git a/mm/percpu.c b/mm/percpu.c index 7d3b728c0254..bd7416752819 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1293,8 +1293,11 @@ bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr) void *va = (void *)addr; if (va >= start && va < start + static_size) { - if (can_addr) + if (can_addr) { *can_addr = (unsigned long) (va - start); + *can_addr += (unsigned long) + per_cpu_ptr(base, get_boot_cpu_id()); + } return true; } } -- cgit v1.2.3
From e4e55b47ed9ae2c05ff062601ff6dacbe9dc4775 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 24 Mar 2017 20:46:33 +0900 Subject: LSM: Revive security_task_alloc() hook and per "struct task_struct" security blob. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We switched from "struct task_struct"->security to "struct cred"->security in Linux 2.6.29. But not all LSM modules were happy with that change. The TOMOYO LSM module is an example which wants to use a per "struct task_struct" security blob, for TOMOYO's security context is defined based on "struct task_struct" rather than "struct cred". The AppArmor LSM module is another example which wants to use it, for AppArmor is currently abusing the cred a little bit to store the change_hat and setexeccon info.
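As a sketch of the kind of lightweight module this enables (the demo_* names are hypothetical; only the task_alloc/task_free hooks, LSM_HOOK_INIT() and the task->security field come from this patch), such a module could attach and release a per-task blob like so, registering the list with security_add_hooks():

	static int demo_task_alloc(struct task_struct *task,
				   unsigned long clone_flags)
	{
		/* Attach a small per-task blob at fork time. */
		task->security = kzalloc(8, GFP_KERNEL);
		return task->security ? 0 : -ENOMEM;
	}

	static void demo_task_free(struct task_struct *task)
	{
		/* May run from interrupt context; kfree() is safe there. */
		kfree(task->security);
		task->security = NULL;
	}

	static struct security_hook_list demo_hooks[] = {
		LSM_HOOK_INIT(task_alloc, demo_task_alloc),
		LSM_HOOK_INIT(task_free, demo_task_free),
	};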
Although security_task_free() hook was revived in Linux 3.4 because Yama LSM module wanted to release per "struct task_struct" security blob, security_task_alloc() hook and "struct task_struct"->security field were not revived. Nowadays, we are getting proposals of lightweight LSM modules which want to use per "struct task_struct" security blob. We are already allowing multiple concurrent LSM modules (up to one fully armored module which uses "struct cred"->security field or exclusive hooks like security_xfrm_state_pol_flow_match(), plus unlimited number of lightweight modules which do not use "struct cred"->security nor exclusive hooks) as long as they are built into the kernel. But this patch does not implement variable length "struct task_struct"->security field which will become needed when multiple LSM modules want to use "struct task_struct"-> security field. Although it won't be difficult to implement variable length "struct task_struct"->security field, let's think about it after we merged this patch. Signed-off-by: Tetsuo Handa Acked-by: John Johansen Acked-by: Serge Hallyn Acked-by: Casey Schaufler Tested-by: Djalal Harouni Acked-by: José Bollo Cc: Paul Moore Cc: Stephen Smalley Cc: Eric Paris Cc: Kees Cook Cc: James Morris Cc: José Bollo Signed-off-by: James Morris --- include/linux/init_task.h | 7 +++++++ include/linux/lsm_hooks.h | 9 ++++++++- include/linux/sched.h | 4 ++++ include/linux/security.h | 7 +++++++ kernel/fork.c | 7 ++++++- security/security.c | 5 +++++ 6 files changed, 37 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 91d9049f0039..926f2f553cc5 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -210,6 +210,12 @@ extern struct cred init_cred; # define INIT_TASK_TI(tsk) #endif +#ifdef CONFIG_SECURITY +#define INIT_TASK_SECURITY .security = NULL, +#else +#define INIT_TASK_SECURITY +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -288,6 +294,7 @@ extern struct cred init_cred; INIT_VTIME(tsk) \ INIT_NUMA_BALANCING(tsk) \ INIT_KASAN(tsk) \ + INIT_TASK_SECURITY \ } diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 1aa63335de9e..080f34e66017 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -533,8 +533,13 @@ * manual page for definitions of the @clone_flags. * @clone_flags contains the flags indicating what should be shared. * Return 0 if permission is granted. + * @task_alloc: + * @task task being allocated. + * @clone_flags contains the flags indicating what should be shared. + * Handle allocation of task-related resources. + * Returns a zero on success, negative values on failure. * @task_free: - * @task task being freed + * @task task about to be freed. * Handle release of task-related resources. (Note that this can be called * from interrupt context.) 
* @cred_alloc_blank: @@ -1482,6 +1487,7 @@ union security_list_options { int (*file_open)(struct file *file, const struct cred *cred); int (*task_create)(unsigned long clone_flags); + int (*task_alloc)(struct task_struct *task, unsigned long clone_flags); void (*task_free)(struct task_struct *task); int (*cred_alloc_blank)(struct cred *cred, gfp_t gfp); void (*cred_free)(struct cred *cred); @@ -1748,6 +1754,7 @@ struct security_hook_heads { struct list_head file_receive; struct list_head file_open; struct list_head task_create; + struct list_head task_alloc; struct list_head task_free; struct list_head cred_alloc_blank; struct list_head cred_free; diff --git a/include/linux/sched.h b/include/linux/sched.h index d67eee84fd43..71b8df306bb0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1037,6 +1037,10 @@ struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* A live task holds one reference: */ atomic_t stack_refcount; +#endif +#ifdef CONFIG_SECURITY + /* Used by LSM modules for access restriction: */ + void *security; #endif /* CPU-specific state of this task: */ struct thread_struct thread; diff --git a/include/linux/security.h b/include/linux/security.h index 97df7bac5b48..af675b576645 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -308,6 +308,7 @@ int security_file_send_sigiotask(struct task_struct *tsk, int security_file_receive(struct file *file); int security_file_open(struct file *file, const struct cred *cred); int security_task_create(unsigned long clone_flags); +int security_task_alloc(struct task_struct *task, unsigned long clone_flags); void security_task_free(struct task_struct *task); int security_cred_alloc_blank(struct cred *cred, gfp_t gfp); void security_cred_free(struct cred *cred); @@ -861,6 +862,12 @@ static inline int security_task_create(unsigned long clone_flags) return 0; } +static inline int security_task_alloc(struct task_struct *task, + unsigned long clone_flags) +{ + return 0; +} + static inline void security_task_free(struct task_struct *task) { } diff --git a/kernel/fork.c b/kernel/fork.c index 6c463c80e93d..3d32513d6c73 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1679,9 +1679,12 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cleanup_perf; /* copy all the process information */ shm_init_task(p); - retval = copy_semundo(clone_flags, p); + retval = security_task_alloc(p, clone_flags); if (retval) goto bad_fork_cleanup_audit; + retval = copy_semundo(clone_flags, p); + if (retval) + goto bad_fork_cleanup_security; retval = copy_files(clone_flags, p); if (retval) goto bad_fork_cleanup_semundo; @@ -1903,6 +1906,8 @@ bad_fork_cleanup_files: exit_files(p); /* blocking */ bad_fork_cleanup_semundo: exit_sem(p); +bad_fork_cleanup_security: + security_task_free(p); bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_perf: diff --git a/security/security.c b/security/security.c index 2f15488dc6bc..549bddcc2116 100644 --- a/security/security.c +++ b/security/security.c @@ -937,6 +937,11 @@ int security_task_create(unsigned long clone_flags) return call_int_hook(task_create, 0, clone_flags); } +int security_task_alloc(struct task_struct *task, unsigned long clone_flags) +{ + return call_int_hook(task_alloc, 0, task, clone_flags); +} + void security_task_free(struct task_struct *task) { call_void_hook(task_free, task); -- cgit v1.2.3 From 591a3d7c09fa08baff48ad86c2347dbd28a52753 Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Fri, 24 Mar 2017 14:13:05 +0300 Subject: mm: Fix false-positive VM_BUG_ON() in page_cache_{get,add}_speculative() 0day testing by Fengguang Wu triggered this crash while running Trinity: kernel BUG at include/linux/pagemap.h:151! ... CPU: 0 PID: 458 Comm: trinity-c0 Not tainted 4.11.0-rc2-00251-g2947ba0 #1 ... Call Trace: __get_user_pages_fast() get_user_pages_fast() get_futex_key() futex_requeue() do_futex() SyS_futex() do_syscall_64() entry_SYSCALL64_slow_path() It's a VM_BUG_ON() due to a false-negative in_atomic(). We call page_cache_get_speculative() with disabled local interrupts. It should be atomic enough. So let's check for disabled interrupts in the VM_BUG_ON() condition too, to resolve this. ( This got triggered by the conversion of the x86 GUP code to the generic GUP code. ) Reported-by: Fengguang Wu Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Aneesh Kumar K.V Cc: Kirill A. Shutemov Cc: LKP Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170324114709.pcytvyb3d6ajux33@black.fi.intel.com Signed-off-by: Ingo Molnar --- include/linux/pagemap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux')
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 84943e8057ef..316a19f6b635 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -148,7 +148,7 @@ static inline int page_cache_get_speculative(struct page *page) #ifdef CONFIG_TINY_RCU # ifdef CONFIG_PREEMPT_COUNT - VM_BUG_ON(!in_atomic()); + VM_BUG_ON(!in_atomic() && !irqs_disabled()); # endif /* * Preempt must be disabled here - we rely on rcu_read_lock doing @@ -186,7 +186,7 @@ static inline int page_cache_add_speculative(struct page *page, int count) #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) # ifdef CONFIG_PREEMPT_COUNT - VM_BUG_ON(!in_atomic()); + VM_BUG_ON(!in_atomic() && !irqs_disabled()); # endif VM_BUG_ON_PAGE(page_count(page) == 0, page); page_ref_add(page, count); -- cgit v1.2.3
From 011d8261117249eab97bc86a8e1ac7731e03e319 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 27 Mar 2017 11:33:02 +0200 Subject: RAS: Add a Corrected Errors Collector Introduce a simple data structure for collecting correctable errors along with accessors. More detailed description in the code itself. The error decoding is done with the decoding chain now and mce_first_notifier() gets to see the error first and the CEC decides whether to log it and then the rest of the chain doesn't hear about it - basically the main reason for the CE collector - or to continue running the notifiers. When the CEC hits the action threshold, it will try to soft-offline the page containing the corrected error and then the whole decoding chain gets to see the error.
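As a worked example of the element encoding described in the new cec.c below (on 4K-page systems PAGE_SHIFT == 12, so DECAY_BITS == 2 and COUNT_BITS == 10), the first insertion of, say, pfn 0x12345 stores

	(0x12345ULL << 12)	/* PFN in bits 63..12 */
		| (0x3 << 10)	/* generation, set to 11b on every insertion */
		| 0x1;		/* error count, starting at 1 */

which is exactly the (pfn << PAGE_SHIFT) | (DECAY_MASK << COUNT_BITS) | 1 expression in cec_add_elem() further down.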
Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-edac Link: http://lkml.kernel.org/r/20170327093304.10683-5-bp@alien8.de Signed-off-by: Ingo Molnar --- Documentation/admin-guide/kernel-parameters.txt | 6 + arch/x86/include/asm/mce.h | 9 +- arch/x86/kernel/cpu/mcheck/mce.c | 191 +++++---- arch/x86/ras/Kconfig | 14 + drivers/ras/Makefile | 3 +- drivers/ras/cec.c | 532 ++++++++++++++++++++++++ drivers/ras/debugfs.c | 2 +- drivers/ras/debugfs.h | 8 + drivers/ras/ras.c | 11 + include/linux/ras.h | 13 +- 10 files changed, 706 insertions(+), 83 deletions(-) create mode 100644 drivers/ras/cec.c create mode 100644 drivers/ras/debugfs.h (limited to 'include/linux') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2ba45caabada..927b8b8854cd 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3172,6 +3172,12 @@ ramdisk_size= [RAM] Sizes of RAM disks in kilobytes See Documentation/blockdev/ramdisk.txt. + ras=option[,option,...] [KNL] RAS-specific options + + cec_disable [X86] + Disable the Correctable Errors Collector, + see CONFIG_RAS_CEC help text. + rcu_nocbs= [KNL] The argument is a cpu list, as described above. diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 0512dcc11750..c5ae545d27d8 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -191,10 +191,11 @@ extern struct mca_config mca_cfg; extern struct mca_msr_regs msr_ops; enum mce_notifier_prios { - MCE_PRIO_SRAO = INT_MAX, - MCE_PRIO_EXTLOG = INT_MAX - 1, - MCE_PRIO_NFIT = INT_MAX - 2, - MCE_PRIO_EDAC = INT_MAX - 3, + MCE_PRIO_FIRST = INT_MAX, + MCE_PRIO_SRAO = INT_MAX - 1, + MCE_PRIO_EXTLOG = INT_MAX - 2, + MCE_PRIO_NFIT = INT_MAX - 3, + MCE_PRIO_EDAC = INT_MAX - 4, MCE_PRIO_LOWEST = 0, }; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 8ada093d4e40..4a907758a516 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -160,47 +161,8 @@ static struct mce_log_buffer mcelog_buf = { void mce_log(struct mce *m) { - unsigned next, entry; - - /* Emit the trace record: */ - trace_mce_record(m); - if (!mce_gen_pool_add(m)) irq_work_queue(&mce_irq_work); - - wmb(); - for (;;) { - entry = mce_log_get_idx_check(mcelog_buf.next); - for (;;) { - - /* - * When the buffer fills up discard new entries. - * Assume that the earlier errors are the more - * interesting ones: - */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, - (unsigned long *)&mcelog_buf.flags); - return; - } - /* Old left over entry. Skip: */ - if (mcelog_buf.entry[entry].finished) { - entry++; - continue; - } - break; - } - smp_rmb(); - next = entry + 1; - if (cmpxchg(&mcelog_buf.next, entry, next) == entry) - break; - } - memcpy(mcelog_buf.entry + entry, m, sizeof(struct mce)); - wmb(); - mcelog_buf.entry[entry].finished = 1; - wmb(); - - set_bit(0, &mce_need_notify); } void mce_inject_log(struct mce *m) @@ -213,6 +175,12 @@ EXPORT_SYMBOL_GPL(mce_inject_log); static struct notifier_block mce_srao_nb; +/* + * We run the default notifier if we have only the SRAO, the first and the + * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS + * notifiers registered on the chain. 
+ */ +#define NUM_DEFAULT_NOTIFIERS 3 static atomic_t num_notifiers; void mce_register_decode_chain(struct notifier_block *nb) @@ -522,7 +490,6 @@ static void mce_schedule_work(void) static void mce_irq_work_cb(struct irq_work *entry) { - mce_notify_irq(); mce_schedule_work(); } @@ -565,6 +532,111 @@ static int mce_usable_address(struct mce *m) return 1; } +static bool memory_error(struct mce *m) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (c->x86_vendor == X86_VENDOR_AMD) { + /* ErrCodeExt[20:16] */ + u8 xec = (m->status >> 16) & 0x1f; + + return (xec == 0x0 || xec == 0x8); + } else if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * Intel SDM Volume 3B - 15.9.2 Compound Error Codes + * + * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for + * indicating a memory error. Bit 8 is used for indicating a + * cache hierarchy error. The combination of bit 2 and bit 3 + * is used for indicating a `generic' cache hierarchy error + * But we can't just blindly check the above bits, because if + * bit 11 is set, then it is a bus/interconnect error - and + * either way the above bits just gives more detail on what + * bus/interconnect error happened. Note that bit 12 can be + * ignored, as it's the "filter" bit. + */ + return (m->status & 0xef80) == BIT(7) || + (m->status & 0xef00) == BIT(8) || + (m->status & 0xeffc) == 0xc; + } + + return false; +} + +static bool cec_add_mce(struct mce *m) +{ + if (!m) + return false; + + /* We eat only correctable DRAM errors with usable addresses. */ + if (memory_error(m) && + !(m->status & MCI_STATUS_UC) && + mce_usable_address(m)) + if (!cec_add_elem(m->addr >> PAGE_SHIFT)) + return true; + + return false; +} + +static int mce_first_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + unsigned int next, entry; + + if (!m) + return NOTIFY_DONE; + + if (cec_add_mce(m)) + return NOTIFY_STOP; + + /* Emit the trace record: */ + trace_mce_record(m); + + wmb(); + for (;;) { + entry = mce_log_get_idx_check(mcelog_buf.next); + for (;;) { + + /* + * When the buffer fills up discard new entries. + * Assume that the earlier errors are the more + * interesting ones: + */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, + (unsigned long *)&mcelog_buf.flags); + return NOTIFY_DONE; + } + /* Old left over entry. Skip: */ + if (mcelog_buf.entry[entry].finished) { + entry++; + continue; + } + break; + } + smp_rmb(); + next = entry + 1; + if (cmpxchg(&mcelog_buf.next, entry, next) == entry) + break; + } + memcpy(mcelog_buf.entry + entry, m, sizeof(struct mce)); + wmb(); + mcelog_buf.entry[entry].finished = 1; + wmb(); + + set_bit(0, &mce_need_notify); + + mce_notify_irq(); + + return NOTIFY_DONE; +} + +static struct notifier_block first_nb = { + .notifier_call = mce_first_notifier, + .priority = MCE_PRIO_FIRST, +}; + static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -594,11 +666,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val, if (!m) return NOTIFY_DONE; - /* - * Run the default notifier if we have only the SRAO - * notifier and us registered. 
- */ - if (atomic_read(&num_notifiers) > 2) + if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS) return NOTIFY_DONE; /* Don't print when mcelog is running */ @@ -655,37 +723,6 @@ static void mce_read_aux(struct mce *m, int i) } } -static bool memory_error(struct mce *m) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - if (c->x86_vendor == X86_VENDOR_AMD) { - /* ErrCodeExt[20:16] */ - u8 xec = (m->status >> 16) & 0x1f; - - return (xec == 0x0 || xec == 0x8); - } else if (c->x86_vendor == X86_VENDOR_INTEL) { - /* - * Intel SDM Volume 3B - 15.9.2 Compound Error Codes - * - * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for - * indicating a memory error. Bit 8 is used for indicating a - * cache hierarchy error. The combination of bit 2 and bit 3 - * is used for indicating a `generic' cache hierarchy error - * But we can't just blindly check the above bits, because if - * bit 11 is set, then it is a bus/interconnect error - and - * either way the above bits just gives more detail on what - * bus/interconnect error happened. Note that bit 12 can be - * ignored, as it's the "filter" bit. - */ - return (m->status & 0xef80) == BIT(7) || - (m->status & 0xef00) == BIT(8) || - (m->status & 0xeffc) == 0xc; - } - - return false; -} - DEFINE_PER_CPU(unsigned, mce_poll_count); /* @@ -2167,6 +2204,7 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { mcheck_intel_therm_init(); + mce_register_decode_chain(&first_nb); mce_register_decode_chain(&mce_srao_nb); mce_register_decode_chain(&mce_default_nb); mcheck_vendor_init_severity(); @@ -2716,6 +2754,7 @@ static int __init mcheck_late_init(void) static_branch_inc(&mcsafe_key); mcheck_debugfs_init(); + cec_init(); /* * Flush out everything that has been logged during early boot, now that diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig index 0bc60a308730..2a2d89d39af6 100644 --- a/arch/x86/ras/Kconfig +++ b/arch/x86/ras/Kconfig @@ -7,3 +7,17 @@ config MCE_AMD_INJ aspects of the MCE handling code. WARNING: Do not even assume this interface is staying stable! + +config RAS_CEC + bool "Correctable Errors Collector" + depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS + ---help--- + This is a small cache which collects correctable memory errors per 4K + page PFN and counts their repeated occurrence. Once the counter for a + PFN overflows, we try to soft-offline that page as we take it to mean + that it has reached a relatively high error count and would probably + be best if we don't use it anymore. + + Bear in mind that this is absolutely useless if your platform doesn't + have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS. + diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile index d7f73341ced3..7b26dd3aa5d0 100644 --- a/drivers/ras/Makefile +++ b/drivers/ras/Makefile @@ -1 +1,2 @@ -obj-$(CONFIG_RAS) += ras.o debugfs.o +obj-$(CONFIG_RAS) += ras.o debugfs.o +obj-$(CONFIG_RAS_CEC) += cec.o diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c new file mode 100644 index 000000000000..6aab46d91d33 --- /dev/null +++ b/drivers/ras/cec.c @@ -0,0 +1,532 @@ +#include +#include +#include + +#include + +#include "debugfs.h" + +/* + * RAS Correctable Errors Collector + * + * This is a simple gadget which collects correctable errors and counts their + * occurrence per physical page address. + * + * We've opted for possibly the simplest data structure to collect those - an + * array of the size of a memory page. It stores 512 u64's with the following + * structure: + * + * [63 ... PFN ... 12 | 11 ... generation ... 
10 | 9 ... count ... 0] + * + * The generation in the two highest order bits is two bits which are set to 11b + * on every insertion. During the course of each entry's existence, the + * generation field gets decremented during spring cleaning to 10b, then 01b and + * then 00b. + * + * This way we're employing the natural numeric ordering to make sure that newly + * inserted/touched elements have higher 12-bit counts (which we've manufactured) + * and thus iterating over the array initially won't kick out those elements + * which were inserted last. + * + * Spring cleaning is what we do when we reach a certain number CLEAN_ELEMS of + * elements entered into the array, during which, we're decaying all elements. + * If, after decay, an element gets inserted again, its generation is set to 11b + * to make sure it has higher numerical count than other, older elements and + * thus emulate an LRU-like behavior when deleting elements to free up space + * in the page. + * + * When an element reaches its max count of count_threshold, we try to poison + * it by assuming that errors triggered count_threshold times in a single page + * are excessive and that page shouldn't be used anymore. count_threshold is + * initialized to COUNT_MASK which is the maximum. + * + * That error event entry causes cec_add_elem() to return !0 value and thus + * signal to its callers to log the error. + * + * To the question why we've chosen a page and moving elements around with + * memmove(), it is because it is a very simple structure to handle and max data + * movement is 4K which on highly optimized modern CPUs is almost unnoticeable. + * We wanted to avoid the pointer traversal of more complex structures like a + * linked list or some sort of a balancing search tree. + * + * Deleting an element takes O(n) but since it is only a single page, it should + * be fast enough and it shouldn't happen all too often depending on error + * patterns. + */ + +#undef pr_fmt +#define pr_fmt(fmt) "RAS: " fmt + +/* + * We use DECAY_BITS bits of PAGE_SHIFT bits for counting decay, i.e., how long + * elements have stayed in the array without having been accessed again. + */ +#define DECAY_BITS 2 +#define DECAY_MASK ((1ULL << DECAY_BITS) - 1) +#define MAX_ELEMS (PAGE_SIZE / sizeof(u64)) + +/* + * Threshold amount of inserted elements after which we start spring + * cleaning. + */ +#define CLEAN_ELEMS (MAX_ELEMS >> DECAY_BITS) + +/* Bits which count the number of errors happened in this 4K page. */ +#define COUNT_BITS (PAGE_SHIFT - DECAY_BITS) +#define COUNT_MASK ((1ULL << COUNT_BITS) - 1) +#define FULL_COUNT_MASK (PAGE_SIZE - 1) + +/* + * u64: [ 63 ... 12 | DECAY_BITS | COUNT_BITS ] + */ + +#define PFN(e) ((e) >> PAGE_SHIFT) +#define DECAY(e) (((e) >> COUNT_BITS) & DECAY_MASK) +#define COUNT(e) ((unsigned int)(e) & COUNT_MASK) +#define FULL_COUNT(e) ((e) & (PAGE_SIZE - 1)) + +static struct ce_array { + u64 *array; /* container page */ + unsigned int n; /* number of elements in the array */ + + unsigned int decay_count; /* + * number of element insertions/increments + * since the last spring cleaning. + */ + + u64 pfns_poisoned; /* + * number of PFNs which got poisoned. + */ + + u64 ces_entered; /* + * The number of correctable errors + * entered into the collector. + */ + + u64 decays_done; /* + * Times we did spring cleaning.
+ */ + + union { + struct { + __u32 disabled : 1, /* cmdline disabled */ + __resv : 31; + }; + __u32 flags; + }; +} ce_arr; + +static DEFINE_MUTEX(ce_mutex); +static u64 dfs_pfn; + +/* Amount of errors after which we offline */ +static unsigned int count_threshold = COUNT_MASK; + +/* + * The timer "decays" element count each timer_interval which is 24hrs by + * default. + */ + +#define CEC_TIMER_DEFAULT_INTERVAL 24 * 60 * 60 /* 24 hrs */ +#define CEC_TIMER_MIN_INTERVAL 1 * 60 * 60 /* 1h */ +#define CEC_TIMER_MAX_INTERVAL 30 * 24 * 60 * 60 /* one month */ +static struct timer_list cec_timer; +static u64 timer_interval = CEC_TIMER_DEFAULT_INTERVAL; + +/* + * Decrement decay value. We're using DECAY_BITS bits to denote decay of an + * element in the array. On insertion and any access, it gets reset to max. + */ +static void do_spring_cleaning(struct ce_array *ca) +{ + int i; + + for (i = 0; i < ca->n; i++) { + u8 decay = DECAY(ca->array[i]); + + if (!decay) + continue; + + decay--; + + ca->array[i] &= ~(DECAY_MASK << COUNT_BITS); + ca->array[i] |= (decay << COUNT_BITS); + } + ca->decay_count = 0; + ca->decays_done++; +} + +/* + * @interval in seconds + */ +static void cec_mod_timer(struct timer_list *t, unsigned long interval) +{ + unsigned long iv; + + iv = interval * HZ + jiffies; + + mod_timer(t, round_jiffies(iv)); +} + +static void cec_timer_fn(unsigned long data) +{ + struct ce_array *ca = (struct ce_array *)data; + + do_spring_cleaning(ca); + + cec_mod_timer(&cec_timer, timer_interval); +} + +/* + * @to: index of the smallest element which is >= then @pfn. + * + * Return the index of the pfn if found, otherwise negative value. + */ +static int __find_elem(struct ce_array *ca, u64 pfn, unsigned int *to) +{ + u64 this_pfn; + int min = 0, max = ca->n; + + while (min < max) { + int tmp = (max + min) >> 1; + + this_pfn = PFN(ca->array[tmp]); + + if (this_pfn < pfn) + min = tmp + 1; + else if (this_pfn > pfn) + max = tmp; + else { + min = tmp; + break; + } + } + + if (to) + *to = min; + + this_pfn = PFN(ca->array[min]); + + if (this_pfn == pfn) + return min; + + return -ENOKEY; +} + +static int find_elem(struct ce_array *ca, u64 pfn, unsigned int *to) +{ + WARN_ON(!to); + + if (!ca->n) { + *to = 0; + return -ENOKEY; + } + return __find_elem(ca, pfn, to); +} + +static void del_elem(struct ce_array *ca, int idx) +{ + /* Save us a function call when deleting the last element. */ + if (ca->n - (idx + 1)) + memmove((void *)&ca->array[idx], + (void *)&ca->array[idx + 1], + (ca->n - (idx + 1)) * sizeof(u64)); + + ca->n--; +} + +static u64 del_lru_elem_unlocked(struct ce_array *ca) +{ + unsigned int min = FULL_COUNT_MASK; + int i, min_idx = 0; + + for (i = 0; i < ca->n; i++) { + unsigned int this = FULL_COUNT(ca->array[i]); + + if (min > this) { + min = this; + min_idx = i; + } + } + + del_elem(ca, min_idx); + + return PFN(ca->array[min_idx]); +} + +/* + * We return the 0th pfn in the error case under the assumption that it cannot + * be poisoned and excessive CEs in there are a serious deal anyway. + */ +static u64 __maybe_unused del_lru_elem(void) +{ + struct ce_array *ca = &ce_arr; + u64 pfn; + + if (!ca->n) + return 0; + + mutex_lock(&ce_mutex); + pfn = del_lru_elem_unlocked(ca); + mutex_unlock(&ce_mutex); + + return pfn; +} + + +int cec_add_elem(u64 pfn) +{ + struct ce_array *ca = &ce_arr; + unsigned int to; + int count, ret = 0; + + /* + * We can be called very early on the identify_cpu() path where we are + * not initialized yet. We ignore the error for simplicity. 
+ */ + if (!ce_arr.array || ce_arr.disabled) + return -ENODEV; + + ca->ces_entered++; + + mutex_lock(&ce_mutex); + + if (ca->n == MAX_ELEMS) + WARN_ON(!del_lru_elem_unlocked(ca)); + + ret = find_elem(ca, pfn, &to); + if (ret < 0) { + /* + * Shift range [to-end] to make room for one more element. + */ + memmove((void *)&ca->array[to + 1], + (void *)&ca->array[to], + (ca->n - to) * sizeof(u64)); + + ca->array[to] = (pfn << PAGE_SHIFT) | + (DECAY_MASK << COUNT_BITS) | 1; + + ca->n++; + + ret = 0; + + goto decay; + } + + count = COUNT(ca->array[to]); + + if (count < count_threshold) { + ca->array[to] |= (DECAY_MASK << COUNT_BITS); + ca->array[to]++; + + ret = 0; + } else { + u64 pfn = ca->array[to] >> PAGE_SHIFT; + + if (!pfn_valid(pfn)) { + pr_warn("CEC: Invalid pfn: 0x%llx\n", pfn); + } else { + /* We have reached max count for this page, soft-offline it. */ + pr_err("Soft-offlining pfn: 0x%llx\n", pfn); + memory_failure_queue(pfn, 0, MF_SOFT_OFFLINE); + ca->pfns_poisoned++; + } + + del_elem(ca, to); + + /* + * Return a >0 value to denote that we've reached the offlining + * threshold. + */ + ret = 1; + + goto unlock; + } + +decay: + ca->decay_count++; + + if (ca->decay_count >= CLEAN_ELEMS) + do_spring_cleaning(ca); + +unlock: + mutex_unlock(&ce_mutex); + + return ret; +} + +static int u64_get(void *data, u64 *val) +{ + *val = *(u64 *)data; + + return 0; +} + +static int pfn_set(void *data, u64 val) +{ + *(u64 *)data = val; + + return cec_add_elem(val); +} + +DEFINE_DEBUGFS_ATTRIBUTE(pfn_ops, u64_get, pfn_set, "0x%llx\n"); + +static int decay_interval_set(void *data, u64 val) +{ + *(u64 *)data = val; + + if (val < CEC_TIMER_MIN_INTERVAL) + return -EINVAL; + + if (val > CEC_TIMER_MAX_INTERVAL) + return -EINVAL; + + timer_interval = val; + + cec_mod_timer(&cec_timer, timer_interval); + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(decay_interval_ops, u64_get, decay_interval_set, "%lld\n"); + +static int count_threshold_set(void *data, u64 val) +{ + *(u64 *)data = val; + + if (val > COUNT_MASK) + val = COUNT_MASK; + + count_threshold = val; + + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(count_threshold_ops, u64_get, count_threshold_set, "%lld\n"); + +static int array_dump(struct seq_file *m, void *v) +{ + struct ce_array *ca = &ce_arr; + u64 prev = 0; + int i; + + mutex_lock(&ce_mutex); + + seq_printf(m, "{ n: %d\n", ca->n); + for (i = 0; i < ca->n; i++) { + u64 this = PFN(ca->array[i]); + + seq_printf(m, " %03d: [%016llx|%03llx]\n", i, this, FULL_COUNT(ca->array[i])); + + WARN_ON(prev > this); + + prev = this; + } + + seq_printf(m, "}\n"); + + seq_printf(m, "Stats:\nCEs: %llu\nofflined pages: %llu\n", + ca->ces_entered, ca->pfns_poisoned); + + seq_printf(m, "Flags: 0x%x\n", ca->flags); + + seq_printf(m, "Timer interval: %lld seconds\n", timer_interval); + seq_printf(m, "Decays: %lld\n", ca->decays_done); + + seq_printf(m, "Action threshold: %d\n", count_threshold); + + mutex_unlock(&ce_mutex); + + return 0; +} + +static int array_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, array_dump, NULL); +} + +static const struct file_operations array_ops = { + .owner = THIS_MODULE, + .open = array_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init create_debugfs_nodes(void) +{ + struct dentry *d, *pfn, *decay, *count, *array; + + d = debugfs_create_dir("cec", ras_debugfs_dir); + if (!d) { + pr_warn("Error creating cec debugfs node!\n"); + return -1; + } + + pfn = debugfs_create_file("pfn", S_IRUSR | S_IWUSR, d, &dfs_pfn, &pfn_ops); + 
if (!pfn) { + pr_warn("Error creating pfn debugfs node!\n"); + goto err; + } + + array = debugfs_create_file("array", S_IRUSR, d, NULL, &array_ops); + if (!array) { + pr_warn("Error creating array debugfs node!\n"); + goto err; + } + + decay = debugfs_create_file("decay_interval", S_IRUSR | S_IWUSR, d, + &timer_interval, &decay_interval_ops); + if (!decay) { + pr_warn("Error creating decay_interval debugfs node!\n"); + goto err; + } + + count = debugfs_create_file("count_threshold", S_IRUSR | S_IWUSR, d, + &count_threshold, &count_threshold_ops); + if (!count) { + pr_warn("Error creating count_threshold debugfs node!\n"); + goto err; + } + + + return 0; + +err: + debugfs_remove_recursive(d); + + return 1; +} + +void __init cec_init(void) +{ + if (ce_arr.disabled) + return; + + ce_arr.array = (void *)get_zeroed_page(GFP_KERNEL); + if (!ce_arr.array) { + pr_err("Error allocating CE array page!\n"); + return; + } + + if (create_debugfs_nodes()) + return; + + setup_timer(&cec_timer, cec_timer_fn, (unsigned long)&ce_arr); + cec_mod_timer(&cec_timer, CEC_TIMER_DEFAULT_INTERVAL); + + pr_info("Correctable Errors collector initialized.\n"); +} + +int __init parse_cec_param(char *str) +{ + if (!str) + return 0; + + if (*str == '=') + str++; + + if (!strncmp(str, "cec_disable", 7)) + ce_arr.disabled = 1; + else + return 0; + + return 1; +}
diff --git a/drivers/ras/debugfs.c b/drivers/ras/debugfs.c index 0322acf67ea5..501603057dff 100644 --- a/drivers/ras/debugfs.c +++ b/drivers/ras/debugfs.c @@ -1,6 +1,6 @@ #include -static struct dentry *ras_debugfs_dir; +struct dentry *ras_debugfs_dir; static atomic_t trace_count = ATOMIC_INIT(0);
diff --git a/drivers/ras/debugfs.h b/drivers/ras/debugfs.h new file mode 100644 index 000000000000..db72e4513191 --- /dev/null +++ b/drivers/ras/debugfs.h @@ -0,0 +1,8 @@ +#ifndef __RAS_DEBUGFS_H__ +#define __RAS_DEBUGFS_H__ + +#include + +extern struct dentry *ras_debugfs_dir; + +#endif /* __RAS_DEBUGFS_H__ */
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index b67dd362b7b6..94f8038864b4 100644 --- a/drivers/ras/ras.c +++ b/drivers/ras/ras.c @@ -27,3 +27,14 @@ subsys_initcall(ras_init); EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event); #endif EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event); + + +int __init parse_ras_param(char *str) +{ +#ifdef CONFIG_RAS_CEC + parse_cec_param(str); +#endif + + return 1; +} +__setup("ras", parse_ras_param);
diff --git a/include/linux/ras.h b/include/linux/ras.h index 2aceeafd6fe5..ffb147185e8d 100644 --- a/include/linux/ras.h +++ b/include/linux/ras.h @@ -1,14 +1,25 @@ #ifndef __RAS_H__ #define __RAS_H__ +#include + #ifdef CONFIG_DEBUG_FS int ras_userspace_consumers(void); void ras_debugfs_init(void); int ras_add_daemon_trace(void); #else static inline int ras_userspace_consumers(void) { return 0; } -static inline void ras_debugfs_init(void) { return; } +static inline void ras_debugfs_init(void) { } static inline int ras_add_daemon_trace(void) { return 0; } #endif +#ifdef CONFIG_RAS_CEC +void __init cec_init(void); +int __init parse_cec_param(char *str); +int cec_add_elem(u64 pfn); +#else +static inline void __init cec_init(void) { } +static inline int cec_add_elem(u64 pfn) { return -ENODEV; } #endif + +#endif /* __RAS_H__ */ -- cgit v1.2.3
From 4f9ab0c1570800002e77515888600ca2e3dce4a9 Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Fri, 24 Mar 2017 09:47:31 +0100 Subject: mfd: cpcap: Add missing include dependencies This fixes compilation for files that try to include the cpcap header in alphabetically sorted #include lists.
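The failure mode is easy to picture with an illustrative consumer whose include list sorts the cpcap header first, so it cannot rely on a later include pulling in its dependencies:

	#include <linux/mfd/motorola-cpcap.h>	/* sorts first, must be self-contained */
	#include <linux/module.h>
	#include <linux/platform_device.h>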
Acked-by: Pavel Machek Acked-by: Tony Lindgren Signed-off-by: Sebastian Reichel Signed-off-by: Lee Jones --- include/linux/mfd/motorola-cpcap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/motorola-cpcap.h b/include/linux/mfd/motorola-cpcap.h index b4031c2b2214..53758a7d7c32 100644 --- a/include/linux/mfd/motorola-cpcap.h +++ b/include/linux/mfd/motorola-cpcap.h @@ -14,6 +14,9 @@ * published by the Free Software Foundation. */ +#include +#include + #define CPCAP_VENDOR_ST 0 #define CPCAP_VENDOR_TI 1 -- cgit v1.2.3 From 07814246dd5530860ef758fd9b2b5f2e26472aa2 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 16 Mar 2017 17:13:30 +0100 Subject: USB: serial: allow subdrivers to modify port-endpoint mapping Allow subdrivers to modify the port-endpoint mapping by passing the endpoint descriptors to calc_num_ports. The callback can now also be used to verify that the required endpoints exists and abort probing otherwise. This will allow us to get rid of a few hacks in subdrivers that are already modifying the port-endpoint mapping (or aborting probe due to missing endpoints), but only after the port structures have been setup. Signed-off-by: Johan Hovold --- drivers/usb/serial/f81534.c | 3 ++- drivers/usb/serial/ipaq.c | 6 ++++-- drivers/usb/serial/mos7720.c | 3 ++- drivers/usb/serial/mos7840.c | 3 ++- drivers/usb/serial/mxuport.c | 3 ++- drivers/usb/serial/quatech2.c | 3 ++- drivers/usb/serial/sierra.c | 3 ++- drivers/usb/serial/usb-serial.c | 19 ++++++------------- drivers/usb/serial/visor.c | 6 ++++-- include/linux/usb/serial.h | 19 ++++++++++++++++--- 10 files changed, 42 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/serial/f81534.c b/drivers/usb/serial/f81534.c index 22f23a429a95..385087c008ed 100644 --- a/drivers/usb/serial/f81534.c +++ b/drivers/usb/serial/f81534.c @@ -611,7 +611,8 @@ static int f81534_find_config_idx(struct usb_serial *serial, u8 *index) * The f81534_calc_num_ports() will run to "new style" with checking * F81534_PORT_UNAVAILABLE section. 
*/ -static int f81534_calc_num_ports(struct usb_serial *serial) +static int f81534_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) { u8 setting[F81534_CUSTOM_DATA_SIZE]; u8 setting_idx; diff --git a/drivers/usb/serial/ipaq.c b/drivers/usb/serial/ipaq.c index ec1b8f2c1183..df5f1a7d7c6f 100644 --- a/drivers/usb/serial/ipaq.c +++ b/drivers/usb/serial/ipaq.c @@ -33,7 +33,8 @@ static int initial_wait; /* Function prototypes for an ipaq */ static int ipaq_open(struct tty_struct *tty, struct usb_serial_port *port); -static int ipaq_calc_num_ports(struct usb_serial *serial); +static int ipaq_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds); static int ipaq_startup(struct usb_serial *serial); static const struct usb_device_id ipaq_id_table[] = { @@ -550,7 +551,8 @@ static int ipaq_open(struct tty_struct *tty, return usb_serial_generic_open(tty, port); } -static int ipaq_calc_num_ports(struct usb_serial *serial) +static int ipaq_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) { /* * some devices have 3 endpoints, the 3rd of which diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c index df45ebad5f6f..9ec3e4fb9678 100644 --- a/drivers/usb/serial/mos7720.c +++ b/drivers/usb/serial/mos7720.c @@ -973,7 +973,8 @@ static void mos7720_bulk_out_data_callback(struct urb *urb) tty_port_tty_wakeup(&mos7720_port->port->port); } -static int mos77xx_calc_num_ports(struct usb_serial *serial) +static int mos77xx_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) { u16 product = le16_to_cpu(serial->dev->descriptor.idProduct); if (product == MOSCHIP_DEVICE_ID_7715) diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c index 3821c53fcee9..326d6c5055ef 100644 --- a/drivers/usb/serial/mos7840.c +++ b/drivers/usb/serial/mos7840.c @@ -2104,7 +2104,8 @@ out: return 0; } -static int mos7840_calc_num_ports(struct usb_serial *serial) +static int mos7840_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) { int device_type = (unsigned long)usb_get_serial_data(serial); int mos7840_num_ports; diff --git a/drivers/usb/serial/mxuport.c b/drivers/usb/serial/mxuport.c index c88215a0fa3d..bf543e6c05ea 100644 --- a/drivers/usb/serial/mxuport.c +++ b/drivers/usb/serial/mxuport.c @@ -946,7 +946,8 @@ out: * Determine how many ports this device has dynamically. It will be * called after the probe() callback is called, but before attach(). 
*/ -static int mxuport_calc_num_ports(struct usb_serial *serial) +static int mxuport_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) { unsigned long features = (unsigned long)usb_get_serial_data(serial); diff --git a/drivers/usb/serial/quatech2.c b/drivers/usb/serial/quatech2.c index fdbb904d153f..6ddcaa2de902 100644 --- a/drivers/usb/serial/quatech2.c +++ b/drivers/usb/serial/quatech2.c @@ -246,7 +246,8 @@ static inline int update_mctrl(struct qt2_port_private *port_priv, return status; } -static int qt2_calc_num_ports(struct usb_serial *serial) +static int qt2_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) { struct qt2_device_detail d; int i; diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c index 465e851b2815..4c4ac4705ac0 100644 --- a/drivers/usb/serial/sierra.c +++ b/drivers/usb/serial/sierra.c @@ -85,7 +85,8 @@ static int sierra_vsc_set_nmea(struct usb_device *udev, __u16 enable) USB_CTRL_SET_TIMEOUT); /* int timeout */ } -static int sierra_calc_num_ports(struct usb_serial *serial) +static int sierra_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) { int num_ports = 0; u8 ifnum, numendpoints; diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index 747dd414bef9..f0761f491c5f 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -710,17 +710,6 @@ static const struct tty_port_operations serial_port_ops = { .shutdown = serial_port_shutdown, }; -struct usb_serial_endpoints { - unsigned char num_bulk_in; - unsigned char num_bulk_out; - unsigned char num_interrupt_in; - unsigned char num_interrupt_out; - struct usb_endpoint_descriptor *bulk_in[MAX_NUM_PORTS]; - struct usb_endpoint_descriptor *bulk_out[MAX_NUM_PORTS]; - struct usb_endpoint_descriptor *interrupt_in[MAX_NUM_PORTS]; - struct usb_endpoint_descriptor *interrupt_out[MAX_NUM_PORTS]; -}; - static void find_endpoints(struct usb_serial *serial, struct usb_serial_endpoints *epds) { @@ -875,8 +864,12 @@ static int usb_serial_probe(struct usb_interface *interface, #endif if (!num_ports) { /* if this device type has a calc_num_ports function, call it */ - if (type->calc_num_ports) - num_ports = type->calc_num_ports(serial); + if (type->calc_num_ports) { + retval = type->calc_num_ports(serial, epds); + if (retval < 0) + goto err_free_epds; + num_ports = retval; + } if (!num_ports) num_ports = type->num_ports; } diff --git a/drivers/usb/serial/visor.c b/drivers/usb/serial/visor.c index 337a0be89fcf..3f943f877ac2 100644 --- a/drivers/usb/serial/visor.c +++ b/drivers/usb/serial/visor.c @@ -40,7 +40,8 @@ static int visor_open(struct tty_struct *tty, struct usb_serial_port *port); static void visor_close(struct usb_serial_port *port); static int visor_probe(struct usb_serial *serial, const struct usb_device_id *id); -static int visor_calc_num_ports(struct usb_serial *serial); +static int visor_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds); static void visor_read_int_callback(struct urb *urb); static int clie_3_5_startup(struct usb_serial *serial); static int treo_attach(struct usb_serial *serial); @@ -466,7 +467,8 @@ static int visor_probe(struct usb_serial *serial, return retval; } -static int visor_calc_num_ports(struct usb_serial *serial) +static int visor_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) { int num_ports = (int)(long)(usb_get_serial_data(serial)); diff --git a/include/linux/usb/serial.h 
b/include/linux/usb/serial.h index f1b8a8493762..da528818cfd8 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -181,6 +181,17 @@ static inline void usb_set_serial_data(struct usb_serial *serial, void *data) serial->private = data; } +struct usb_serial_endpoints { + unsigned char num_bulk_in; + unsigned char num_bulk_out; + unsigned char num_interrupt_in; + unsigned char num_interrupt_out; + struct usb_endpoint_descriptor *bulk_in[MAX_NUM_PORTS]; + struct usb_endpoint_descriptor *bulk_out[MAX_NUM_PORTS]; + struct usb_endpoint_descriptor *interrupt_in[MAX_NUM_PORTS]; + struct usb_endpoint_descriptor *interrupt_out[MAX_NUM_PORTS]; +}; + /** * usb_serial_driver - describes a usb serial driver * @description: pointer to a string that describes this driver. This string @@ -196,8 +207,9 @@ static inline void usb_set_serial_data(struct usb_serial *serial, void *data) * (0 = end-point size) * @bulk_out_size: bytes to allocate for bulk-out buffer (0 = end-point size) * @calc_num_ports: pointer to a function to determine how many ports this - * device has dynamically. It will be called after the probe() - * callback is called, but before attach() + * device has dynamically. It can also be used to verify the number of + * endpoints or to modify the port-endpoint mapping. It will be called + * after the probe() callback is called, but before attach(). * @probe: pointer to the driver's probe function. * This will be called when the device is inserted into the system, * but before the device has been fully initialized by the usb_serial @@ -249,7 +261,8 @@ struct usb_serial_driver { int (*probe)(struct usb_serial *serial, const struct usb_device_id *id); int (*attach)(struct usb_serial *serial); - int (*calc_num_ports) (struct usb_serial *serial); + int (*calc_num_ports)(struct usb_serial *serial, + struct usb_serial_endpoints *epds); void (*disconnect)(struct usb_serial *serial); void (*release)(struct usb_serial *serial); -- cgit v1.2.3 From a794499b261b8487a984783ccc864975e1bcc7bf Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 16 Mar 2017 17:13:32 +0100 Subject: USB: serial: add calc_num_ports callback to generic driver Add a calc_num_ports callback to the generic driver and verify that the device has the required endpoints there instead of in core. Note that the generic driver num_ports field was never used. 
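As a rough sketch of what the extended callback contract allows (the foo_calc_num_ports() fragment below is hypothetical, not code from this series), a driver can now validate its endpoint layout and derive the port count in one place:

	static int foo_calc_num_ports(struct usb_serial *serial,
				      struct usb_serial_endpoints *epds)
	{
		/* Refuse to bind unless the expected endpoints are present. */
		if (epds->num_bulk_in < 2 || epds->num_bulk_out < 2) {
			dev_err(&serial->interface->dev, "missing endpoints\n");
			return -ENODEV;
		}

		/* A negative return aborts probe; a positive return is the port count. */
		return 2;
	}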
Signed-off-by: Johan Hovold --- drivers/usb/serial/generic.c | 18 ++++++++++++++++-- drivers/usb/serial/usb-serial.c | 27 ++++++++------------------- include/linux/usb/serial.h | 1 - 3 files changed, 24 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c index 8c7600472019..2d3599f014e2 100644 --- a/drivers/usb/serial/generic.c +++ b/drivers/usb/serial/generic.c @@ -48,14 +48,28 @@ static int usb_serial_generic_probe(struct usb_serial *serial, return 0; } -struct usb_serial_driver usb_serial_generic_device = { +static int usb_serial_generic_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) +{ + struct device *dev = &serial->interface->dev; + int num_ports = epds->num_bulk_out; + + if (num_ports == 0) { + dev_err(dev, "Generic device with no bulk out, not allowed.\n"); + return -ENODEV; + } + + return num_ports; +} + +static struct usb_serial_driver usb_serial_generic_device = { .driver = { .owner = THIS_MODULE, .name = "generic", }, .id_table = generic_device_ids, - .num_ports = 1, .probe = usb_serial_generic_probe, + .calc_num_ports = usb_serial_generic_calc_num_ports, .throttle = usb_serial_generic_throttle, .unthrottle = usb_serial_generic_unthrottle, .resume = usb_serial_generic_resume, diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index f8ae09e2cff5..101eb105d78e 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -850,28 +850,17 @@ static int usb_serial_probe(struct usb_interface *interface, retval = -ENODEV; goto err_free_epds; } -#ifdef CONFIG_USB_SERIAL_GENERIC - if (type == &usb_serial_generic_device) { - num_ports = epds->num_bulk_out; - if (num_ports == 0) { - dev_err(ddev, "Generic device with no bulk out, not allowed.\n"); - retval = -EIO; + + if (type->calc_num_ports) { + retval = type->calc_num_ports(serial, epds); + if (retval < 0) goto err_free_epds; - } - } -#endif - if (!num_ports) { - /* if this device type has a calc_num_ports function, call it */ - if (type->calc_num_ports) { - retval = type->calc_num_ports(serial, epds); - if (retval < 0) - goto err_free_epds; - num_ports = retval; - } - if (!num_ports) - num_ports = type->num_ports; + num_ports = retval; } + if (!num_ports) + num_ports = type->num_ports; + if (num_ports > MAX_NUM_PORTS) { dev_warn(ddev, "too many ports requested: %d\n", num_ports); num_ports = MAX_NUM_PORTS; } diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index da528818cfd8..e2f0ab07eea5 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -379,7 +379,6 @@ extern void usb_serial_handle_dcd_change(struct usb_serial_port *usb_port, extern int usb_serial_bus_register(struct usb_serial_driver *device); extern void usb_serial_bus_deregister(struct usb_serial_driver *device); -extern struct usb_serial_driver usb_serial_generic_device; extern struct bus_type usb_serial_bus_type; extern struct tty_driver *usb_serial_tty_driver; -- cgit v1.2.3 From e753b2b50dc3c6582e9d5971555693db41a6d821 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Wed, 1 Feb 2017 19:01:18 +0200 Subject: net/mlx5: Add helper to initialize a flow steering actions struct instance There are a bunch of places in the code where the intermediate struct that keeps the elements related to flow actions is initialized with the same default values. Put that into a small DECLARE type helper. This patch doesn't change any functionality.
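As a before/after sketch of a call site (the rule-add line is illustrative, not taken from this patch), the helper collapses the repeated initializer into a single declaration:

	/* before: the same defaults open-coded at every call site */
	struct mlx5_flow_act flow_act = {
		.action   = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
		.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG,
		.encap_id = 0,
	};

	/* after: one declaration-style macro with the same defaults */
	MLX5_DECLARE_FLOW_ACT(flow_act);

	handle = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1);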
Signed-off-by: Or Gerlitz Reviewed-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c | 14 +++----------- drivers/net/ethernet/mellanox/mlx5/core/en_fs.c | 18 +++--------------- include/linux/mlx5/fs.h | 5 ++++- 3 files changed, 10 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c index 68419a01db36..c4e9cc79f5c7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c @@ -174,13 +174,9 @@ static int arfs_add_default_rule(struct mlx5e_priv *priv, enum arfs_type type) { struct arfs_table *arfs_t = &priv->fs.arfs.arfs_tables[type]; - struct mlx5_flow_act flow_act = { - .action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, - .flow_tag = MLX5_FS_DEFAULT_FLOW_TAG, - .encap_id = 0, - }; - struct mlx5_flow_destination dest; struct mlx5e_tir *tir = priv->indir_tir; + struct mlx5_flow_destination dest; + MLX5_DECLARE_FLOW_ACT(flow_act); struct mlx5_flow_spec *spec; int err = 0; @@ -469,15 +465,11 @@ static struct arfs_table *arfs_get_table(struct mlx5e_arfs_tables *arfs, static struct mlx5_flow_handle *arfs_add_rule(struct mlx5e_priv *priv, struct arfs_rule *arfs_rule) { - struct mlx5_flow_act flow_act = { - .action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, - .flow_tag = MLX5_FS_DEFAULT_FLOW_TAG, - .encap_id = 0, - }; struct mlx5e_arfs_tables *arfs = &priv->fs.arfs; struct arfs_tuple *tuple = &arfs_rule->tuple; struct mlx5_flow_handle *rule = NULL; struct mlx5_flow_destination dest; + MLX5_DECLARE_FLOW_ACT(flow_act); struct arfs_table *arfs_table; struct mlx5_flow_spec *spec; struct mlx5_flow_table *ft; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c index f2762e45c8ae..5376d69a6b1a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -159,14 +159,10 @@ static int __mlx5e_add_vlan_rule(struct mlx5e_priv *priv, enum mlx5e_vlan_rule_type rule_type, u16 vid, struct mlx5_flow_spec *spec) { - struct mlx5_flow_act flow_act = { - .action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, - .flow_tag = MLX5_FS_DEFAULT_FLOW_TAG, - .encap_id = 0, - }; struct mlx5_flow_table *ft = priv->fs.vlan.ft.t; struct mlx5_flow_destination dest; struct mlx5_flow_handle **rule_p; + MLX5_DECLARE_FLOW_ACT(flow_act); int err = 0; dest.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; @@ -659,11 +655,7 @@ mlx5e_generate_ttc_rule(struct mlx5e_priv *priv, u16 etype, u8 proto) { - struct mlx5_flow_act flow_act = { - .action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, - .flow_tag = MLX5_FS_DEFAULT_FLOW_TAG, - .encap_id = 0, - }; + MLX5_DECLARE_FLOW_ACT(flow_act); struct mlx5_flow_handle *rule; struct mlx5_flow_spec *spec; int err = 0; @@ -848,13 +840,9 @@ static void mlx5e_del_l2_flow_rule(struct mlx5e_priv *priv, static int mlx5e_add_l2_flow_rule(struct mlx5e_priv *priv, struct mlx5e_l2_rule *ai, int type) { - struct mlx5_flow_act flow_act = { - .action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, - .flow_tag = MLX5_FS_DEFAULT_FLOW_TAG, - .encap_id = 0, - }; struct mlx5_flow_table *ft = priv->fs.l2.ft.t; struct mlx5_flow_destination dest; + MLX5_DECLARE_FLOW_ACT(flow_act); struct mlx5_flow_spec *spec; int err = 0; u8 *mc_dmac; diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 949b24b6c479..5eea1ba2e593 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -136,6 +136,10 @@ struct 
mlx5_flow_act { u32 encap_id; }; +#define MLX5_DECLARE_FLOW_ACT(name) \ + struct mlx5_flow_act name = {MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,\ + MLX5_FS_DEFAULT_FLOW_TAG, 0} + /* Single destination per rule. * Group ID is implied by the match criteria. */ @@ -156,5 +160,4 @@ struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging); void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter); void mlx5_fc_query_cached(struct mlx5_fc *counter, u64 *bytes, u64 *packets, u64 *lastuse); - #endif -- cgit v1.2.3 From 2a69cb9ff7caac00f3bf7c865964228dd2a0c415 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 19 Jan 2017 19:31:25 +0200 Subject: net/mlx5: Introduce modify header structures, commands and steering action definitions Add the definitions related to creation/deletion of a modify header context and the modify header steering action which are used for HW packet header modify (re-write) as part of steering. Also add the modify header id into two intermediate structs and set it in the FTE. Note that as the push/pop vlan steering actions are emulated by the eswitch management code, we're not breaking any compatibility while changing their values to make room for the modify header action which is not emulated and whose value is part of the FW API. The new bit values for the emulated actions are at the end of the possible range. Signed-off-by: Or Gerlitz Reviewed-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 4 +- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 1 + include/linux/mlx5/fs.h | 3 +- include/linux/mlx5/mlx5_ifc.h | 113 +++++++++++++++++++++- 6 files changed, 118 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index ad329b1680b4..cd9240c3a7f0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -285,8 +285,8 @@ enum { SET_VLAN_INSERT = BIT(1) }; -#define MLX5_FLOW_CONTEXT_ACTION_VLAN_POP 0x40 -#define MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH 0x80 +#define MLX5_FLOW_CONTEXT_ACTION_VLAN_POP 0x4000 +#define MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH 0x8000 struct mlx5_encap_entry { struct hlist_node encap_hlist; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index b64a781c7e85..20d1fd516d03 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -249,6 +249,7 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, MLX5_SET(flow_context, in_flow_context, flow_tag, fte->flow_tag); MLX5_SET(flow_context, in_flow_context, action, fte->action); MLX5_SET(flow_context, in_flow_context, encap_id, fte->encap_id); + MLX5_SET(flow_context, in_flow_context, modify_header_id, fte->modify_id); in_match_value = MLX5_ADDR_OF(flow_context, in_flow_context, match_value); memcpy(in_match_value, &fte->val, MLX5_ST_SZ_BYTES(fte_match_param)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index ded27bb9a3b6..27ff815600f7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -476,6 +476,7 @@ static struct fs_fte *alloc_fte(struct mlx5_flow_act *flow_act, fte->index =
index; fte->action = flow_act->action; fte->encap_id = flow_act->encap_id; + fte->modify_id = flow_act->modify_id; return fte; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 8e668c63f69e..03af2e7989f3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -152,6 +152,7 @@ struct fs_fte { u32 index; u32 action; u32 encap_id; + u32 modify_id; enum fs_fte_status status; struct mlx5_fc *counter; }; diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 5eea1ba2e593..ae91a4bda1a3 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -134,11 +134,12 @@ struct mlx5_flow_act { u32 action; u32 flow_tag; u32 encap_id; + u32 modify_id; }; #define MLX5_DECLARE_FLOW_ACT(name) \ struct mlx5_flow_act name = {MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,\ - MLX5_FS_DEFAULT_FLOW_TAG, 0} + MLX5_FS_DEFAULT_FLOW_TAG, 0, 0} /* Single destination per rule. * Group ID is implied by the match criteria. diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 838242697541..56bc842b0620 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -227,6 +227,8 @@ enum { MLX5_CMD_OP_MODIFY_FLOW_TABLE = 0x93c, MLX5_CMD_OP_ALLOC_ENCAP_HEADER = 0x93d, MLX5_CMD_OP_DEALLOC_ENCAP_HEADER = 0x93e, + MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT = 0x940, + MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT = 0x941, MLX5_CMD_OP_MAX }; @@ -302,7 +304,8 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 reserved_at_20[0x2]; u8 log_max_ft_size[0x6]; - u8 reserved_at_28[0x10]; + u8 log_max_modify_header_context[0x8]; + u8 max_modify_header_actions[0x8]; u8 max_ft_level[0x8]; u8 reserved_at_40[0x20]; @@ -2190,6 +2193,7 @@ enum { MLX5_FLOW_CONTEXT_ACTION_COUNT = 0x8, MLX5_FLOW_CONTEXT_ACTION_ENCAP = 0x10, MLX5_FLOW_CONTEXT_ACTION_DECAP = 0x20, + MLX5_FLOW_CONTEXT_ACTION_MOD_HDR = 0x40, }; struct mlx5_ifc_flow_context_bits { @@ -2211,7 +2215,9 @@ struct mlx5_ifc_flow_context_bits { u8 encap_id[0x20]; - u8 reserved_at_e0[0x120]; + u8 modify_header_id[0x20]; + + u8 reserved_at_100[0x100]; struct mlx5_ifc_fte_match_param_bits match_value; @@ -4534,6 +4540,109 @@ struct mlx5_ifc_dealloc_encap_header_in_bits { u8 reserved_60[0x20]; }; +struct mlx5_ifc_set_action_in_bits { + u8 action_type[0x4]; + u8 field[0xc]; + u8 reserved_at_10[0x3]; + u8 offset[0x5]; + u8 reserved_at_18[0x3]; + u8 length[0x5]; + + u8 data[0x20]; +}; + +struct mlx5_ifc_add_action_in_bits { + u8 action_type[0x4]; + u8 field[0xc]; + u8 reserved_at_10[0x10]; + + u8 data[0x20]; +}; + +union mlx5_ifc_set_action_in_add_action_in_auto_bits { + struct mlx5_ifc_set_action_in_bits set_action_in; + struct mlx5_ifc_add_action_in_bits add_action_in; + u8 reserved_at_0[0x40]; +}; + +enum { + MLX5_ACTION_TYPE_SET = 0x1, + MLX5_ACTION_TYPE_ADD = 0x2, +}; + +enum { + MLX5_ACTION_IN_FIELD_OUT_SMAC_47_16 = 0x1, + MLX5_ACTION_IN_FIELD_OUT_SMAC_15_0 = 0x2, + MLX5_ACTION_IN_FIELD_OUT_ETHERTYPE = 0x3, + MLX5_ACTION_IN_FIELD_OUT_DMAC_47_16 = 0x4, + MLX5_ACTION_IN_FIELD_OUT_DMAC_15_0 = 0x5, + MLX5_ACTION_IN_FIELD_OUT_IP_DSCP = 0x6, + MLX5_ACTION_IN_FIELD_OUT_TCP_FLAGS = 0x7, + MLX5_ACTION_IN_FIELD_OUT_TCP_SPORT = 0x8, + MLX5_ACTION_IN_FIELD_OUT_TCP_DPORT = 0x9, + MLX5_ACTION_IN_FIELD_OUT_IP_TTL = 0xa, + MLX5_ACTION_IN_FIELD_OUT_UDP_SPORT = 0xb, + MLX5_ACTION_IN_FIELD_OUT_UDP_DPORT = 0xc, + MLX5_ACTION_IN_FIELD_OUT_SIPV6_127_96 = 0xd, + MLX5_ACTION_IN_FIELD_OUT_SIPV6_95_64 = 0xe, + 
MLX5_ACTION_IN_FIELD_OUT_SIPV6_63_32 = 0xf, + MLX5_ACTION_IN_FIELD_OUT_SIPV6_31_0 = 0x10, + MLX5_ACTION_IN_FIELD_OUT_DIPV6_127_96 = 0x11, + MLX5_ACTION_IN_FIELD_OUT_DIPV6_95_64 = 0x12, + MLX5_ACTION_IN_FIELD_OUT_DIPV6_63_32 = 0x13, + MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0 = 0x14, + MLX5_ACTION_IN_FIELD_OUT_SIPV4 = 0x15, + MLX5_ACTION_IN_FIELD_OUT_DIPV4 = 0x16, +}; + +struct mlx5_ifc_alloc_modify_header_context_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 modify_header_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_alloc_modify_header_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x20]; + + u8 table_type[0x8]; + u8 reserved_at_68[0x10]; + u8 num_of_actions[0x8]; + + union mlx5_ifc_set_action_in_add_action_in_auto_bits actions[0]; +}; + +struct mlx5_ifc_dealloc_modify_header_context_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_dealloc_modify_header_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 modify_header_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + struct mlx5_ifc_query_dct_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; -- cgit v1.2.3 From 9e234eeafbe17e85908584392f249f0b329b8e1b Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 27 Mar 2017 10:51:41 -0700 Subject: blk-throttle: add a simple idle detection A cgroup gets assigned a low limit, but the cgroup could never dispatch enough IO to cross the low limit. In such a case, the queue state machine will remain in the LIMIT_LOW state and all other cgroups will be throttled according to the low limit. This is unfair to the other cgroups. We should treat the cgroup as idle and upgrade the state machine out of the LIMIT_LOW state. We also have a downgrade logic. If the state machine upgrades because of cgroup idle (real idle), the state machine will soon downgrade, because the cgroup is below its low limit. This isn't what we want. A more complicated case is a cgroup that isn't idle while the queue is in LIMIT_LOW. But when the queue gets upgraded out of LIMIT_LOW, other cgroups could dispatch more IO while this cgroup can't dispatch enough IO, so the cgroup falls below its low limit and looks idle (fake idle). In this case, the queue should downgrade soon. The key to deciding whether we should downgrade is detecting whether the cgroup is truly idle. Unfortunately it's very hard to determine if a cgroup is really idle. This patch uses the 'think time check' idea from CFQ for the purpose. Please note that the idea doesn't work for all workloads. For example, a workload with io depth 8 has disk utilization 100%, hence its think time is 0, ie, not idle. But the workload could achieve higher bandwidth with io depth 16. Compared to io depth 16, the io depth 8 workload is idle. We use the idea to roughly determine if a cgroup is idle. We treat a cgroup as idle if its think time is above a threshold (by default 1ms for SSD and 100ms for HD). The idea is that think time above the threshold will start to harm performance. HD is much slower, so a longer think time is ok. This patch (and the later patches) uses 'unsigned long' to track time. We convert 'ns' to 'us' with 'ns >> 10'. This is fast but loses precision, which should not be a big deal.
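In code terms, the think-time check boils down to an exponentially weighted moving average of the gaps between request completions, compared against the per-device threshold. A simplified standalone sketch of the arithmetic this patch uses (all times already converted to the 'ns >> 10' scale; helper names are illustrative):

	/* 7/8 old + 1/8 new: the same EWMA the patch keeps per cgroup */
	static unsigned long update_avg_idletime(unsigned long avg,
						 unsigned long now,
						 unsigned long last_finish)
	{
		return (avg * 7 + (now - last_finish)) >> 3;
	}

	/* idle if a single gap is very long, or the average gap exceeds
	 * the threshold (1ms for SSD, 100ms for HD by default)
	 */
	static bool tg_is_idle(unsigned long avg, unsigned long single_gap,
			       unsigned long threshold, unsigned long max_gap)
	{
		return single_gap > max_gap || avg > threshold;
	}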
Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/bio.c | 2 ++ block/blk-throttle.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++- block/blk.h | 3 ++ include/linux/blk_types.h | 3 ++ 4 files changed, 89 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 6194a8cf2aab..f1857c0f0826 100644 --- a/block/bio.c +++ b/block/bio.c @@ -30,6 +30,7 @@ #include #include +#include "blk.h" /* * Test patch to inline a certain number of bi_io_vec's inside the bio @@ -1845,6 +1846,7 @@ again: goto again; } + blk_throtl_bio_endio(bio); if (bio->bi_end_io) bio->bi_end_io(bio); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 62984fc92015..6300f3ed70d2 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -22,6 +22,9 @@ static int throtl_quantum = 32; #define DFL_THROTL_SLICE_HD (HZ / 10) #define DFL_THROTL_SLICE_SSD (HZ / 50) #define MAX_THROTL_SLICE (HZ) +#define DFL_IDLE_THRESHOLD_SSD (1000L) /* 1 ms */ +#define DFL_IDLE_THRESHOLD_HD (100L * 1000) /* 100 ms */ +#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */ static struct blkcg_policy blkcg_policy_throtl; @@ -154,6 +157,11 @@ struct throtl_grp { /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; + + unsigned long last_finish_time; /* ns / 1024 */ + unsigned long checked_last_finish_time; /* ns / 1024 */ + unsigned long avg_idletime; /* ns / 1024 */ + unsigned long idletime_threshold; /* us */ }; struct throtl_data @@ -468,6 +476,11 @@ static void throtl_pd_init(struct blkg_policy_data *pd) if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; + + if (blk_queue_nonrot(td->queue)) + tg->idletime_threshold = DFL_IDLE_THRESHOLD_SSD; + else + tg->idletime_threshold = DFL_IDLE_THRESHOLD_HD; } /* @@ -1644,6 +1657,21 @@ static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg) return ret; } +static bool throtl_tg_is_idle(struct throtl_grp *tg) +{ + /* + * cgroup is idle if: + * - single idle is too long, longer than a fixed value (in case user + * configure a too big threshold) or 4 times of slice + * - average think time is more than threshold + */ + unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice); + + time = min_t(unsigned long, MAX_IDLE_TIME, time); + return (ktime_get_ns() >> 10) - tg->last_finish_time > time || + tg->avg_idletime > tg->idletime_threshold; +} + static bool throtl_tg_can_upgrade(struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; @@ -1843,6 +1871,19 @@ static void throtl_downgrade_check(struct throtl_grp *tg) tg->last_io_disp[WRITE] = 0; } +static void blk_throtl_update_idletime(struct throtl_grp *tg) +{ + unsigned long now = ktime_get_ns() >> 10; + unsigned long last_finish_time = tg->last_finish_time; + + if (now <= last_finish_time || last_finish_time == 0 || + last_finish_time == tg->checked_last_finish_time) + return; + + tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3; + tg->checked_last_finish_time = last_finish_time; +} + bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, struct bio *bio) { @@ -1851,6 +1892,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); bool throttled = false; + int ret; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -1863,6 +1905,13 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, if 
(unlikely(blk_queue_bypass(q))) goto out_unlock; + ret = bio_associate_current(bio); +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + if (ret == 0 || ret == -EBUSY) + bio->bi_cg_private = tg; +#endif + blk_throtl_update_idletime(tg); + sq = &tg->service_queue; again: @@ -1923,7 +1972,6 @@ again: tg->last_low_overflow_time[rw] = jiffies; - bio_associate_current(bio); tg->td->nr_queued[rw]++; throtl_add_bio_tg(bio, qn, tg); throttled = true; @@ -1952,6 +2000,20 @@ out: return throttled; } +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW +void blk_throtl_bio_endio(struct bio *bio) +{ + struct throtl_grp *tg; + + tg = bio->bi_cg_private; + if (!tg) + return; + bio->bi_cg_private = NULL; + + tg->last_finish_time = ktime_get_ns() >> 10; +} +#endif + /* * Dispatch all bios from all children tg's queued on @parent_sq. On * return, @parent_sq is guaranteed to not have any active children tg's @@ -2035,6 +2097,7 @@ int blk_throtl_init(struct request_queue *q) td->limit_index = LIMIT_MAX; td->low_upgrade_time = jiffies; td->low_downgrade_time = jiffies; + /* activate policy */ ret = blkcg_activate_policy(q, &blkcg_policy_throtl); if (ret) @@ -2053,6 +2116,8 @@ void blk_throtl_exit(struct request_queue *q) void blk_throtl_register_queue(struct request_queue *q) { struct throtl_data *td; + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; td = q->td; BUG_ON(!td); @@ -2065,6 +2130,21 @@ void blk_throtl_register_queue(struct request_queue *q) /* if no low limit, use previous default */ td->throtl_slice = DFL_THROTL_SLICE_HD; #endif + + /* + * some tg are created before queue is fully initialized, eg, nonrot + * isn't initialized yet + */ + rcu_read_lock(); + blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { + struct throtl_grp *tg = blkg_to_tg(blkg); + + if (blk_queue_nonrot(q)) + tg->idletime_threshold = DFL_IDLE_THRESHOLD_SSD; + else + tg->idletime_threshold = DFL_IDLE_THRESHOLD_HD; + } + rcu_read_unlock(); } #ifdef CONFIG_BLK_DEV_THROTTLING_LOW diff --git a/block/blk.h b/block/blk.h index 13070c325858..3ac833ec2adb 100644 --- a/block/blk.h +++ b/block/blk.h @@ -330,6 +330,9 @@ static inline void blk_throtl_register_queue(struct request_queue *q) { } extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); extern ssize_t blk_throtl_sample_time_store(struct request_queue *q, const char *page, size_t count); +extern void blk_throtl_bio_endio(struct bio *bio); +#else +static inline void blk_throtl_bio_endio(struct bio *bio) { } #endif #endif /* BLK_INTERNAL_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 270119a501fb..07a9e9607909 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -58,6 +58,9 @@ struct bio { */ struct io_context *bi_ioc; struct cgroup_subsys_state *bi_css; +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + void *bi_cg_private; +#endif #endif union { #if defined(CONFIG_BLK_DEV_INTEGRITY) -- cgit v1.2.3 From 88eeca495ba7de749ff253376ec6be19bb05368d Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 27 Mar 2017 15:19:41 -0700 Subject: block: track request size in blk_issue_stat Currently there is no way to know the request size when the request is finished. The next patch will need this info. We could add an extra field to record the size, but blk_issue_stat has enough space to record it, so this patch just overloads blk_issue_stat. With this, we will have 49 bits to track time, which is still a very long time.
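The resulting 64-bit layout, per the masks below, is 3 reserved bits (kept for the wbt flags), then 12 bits of capped request size in sectors, then 49 bits of issue time in nanoseconds. A sketch of the packing (the helper name and the open-coded widths here are illustrative; the patch derives them from the BLK_STAT_*_SHIFT macros):

	/* [63:61] reserved | [60:49] size (sectors, capped) | [48:0] time (ns) */
	static u64 pack_issue_stat(u64 old, u64 time_ns, u64 size_sectors)
	{
		u64 res  = old & (0x7ULL << 61);	/* preserve wbt flag bits */
		u64 size = (size_sectors & 0xfffULL) << 49;
		u64 time = time_ns & ((1ULL << 49) - 1);

		return res | size | time;
	}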
Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-mq.c | 2 +- block/blk-stat.h | 41 ++++++++++++++++++++++++++++++----------- block/blk-wbt.h | 10 +++++----- include/linux/blk_types.h | 2 +- 5 files changed, 38 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index ad388d5e309a..4234332aa23c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2483,7 +2483,7 @@ void blk_start_request(struct request *req) blk_dequeue_request(req); if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { - blk_stat_set_issue_time(&req->issue_stat); + blk_stat_set_issue(&req->issue_stat, blk_rq_sectors(req)); req->rq_flags |= RQF_STATS; wbt_issue(req->q->rq_wb, &req->issue_stat); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 45b9bebf8436..caef6ee08b04 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -488,7 +488,7 @@ void blk_mq_start_request(struct request *rq) trace_block_rq_issue(q, rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { - blk_stat_set_issue_time(&rq->issue_stat); + blk_stat_set_issue(&rq->issue_stat, blk_rq_sectors(rq)); rq->rq_flags |= RQF_STATS; wbt_issue(q->rq_wb, &rq->issue_stat); } diff --git a/block/blk-stat.h b/block/blk-stat.h index 6ad5b8c59a79..ee47f816d5bd 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -8,12 +8,19 @@ #include /* - * Upper 3 bits can be used elsewhere + * from upper: + * 3 bits: reserved for other usage + * 12 bits: size + * 49 bits: time */ #define BLK_STAT_RES_BITS 3 -#define BLK_STAT_SHIFT (64 - BLK_STAT_RES_BITS) -#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SHIFT) - 1) -#define BLK_STAT_MASK ~BLK_STAT_TIME_MASK +#define BLK_STAT_SIZE_BITS 12 +#define BLK_STAT_RES_SHIFT (64 - BLK_STAT_RES_BITS) +#define BLK_STAT_SIZE_SHIFT (BLK_STAT_RES_SHIFT - BLK_STAT_SIZE_BITS) +#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SIZE_SHIFT) - 1) +#define BLK_STAT_SIZE_MASK \ + (((1ULL << BLK_STAT_SIZE_BITS) - 1) << BLK_STAT_SIZE_SHIFT) +#define BLK_STAT_RES_MASK (~((1ULL << BLK_STAT_RES_SHIFT) - 1)) /** * struct blk_stat_callback - Block statistics callback. 
@@ -73,12 +80,6 @@ void blk_free_queue_stats(struct blk_queue_stats *); void blk_stat_add(struct request *); -static inline void blk_stat_set_issue_time(struct blk_issue_stat *stat) -{ - stat->time = ((stat->time & BLK_STAT_MASK) | - (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK)); -} - static inline u64 __blk_stat_time(u64 time) { return time & BLK_STAT_TIME_MASK; @@ -86,7 +87,25 @@ static inline u64 __blk_stat_time(u64 time) static inline u64 blk_stat_time(struct blk_issue_stat *stat) { - return __blk_stat_time(stat->time); + return __blk_stat_time(stat->stat); +} + +static inline sector_t blk_capped_size(sector_t size) +{ + return size & ((1ULL << BLK_STAT_SIZE_BITS) - 1); +} + +static inline sector_t blk_stat_size(struct blk_issue_stat *stat) +{ + return (stat->stat & BLK_STAT_SIZE_MASK) >> BLK_STAT_SIZE_SHIFT; +} + +static inline void blk_stat_set_issue(struct blk_issue_stat *stat, + sector_t size) +{ + stat->stat = (stat->stat & BLK_STAT_RES_MASK) | + (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK) | + (((u64)blk_capped_size(size)) << BLK_STAT_SIZE_SHIFT); } /* diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 591ff2f4b2ee..ad6c78507c3a 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -32,27 +32,27 @@ enum { static inline void wbt_clear_state(struct blk_issue_stat *stat) { - stat->time &= BLK_STAT_TIME_MASK; + stat->stat &= ~BLK_STAT_RES_MASK; } static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat) { - return (stat->time & BLK_STAT_MASK) >> BLK_STAT_SHIFT; + return (stat->stat & BLK_STAT_RES_MASK) >> BLK_STAT_RES_SHIFT; } static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct) { - stat->time |= ((u64) wb_acct) << BLK_STAT_SHIFT; + stat->stat |= ((u64) wb_acct) << BLK_STAT_RES_SHIFT; } static inline bool wbt_is_tracked(struct blk_issue_stat *stat) { - return (stat->time >> BLK_STAT_SHIFT) & WBT_TRACKED; + return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_TRACKED; } static inline bool wbt_is_read(struct blk_issue_stat *stat) { - return (stat->time >> BLK_STAT_SHIFT) & WBT_READ; + return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_READ; } struct rq_wait { diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 07a9e9607909..3ad567347671 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -287,7 +287,7 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie) } struct blk_issue_stat { - u64 time; + u64 stat; }; struct blk_rq_stat { s64 mean; u64 min; -- cgit v1.2.3 From b9147dd1bae2b15d6931ecd42f8606c775fecbc9 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 27 Mar 2017 15:19:42 -0700 Subject: blk-throttle: add a mechanism to estimate IO latency The user configures a latency target, but the latency threshold for each request size isn't fixed. For an SSD, the IO latency highly depends on the request size. To calculate the latency threshold, we sample some data, eg, the average latency for request sizes 4k, 8k, 16k, 32k .. 1M. The latency threshold of each request size will be the sampled latency (I'll call it the base latency) plus the latency target. For example, if the base latency for request size 4k is 80us and the user configures a latency target of 60us, the 4k latency threshold will be 80 + 60 = 140us. To sample data, we calculate the order base 2 of the rounded-up IO sectors. If the IO size is bigger than 1M, it will be accounted as 1M. Since the calculation rounds up, the base latency will be slightly smaller than the actual value.
Also, if there isn't any IO dispatched for a specific IO size, we will use the base latency of a smaller IO size for this IO size. But we shouldn't sample data at just any time. The base latency is supposed to be the latency when the disk isn't congested, because we use the latency threshold to schedule IOs between cgroups. If the disk is congested, the latency is higher, and using it for scheduling is meaningless. Hence we only do the sampling when block throttling is at the LOW limit, on the assumption that the disk isn't congested in that state. If the assumption isn't true, eg, the low limit is too high, the calculated latency threshold will be higher. Hard disks are completely different: latency depends on spindle seeks instead of request size. Currently this feature is SSD-only; we could probably use a fixed threshold like 4ms for hard disks though. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-stat.c | 15 ++++- block/blk-stat.h | 3 + block/blk-throttle.c | 166 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- block/blk.h | 2 + include/linux/blk_types.h | 9 +-- 5 files changed, 185 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/block/blk-stat.c b/block/blk-stat.c index 188b535cf4d6..e77ec52f5bb5 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -9,12 +9,14 @@ #include "blk-stat.h" #include "blk-mq.h" +#include "blk.h" #define BLK_RQ_STAT_BATCH 64 struct blk_queue_stats { struct list_head callbacks; spinlock_t lock; + bool enable_accounting; }; unsigned int blk_stat_rq_ddir(const struct request *rq) @@ -96,6 +98,8 @@ void blk_stat_add(struct request *rq) value = now - blk_stat_time(&rq->issue_stat); + blk_throtl_stat_add(rq, value); + rcu_read_lock(); list_for_each_entry_rcu(cb, &q->stats->callbacks, list) { if (blk_stat_is_active(cb)) { @@ -190,7 +194,7 @@ void blk_stat_remove_callback(struct request_queue *q, { spin_lock(&q->stats->lock); list_del_rcu(&cb->list); - if (list_empty(&q->stats->callbacks)) + if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting) clear_bit(QUEUE_FLAG_STATS, &q->queue_flags); spin_unlock(&q->stats->lock); @@ -215,6 +219,14 @@ void blk_stat_free_callback(struct blk_stat_callback *cb) } EXPORT_SYMBOL_GPL(blk_stat_free_callback); +void blk_stat_enable_accounting(struct request_queue *q) +{ + spin_lock(&q->stats->lock); + q->stats->enable_accounting = true; + set_bit(QUEUE_FLAG_STATS, &q->queue_flags); + spin_unlock(&q->stats->lock); +} + struct blk_queue_stats *blk_alloc_queue_stats(void) { struct blk_queue_stats *stats; @@ -225,6 +237,7 @@ struct blk_queue_stats *blk_alloc_queue_stats(void) INIT_LIST_HEAD(&stats->callbacks); spin_lock_init(&stats->lock); + stats->enable_accounting = false; return stats; } diff --git a/block/blk-stat.h b/block/blk-stat.h index ee47f816d5bd..53f08a63bf15 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -108,6 +108,9 @@ static inline void blk_stat_set_issue(struct blk_issue_stat *stat, (((u64)blk_capped_size(size)) << BLK_STAT_SIZE_SHIFT); } +/* record time/size info in request but not add a callback */ +void blk_stat_enable_accounting(struct request_queue *q); + /* * blk_stat_rq_ddir() - Bucket callback function for the request data direction. * @rq: Request.
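To make the bucketing in the blk-throttle changes below concrete: request_bucket_index() works on the IO size in 512-byte sectors, so a 4k IO (8 sectors) lands in bucket 0 (order_base_2(8) - 3 = 0), 8k in bucket 1, and so on up to 1M (2048 sectors) in bucket 8; larger IOs clamp to bucket 8, and a non-power-of-two size such as 12k rounds up and shares the 16k bucket.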
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 6e1c29860eec..140da29f5800 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -28,6 +28,8 @@ static int throtl_quantum = 32; /* default latency target is 0, eg, guarantee IO latency by default */ #define DFL_LATENCY_TARGET (0) +#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT) + static struct blkcg_policy blkcg_policy_throtl; /* A workqueue to queue throttle related work */ @@ -165,6 +167,19 @@ struct throtl_grp { unsigned long idletime_threshold; /* us */ }; +/* We measure latency for request size from <= 4k to >= 1M */ +#define LATENCY_BUCKET_SIZE 9 + +struct latency_bucket { + unsigned long total_latency; /* ns / 1024 */ + int samples; +}; + +struct avg_latency_bucket { + unsigned long latency; /* ns / 1024 */ + bool valid; +}; + struct throtl_data { /* service tree for active throtl groups */ @@ -188,6 +203,13 @@ struct throtl_data unsigned long low_downgrade_time; unsigned int scale; + + struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE]; + struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE]; + struct latency_bucket __percpu *latency_buckets; + unsigned long last_calculate_time; + + bool track_bio_latency; }; static void throtl_pending_timer_fn(unsigned long arg); @@ -306,6 +328,9 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) return ret; } +#define request_bucket_index(sectors) \ + clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1) + /** * throtl_log - log debug message via blktrace * @sq: the service_queue being reported @@ -1931,6 +1956,73 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg) tg->checked_last_finish_time = last_finish_time; } +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW +static void throtl_update_latency_buckets(struct throtl_data *td) +{ + struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE]; + int i, cpu; + unsigned long last_latency = 0; + unsigned long latency; + + if (!blk_queue_nonrot(td->queue)) + return; + if (time_before(jiffies, td->last_calculate_time + HZ)) + return; + td->last_calculate_time = jiffies; + + memset(avg_latency, 0, sizeof(avg_latency)); + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + struct latency_bucket *tmp = &td->tmp_buckets[i]; + + for_each_possible_cpu(cpu) { + struct latency_bucket *bucket; + + /* this isn't race free, but ok in practice */ + bucket = per_cpu_ptr(td->latency_buckets, cpu); + tmp->total_latency += bucket[i].total_latency; + tmp->samples += bucket[i].samples; + bucket[i].total_latency = 0; + bucket[i].samples = 0; + } + + if (tmp->samples >= 32) { + int samples = tmp->samples; + + latency = tmp->total_latency; + + tmp->total_latency = 0; + tmp->samples = 0; + latency /= samples; + if (latency == 0) + continue; + avg_latency[i].latency = latency; + } + } + + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + if (!avg_latency[i].latency) { + if (td->avg_buckets[i].latency < last_latency) + td->avg_buckets[i].latency = last_latency; + continue; + } + + if (!td->avg_buckets[i].valid) + latency = avg_latency[i].latency; + else + latency = (td->avg_buckets[i].latency * 7 + + avg_latency[i].latency) >> 3; + + td->avg_buckets[i].latency = max(latency, last_latency); + td->avg_buckets[i].valid = true; + last_latency = td->avg_buckets[i].latency; + } +} +#else +static inline void throtl_update_latency_buckets(struct throtl_data *td) +{ +} +#endif + bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, struct bio *bio) { @@ -1939,6 +2031,7 @@ bool blk_throtl_bio(struct request_queue 
*q, struct blkcg_gq *blkg, struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); bool throttled = false; + struct throtl_data *td = tg->td; int ret; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -1949,6 +2042,8 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, spin_lock_irq(q->queue_lock); + throtl_update_latency_buckets(td); + if (unlikely(blk_queue_bypass(q))) goto out_unlock; @@ -1956,6 +2051,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, #ifdef CONFIG_BLK_DEV_THROTTLING_LOW if (ret == 0 || ret == -EBUSY) bio->bi_cg_private = tg; + blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio)); #endif blk_throtl_update_idletime(tg); @@ -1974,8 +2070,8 @@ again: /* if above limits, break to queue */ if (!tg_may_dispatch(tg, bio, NULL)) { tg->last_low_overflow_time[rw] = jiffies; - if (throtl_can_upgrade(tg->td, tg)) { - throtl_upgrade_state(tg->td); + if (throtl_can_upgrade(td, tg)) { + throtl_upgrade_state(td); goto again; } break; @@ -2019,7 +2115,7 @@ again: tg->last_low_overflow_time[rw] = jiffies; - tg->td->nr_queued[rw]++; + td->nr_queued[rw]++; throtl_add_bio_tg(bio, qn, tg); throttled = true; @@ -2044,20 +2140,67 @@ out: */ if (!throttled) bio_clear_flag(bio, BIO_THROTTLED); + +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + if (throttled || !td->track_bio_latency) + bio->bi_issue_stat.stat |= SKIP_LATENCY; +#endif return throttled; } #ifdef CONFIG_BLK_DEV_THROTTLING_LOW +static void throtl_track_latency(struct throtl_data *td, sector_t size, + int op, unsigned long time) +{ + struct latency_bucket *latency; + int index; + + if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ || + !blk_queue_nonrot(td->queue)) + return; + + index = request_bucket_index(size); + + latency = get_cpu_ptr(td->latency_buckets); + latency[index].total_latency += time; + latency[index].samples++; + put_cpu_ptr(td->latency_buckets); +} + +void blk_throtl_stat_add(struct request *rq, u64 time_ns) +{ + struct request_queue *q = rq->q; + struct throtl_data *td = q->td; + + throtl_track_latency(td, blk_stat_size(&rq->issue_stat), + req_op(rq), time_ns >> 10); +} + void blk_throtl_bio_endio(struct bio *bio) { struct throtl_grp *tg; + u64 finish_time_ns; + unsigned long finish_time; + unsigned long start_time; + unsigned long lat; tg = bio->bi_cg_private; if (!tg) return; bio->bi_cg_private = NULL; - tg->last_finish_time = ktime_get_ns() >> 10; + finish_time_ns = ktime_get_ns(); + tg->last_finish_time = finish_time_ns >> 10; + + start_time = blk_stat_time(&bio->bi_issue_stat) >> 10; + finish_time = __blk_stat_time(finish_time_ns) >> 10; + /* this is only for bio based driver */ + if (start_time && finish_time > start_time && + !(bio->bi_issue_stat.stat & SKIP_LATENCY)) { + lat = finish_time - start_time; + throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat), + bio_op(bio), lat); + } } #endif @@ -2133,6 +2276,12 @@ int blk_throtl_init(struct request_queue *q) td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); if (!td) return -ENOMEM; + td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) * + LATENCY_BUCKET_SIZE, __alignof__(u64)); + if (!td->latency_buckets) { + kfree(td); + return -ENOMEM; + } INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); throtl_service_queue_init(&td->service_queue); @@ -2147,8 +2296,10 @@ int blk_throtl_init(struct request_queue *q) /* activate policy */ ret = blkcg_activate_policy(q, &blkcg_policy_throtl); - if (ret) + if (ret) { + free_percpu(td->latency_buckets); kfree(td); + } return ret; } @@ -2157,6 
+2308,7 @@ void blk_throtl_exit(struct request_queue *q) BUG_ON(!q->td); throtl_shutdown_wq(q); blkcg_deactivate_policy(q, &blkcg_policy_throtl); + free_percpu(q->td->latency_buckets); kfree(q->td); } @@ -2181,6 +2333,10 @@ void blk_throtl_register_queue(struct request_queue *q) td->throtl_slice = DFL_THROTL_SLICE_HD; #endif + td->track_bio_latency = !q->mq_ops && !q->request_fn; + if (!td->track_bio_latency) + blk_stat_enable_accounting(q); + /* * some tg are created before queue is fully initialized, eg, nonrot * isn't initialized yet diff --git a/block/blk.h b/block/blk.h index 3ac833ec2adb..07d375183f31 100644 --- a/block/blk.h +++ b/block/blk.h @@ -331,8 +331,10 @@ extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); extern ssize_t blk_throtl_sample_time_store(struct request_queue *q, const char *page, size_t count); extern void blk_throtl_bio_endio(struct bio *bio); +extern void blk_throtl_stat_add(struct request *rq, u64 time); #else static inline void blk_throtl_bio_endio(struct bio *bio) { } +static inline void blk_throtl_stat_add(struct request *rq, u64 time) { } #endif #endif /* BLK_INTERNAL_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3ad567347671..67bcf8a5326e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -17,6 +17,10 @@ struct io_context; struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); +struct blk_issue_stat { + u64 stat; +}; + /* * main unit of I/O for the block layer and lower layers (ie drivers and * stacking drivers) @@ -60,6 +64,7 @@ struct bio { struct cgroup_subsys_state *bi_css; #ifdef CONFIG_BLK_DEV_THROTTLING_LOW void *bi_cg_private; + struct blk_issue_stat bi_issue_stat; #endif #endif union { @@ -286,10 +291,6 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie) return (cookie & BLK_QC_T_INTERNAL) != 0; } -struct blk_issue_stat { - u64 stat; -}; - struct blk_rq_stat { s64 mean; u64 min; -- cgit v1.2.3 From e8bb4673596ea28fab287dbc417e8100d798cd40 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Mon, 27 Mar 2017 07:31:03 +0200 Subject: dmaengine: pl330: remove pdata based initialization This driver is now used only on platforms which support device tree, so it is safe to remove legacy platform data based initialization code. Signed-off-by: Marek Szyprowski Reviewed-by: Ulf Hansson Acked-by: Arnd Bergmann For plat-samsung: Acked-by: Krzysztof Kozlowski Signed-off-by: Vinod Koul --- arch/arm/plat-samsung/devs.c | 1 - drivers/dma/pl330.c | 42 ++++++++---------------------------------- include/linux/amba/pl330.h | 35 ----------------------------------- 3 files changed, 8 insertions(+), 70 deletions(-) delete mode 100644 include/linux/amba/pl330.h (limited to 'include/linux') diff --git a/arch/arm/plat-samsung/devs.c b/arch/arm/plat-samsung/devs.c index 03fac123676d..dc269d9143bc 100644 --- a/arch/arm/plat-samsung/devs.c +++ b/arch/arm/plat-samsung/devs.c @@ -10,7 +10,6 @@ * published by the Free Software Foundation. 
*/ -#include #include #include #include diff --git a/drivers/dma/pl330.c b/drivers/dma/pl330.c index f37f4978dabb..8b0da7fa520d 100644 --- a/drivers/dma/pl330.c +++ b/drivers/dma/pl330.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -2077,18 +2076,6 @@ static void pl330_tasklet(unsigned long data) } } -bool pl330_filter(struct dma_chan *chan, void *param) -{ - u8 *peri_id; - - if (chan->device->dev->driver != &pl330_driver.drv) - return false; - - peri_id = chan->private; - return *peri_id == (unsigned long)param; -} -EXPORT_SYMBOL(pl330_filter); - static struct dma_chan *of_dma_pl330_xlate(struct of_phandle_args *dma_spec, struct of_dma *ofdma) { @@ -2833,7 +2820,6 @@ static SIMPLE_DEV_PM_OPS(pl330_pm, pl330_suspend, pl330_resume); static int pl330_probe(struct amba_device *adev, const struct amba_id *id) { - struct dma_pl330_platdata *pdat; struct pl330_config *pcfg; struct pl330_dmac *pl330; struct dma_pl330_chan *pch, *_p; @@ -2843,8 +2829,6 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id) int num_chan; struct device_node *np = adev->dev.of_node; - pdat = dev_get_platdata(&adev->dev); - ret = dma_set_mask_and_coherent(&adev->dev, DMA_BIT_MASK(32)); if (ret) return ret; @@ -2857,7 +2841,7 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id) pd = &pl330->ddma; pd->dev = &adev->dev; - pl330->mcbufsz = pdat ? pdat->mcbuf_sz : 0; + pl330->mcbufsz = 0; /* get quirk */ for (i = 0; i < ARRAY_SIZE(of_quirks); i++) @@ -2901,10 +2885,7 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id) INIT_LIST_HEAD(&pd->channels); /* Initialize channel parameters */ - if (pdat) - num_chan = max_t(int, pdat->nr_valid_peri, pcfg->num_chan); - else - num_chan = max_t(int, pcfg->num_peri, pcfg->num_chan); + num_chan = max_t(int, pcfg->num_peri, pcfg->num_chan); pl330->num_peripherals = num_chan; @@ -2916,11 +2897,8 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id) for (i = 0; i < num_chan; i++) { pch = &pl330->peripherals[i]; - if (!adev->dev.of_node) - pch->chan.private = pdat ? &pdat->peri_id[i] : NULL; - else - pch->chan.private = adev->dev.of_node; + pch->chan.private = adev->dev.of_node; INIT_LIST_HEAD(&pch->submitted_list); INIT_LIST_HEAD(&pch->work_list); INIT_LIST_HEAD(&pch->completed_list); @@ -2933,15 +2911,11 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id) list_add_tail(&pch->chan.device_node, &pd->channels); } - if (pdat) { - pd->cap_mask = pdat->cap_mask; - } else { - dma_cap_set(DMA_MEMCPY, pd->cap_mask); - if (pcfg->num_peri) { - dma_cap_set(DMA_SLAVE, pd->cap_mask); - dma_cap_set(DMA_CYCLIC, pd->cap_mask); - dma_cap_set(DMA_PRIVATE, pd->cap_mask); - } + dma_cap_set(DMA_MEMCPY, pd->cap_mask); + if (pcfg->num_peri) { + dma_cap_set(DMA_SLAVE, pd->cap_mask); + dma_cap_set(DMA_CYCLIC, pd->cap_mask); + dma_cap_set(DMA_PRIVATE, pd->cap_mask); } pd->device_alloc_chan_resources = pl330_alloc_chan_resources; diff --git a/include/linux/amba/pl330.h b/include/linux/amba/pl330.h deleted file mode 100644 index fe93758e8403..000000000000 --- a/include/linux/amba/pl330.h +++ /dev/null @@ -1,35 +0,0 @@ -/* linux/include/linux/amba/pl330.h - * - * Copyright (C) 2010 Samsung Electronics Co. Ltd. - * Jaswinder Singh - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ - -#ifndef __AMBA_PL330_H_ -#define __AMBA_PL330_H_ - -#include - -struct dma_pl330_platdata { - /* - * Number of valid peripherals connected to DMAC. - * This may be different from the value read from - * CR0, as the PL330 implementation might have 'holes' - * in the peri list or the peri could also be reached - * from another DMAC which the platform prefers. - */ - u8 nr_valid_peri; - /* Array of valid peripherals */ - u8 *peri_id; - /* Operational capabilities */ - dma_cap_mask_t cap_mask; - /* Bytes to allocate for MC buffer */ - unsigned mcbuf_sz; -}; - -extern bool pl330_filter(struct dma_chan *chan, void *param); -#endif /* __AMBA_PL330_H_ */ -- cgit v1.2.3 From dbc049eee73004db996cc8f63754f8dd5f86d0f7 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Wed, 15 Mar 2017 12:10:00 +0530 Subject: mailbox: Add driver for Broadcom FlexRM ring manager Some of the Broadcom iProc SoCs have FlexRM ring manager which provides a ring-based programming interface to various offload engines (e.g. RAID, Crypto, etc). This patch adds a common mailbox driver for Broadcom FlexRM ring manager which can be shared by various offload engine drivers (implemented as mailbox clients). Reviewed-by: Ray Jui Reviewed-by: Scott Branden Reviewed-by: Pramod KUMAR Signed-off-by: Anup Patel Signed-off-by: Jassi Brar --- drivers/mailbox/Kconfig | 11 + drivers/mailbox/Makefile | 2 + drivers/mailbox/bcm-flexrm-mailbox.c | 1595 ++++++++++++++++++++++++++++++++++ include/linux/mailbox/brcm-message.h | 14 +- 4 files changed, 1618 insertions(+), 4 deletions(-) create mode 100644 drivers/mailbox/bcm-flexrm-mailbox.c (limited to 'include/linux') diff --git a/drivers/mailbox/Kconfig b/drivers/mailbox/Kconfig index 91eafb56568e..0b6f25e26c7c 100644 --- a/drivers/mailbox/Kconfig +++ b/drivers/mailbox/Kconfig @@ -151,4 +151,15 @@ config BCM_PDC_MBOX Mailbox implementation for the Broadcom FlexSparx DMA ring manager, which provides access to various offload engines on Broadcom SoCs, including FA2/FA+ on Northstar Plus and PDC on Northstar 2. + +config BCM_FLEXRM_MBOX + tristate "Broadcom FlexRM Mailbox" + depends on ARM64 || COMPILE_TEST + depends on HAS_DMA + select GENERIC_MSI_IRQ_DOMAIN + default ARCH_BCM_IPROC + help + Mailbox implementation of the Broadcom FlexRM ring manager, + which provides access to various offload engines on Broadcom + SoCs. Say Y here if you want to use the Broadcom FlexRM. endif diff --git a/drivers/mailbox/Makefile b/drivers/mailbox/Makefile index 7dde4f609ae8..e2bcb03cd35b 100644 --- a/drivers/mailbox/Makefile +++ b/drivers/mailbox/Makefile @@ -30,4 +30,6 @@ obj-$(CONFIG_HI6220_MBOX) += hi6220-mailbox.o obj-$(CONFIG_BCM_PDC_MBOX) += bcm-pdc-mailbox.o +obj-$(CONFIG_BCM_FLEXRM_MBOX) += bcm-flexrm-mailbox.o + obj-$(CONFIG_TEGRA_HSP_MBOX) += tegra-hsp.o diff --git a/drivers/mailbox/bcm-flexrm-mailbox.c b/drivers/mailbox/bcm-flexrm-mailbox.c new file mode 100644 index 000000000000..da67882caa7b --- /dev/null +++ b/drivers/mailbox/bcm-flexrm-mailbox.c @@ -0,0 +1,1595 @@ +/* Broadcom FlexRM Mailbox Driver + * + * Copyright (C) 2017 Broadcom + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Each Broadcom FlexSparx4 offload engine is implemented as an + * extension to Broadcom FlexRM ring manager. The FlexRM ring + * manager provides a set of rings which can be used to submit + * work to a FlexSparx4 offload engine. 
+ * + * This driver creates a mailbox controller using a set of FlexRM + * rings where each mailbox channel represents a separate FlexRM ring. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* ====== FlexRM register defines ===== */ + +/* FlexRM configuration */ +#define RING_REGS_SIZE 0x10000 +#define RING_DESC_SIZE 8 +#define RING_DESC_INDEX(offset) \ + ((offset) / RING_DESC_SIZE) +#define RING_DESC_OFFSET(index) \ + ((index) * RING_DESC_SIZE) +#define RING_MAX_REQ_COUNT 1024 +#define RING_BD_ALIGN_ORDER 12 +#define RING_BD_ALIGN_CHECK(addr) \ + (!((addr) & ((0x1 << RING_BD_ALIGN_ORDER) - 1))) +#define RING_BD_TOGGLE_INVALID(offset) \ + (((offset) >> RING_BD_ALIGN_ORDER) & 0x1) +#define RING_BD_TOGGLE_VALID(offset) \ + (!RING_BD_TOGGLE_INVALID(offset)) +#define RING_BD_DESC_PER_REQ 32 +#define RING_BD_DESC_COUNT \ + (RING_MAX_REQ_COUNT * RING_BD_DESC_PER_REQ) +#define RING_BD_SIZE \ + (RING_BD_DESC_COUNT * RING_DESC_SIZE) +#define RING_CMPL_ALIGN_ORDER 13 +#define RING_CMPL_DESC_COUNT RING_MAX_REQ_COUNT +#define RING_CMPL_SIZE \ + (RING_CMPL_DESC_COUNT * RING_DESC_SIZE) +#define RING_VER_MAGIC 0x76303031 + +/* Per-Ring register offsets */ +#define RING_VER 0x000 +#define RING_BD_START_ADDR 0x004 +#define RING_BD_READ_PTR 0x008 +#define RING_BD_WRITE_PTR 0x00c +#define RING_BD_READ_PTR_DDR_LS 0x010 +#define RING_BD_READ_PTR_DDR_MS 0x014 +#define RING_CMPL_START_ADDR 0x018 +#define RING_CMPL_WRITE_PTR 0x01c +#define RING_NUM_REQ_RECV_LS 0x020 +#define RING_NUM_REQ_RECV_MS 0x024 +#define RING_NUM_REQ_TRANS_LS 0x028 +#define RING_NUM_REQ_TRANS_MS 0x02c +#define RING_NUM_REQ_OUTSTAND 0x030 +#define RING_CONTROL 0x034 +#define RING_FLUSH_DONE 0x038 +#define RING_MSI_ADDR_LS 0x03c +#define RING_MSI_ADDR_MS 0x040 +#define RING_MSI_CONTROL 0x048 +#define RING_BD_READ_PTR_DDR_CONTROL 0x04c +#define RING_MSI_DATA_VALUE 0x064 + +/* Register RING_BD_START_ADDR fields */ +#define BD_LAST_UPDATE_HW_SHIFT 28 +#define BD_LAST_UPDATE_HW_MASK 0x1 +#define BD_START_ADDR_VALUE(pa) \ + ((u32)((((dma_addr_t)(pa)) >> RING_BD_ALIGN_ORDER) & 0x0fffffff)) +#define BD_START_ADDR_DECODE(val) \ + ((dma_addr_t)((val) & 0x0fffffff) << RING_BD_ALIGN_ORDER) + +/* Register RING_CMPL_START_ADDR fields */ +#define CMPL_START_ADDR_VALUE(pa) \ + ((u32)((((u64)(pa)) >> RING_CMPL_ALIGN_ORDER) & 0x03ffffff)) + +/* Register RING_CONTROL fields */ +#define CONTROL_MASK_DISABLE_CONTROL 12 +#define CONTROL_FLUSH_SHIFT 5 +#define CONTROL_ACTIVE_SHIFT 4 +#define CONTROL_RATE_ADAPT_MASK 0xf +#define CONTROL_RATE_DYNAMIC 0x0 +#define CONTROL_RATE_FAST 0x8 +#define CONTROL_RATE_MEDIUM 0x9 +#define CONTROL_RATE_SLOW 0xa +#define CONTROL_RATE_IDLE 0xb + +/* Register RING_FLUSH_DONE fields */ +#define FLUSH_DONE_MASK 0x1 + +/* Register RING_MSI_CONTROL fields */ +#define MSI_TIMER_VAL_SHIFT 16 +#define MSI_TIMER_VAL_MASK 0xffff +#define MSI_ENABLE_SHIFT 15 +#define MSI_ENABLE_MASK 0x1 +#define MSI_COUNT_SHIFT 0 +#define MSI_COUNT_MASK 0x3ff + +/* Register RING_BD_READ_PTR_DDR_CONTROL fields */ +#define BD_READ_PTR_DDR_TIMER_VAL_SHIFT 16 +#define BD_READ_PTR_DDR_TIMER_VAL_MASK 0xffff +#define BD_READ_PTR_DDR_ENABLE_SHIFT 15 +#define BD_READ_PTR_DDR_ENABLE_MASK 0x1 + +/* ====== FlexRM ring descriptor defines ===== */ + +/* Completion descriptor format */ +#define CMPL_OPAQUE_SHIFT 0 +#define CMPL_OPAQUE_MASK 0xffff +#define CMPL_ENGINE_STATUS_SHIFT 16 +#define CMPL_ENGINE_STATUS_MASK 0xffff 
+#define CMPL_DME_STATUS_SHIFT 32 +#define CMPL_DME_STATUS_MASK 0xffff +#define CMPL_RM_STATUS_SHIFT 48 +#define CMPL_RM_STATUS_MASK 0xffff + +/* Completion DME status code */ +#define DME_STATUS_MEM_COR_ERR BIT(0) +#define DME_STATUS_MEM_UCOR_ERR BIT(1) +#define DME_STATUS_FIFO_UNDERFLOW BIT(2) +#define DME_STATUS_FIFO_OVERFLOW BIT(3) +#define DME_STATUS_RRESP_ERR BIT(4) +#define DME_STATUS_BRESP_ERR BIT(5) +#define DME_STATUS_ERROR_MASK (DME_STATUS_MEM_COR_ERR | \ + DME_STATUS_MEM_UCOR_ERR | \ + DME_STATUS_FIFO_UNDERFLOW | \ + DME_STATUS_FIFO_OVERFLOW | \ + DME_STATUS_RRESP_ERR | \ + DME_STATUS_BRESP_ERR) + +/* Completion RM status code */ +#define RM_STATUS_CODE_SHIFT 0 +#define RM_STATUS_CODE_MASK 0x3ff +#define RM_STATUS_CODE_GOOD 0x0 +#define RM_STATUS_CODE_AE_TIMEOUT 0x3ff + +/* General descriptor format */ +#define DESC_TYPE_SHIFT 60 +#define DESC_TYPE_MASK 0xf +#define DESC_PAYLOAD_SHIFT 0 +#define DESC_PAYLOAD_MASK 0x0fffffffffffffff + +/* Null descriptor format */ +#define NULL_TYPE 0 +#define NULL_TOGGLE_SHIFT 58 +#define NULL_TOGGLE_MASK 0x1 + +/* Header descriptor format */ +#define HEADER_TYPE 1 +#define HEADER_TOGGLE_SHIFT 58 +#define HEADER_TOGGLE_MASK 0x1 +#define HEADER_ENDPKT_SHIFT 57 +#define HEADER_ENDPKT_MASK 0x1 +#define HEADER_STARTPKT_SHIFT 56 +#define HEADER_STARTPKT_MASK 0x1 +#define HEADER_BDCOUNT_SHIFT 36 +#define HEADER_BDCOUNT_MASK 0x1f +#define HEADER_BDCOUNT_MAX HEADER_BDCOUNT_MASK +#define HEADER_FLAGS_SHIFT 16 +#define HEADER_FLAGS_MASK 0xffff +#define HEADER_OPAQUE_SHIFT 0 +#define HEADER_OPAQUE_MASK 0xffff + +/* Source (SRC) descriptor format */ +#define SRC_TYPE 2 +#define SRC_LENGTH_SHIFT 44 +#define SRC_LENGTH_MASK 0xffff +#define SRC_ADDR_SHIFT 0 +#define SRC_ADDR_MASK 0x00000fffffffffff + +/* Destination (DST) descriptor format */ +#define DST_TYPE 3 +#define DST_LENGTH_SHIFT 44 +#define DST_LENGTH_MASK 0xffff +#define DST_ADDR_SHIFT 0 +#define DST_ADDR_MASK 0x00000fffffffffff + +/* Immediate (IMM) descriptor format */ +#define IMM_TYPE 4 +#define IMM_DATA_SHIFT 0 +#define IMM_DATA_MASK 0x0fffffffffffffff + +/* Next pointer (NPTR) descriptor format */ +#define NPTR_TYPE 5 +#define NPTR_TOGGLE_SHIFT 58 +#define NPTR_TOGGLE_MASK 0x1 +#define NPTR_ADDR_SHIFT 0 +#define NPTR_ADDR_MASK 0x00000fffffffffff + +/* Mega source (MSRC) descriptor format */ +#define MSRC_TYPE 6 +#define MSRC_LENGTH_SHIFT 44 +#define MSRC_LENGTH_MASK 0xffff +#define MSRC_ADDR_SHIFT 0 +#define MSRC_ADDR_MASK 0x00000fffffffffff + +/* Mega destination (MDST) descriptor format */ +#define MDST_TYPE 7 +#define MDST_LENGTH_SHIFT 44 +#define MDST_LENGTH_MASK 0xffff +#define MDST_ADDR_SHIFT 0 +#define MDST_ADDR_MASK 0x00000fffffffffff + +/* Source with tlast (SRCT) descriptor format */ +#define SRCT_TYPE 8 +#define SRCT_LENGTH_SHIFT 44 +#define SRCT_LENGTH_MASK 0xffff +#define SRCT_ADDR_SHIFT 0 +#define SRCT_ADDR_MASK 0x00000fffffffffff + +/* Destination with tlast (DSTT) descriptor format */ +#define DSTT_TYPE 9 +#define DSTT_LENGTH_SHIFT 44 +#define DSTT_LENGTH_MASK 0xffff +#define DSTT_ADDR_SHIFT 0 +#define DSTT_ADDR_MASK 0x00000fffffffffff + +/* Immediate with tlast (IMMT) descriptor format */ +#define IMMT_TYPE 10 +#define IMMT_DATA_SHIFT 0 +#define IMMT_DATA_MASK 0x0fffffffffffffff + +/* Descriptor helper macros */ +#define DESC_DEC(_d, _s, _m) (((_d) >> (_s)) & (_m)) +#define DESC_ENC(_d, _v, _s, _m) \ + do { \ + (_d) &= ~((u64)(_m) << (_s)); \ + (_d) |= (((u64)(_v) & (_m)) << (_s)); \ + } while (0) + +/* ====== FlexRM data structures ===== */ + +struct flexrm_ring { + /* 
+/* ====== FlexRM data structures ===== */
+
+struct flexrm_ring {
+	/* Unprotected members */
+	int num;
+	struct flexrm_mbox *mbox;
+	void __iomem *regs;
+	bool irq_requested;
+	unsigned int irq;
+	unsigned int msi_timer_val;
+	unsigned int msi_count_threshold;
+	struct ida requests_ida;
+	struct brcm_message *requests[RING_MAX_REQ_COUNT];
+	void *bd_base;
+	dma_addr_t bd_dma_base;
+	u32 bd_write_offset;
+	void *cmpl_base;
+	dma_addr_t cmpl_dma_base;
+	/* Protected members */
+	spinlock_t lock;
+	struct brcm_message *last_pending_msg;
+	u32 cmpl_read_offset;
+};
+
+struct flexrm_mbox {
+	struct device *dev;
+	void __iomem *regs;
+	u32 num_rings;
+	struct flexrm_ring *rings;
+	struct dma_pool *bd_pool;
+	struct dma_pool *cmpl_pool;
+	struct mbox_controller controller;
+};
+
+/* ====== FlexRM ring descriptor helper routines ===== */
+
+static u64 flexrm_read_desc(void *desc_ptr)
+{
+	return le64_to_cpu(*((u64 *)desc_ptr));
+}
+
+static void flexrm_write_desc(void *desc_ptr, u64 desc)
+{
+	*((u64 *)desc_ptr) = cpu_to_le64(desc);
+}
+
+static u32 flexrm_cmpl_desc_to_reqid(u64 cmpl_desc)
+{
+	return (u32)(cmpl_desc & CMPL_OPAQUE_MASK);
+}
+
+static int flexrm_cmpl_desc_to_error(u64 cmpl_desc)
+{
+	u32 status;
+
+	status = DESC_DEC(cmpl_desc, CMPL_DME_STATUS_SHIFT,
+			  CMPL_DME_STATUS_MASK);
+	if (status & DME_STATUS_ERROR_MASK)
+		return -EIO;
+
+	status = DESC_DEC(cmpl_desc, CMPL_RM_STATUS_SHIFT,
+			  CMPL_RM_STATUS_MASK);
+	status &= RM_STATUS_CODE_MASK;
+	if (status == RM_STATUS_CODE_AE_TIMEOUT)
+		return -ETIMEDOUT;
+
+	return 0;
+}
+
+static bool flexrm_is_next_table_desc(void *desc_ptr)
+{
+	u64 desc = flexrm_read_desc(desc_ptr);
+	u32 type = DESC_DEC(desc, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+
+	return (type == NPTR_TYPE) ? true : false;
+}
+
+static u64 flexrm_next_table_desc(u32 toggle, dma_addr_t next_addr)
+{
+	u64 desc = 0;
+
+	DESC_ENC(desc, NPTR_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+	DESC_ENC(desc, toggle, NPTR_TOGGLE_SHIFT, NPTR_TOGGLE_MASK);
+	DESC_ENC(desc, next_addr, NPTR_ADDR_SHIFT, NPTR_ADDR_MASK);
+
+	return desc;
+}
+
+static u64 flexrm_null_desc(u32 toggle)
+{
+	u64 desc = 0;
+
+	DESC_ENC(desc, NULL_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+	DESC_ENC(desc, toggle, NULL_TOGGLE_SHIFT, NULL_TOGGLE_MASK);
+
+	return desc;
+}
+
+static u32 flexrm_estimate_header_desc_count(u32 nhcnt)
+{
+	u32 hcnt = nhcnt / HEADER_BDCOUNT_MAX;
+
+	if (!(nhcnt % HEADER_BDCOUNT_MAX))
+		hcnt += 1;
+
+	return hcnt;
+}
+
+static void flexrm_flip_header_toogle(void *desc_ptr)
+{
+	u64 desc = flexrm_read_desc(desc_ptr);
+
+	if (desc & ((u64)0x1 << HEADER_TOGGLE_SHIFT))
+		desc &= ~((u64)0x1 << HEADER_TOGGLE_SHIFT);
+	else
+		desc |= ((u64)0x1 << HEADER_TOGGLE_SHIFT);
+
+	flexrm_write_desc(desc_ptr, desc);
+}
+
+static u64 flexrm_header_desc(u32 toggle, u32 startpkt, u32 endpkt,
+			      u32 bdcount, u32 flags, u32 opaque)
+{
+	u64 desc = 0;
+
+	DESC_ENC(desc, HEADER_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+	DESC_ENC(desc, toggle, HEADER_TOGGLE_SHIFT, HEADER_TOGGLE_MASK);
+	DESC_ENC(desc, startpkt, HEADER_STARTPKT_SHIFT, HEADER_STARTPKT_MASK);
+	DESC_ENC(desc, endpkt, HEADER_ENDPKT_SHIFT, HEADER_ENDPKT_MASK);
+	DESC_ENC(desc, bdcount, HEADER_BDCOUNT_SHIFT, HEADER_BDCOUNT_MASK);
+	DESC_ENC(desc, flags, HEADER_FLAGS_SHIFT, HEADER_FLAGS_MASK);
+	DESC_ENC(desc, opaque, HEADER_OPAQUE_SHIFT, HEADER_OPAQUE_MASK);
+
+	return desc;
+}
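+
+/*
+ * Worked example (illustrative): a request with 40 non-header
+ * descriptors is emitted by flexrm_enqueue_desc() below as
+ * HEADER(STARTPKT=1, ENDPKT=0, BDCOUNT=31), 31 non-header
+ * descriptors, HEADER(STARTPKT=0, ENDPKT=1, BDCOUNT=9), and the
+ * remaining 9 non-header descriptors.
+ */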
+static void flexrm_enqueue_desc(u32 nhpos, u32 nhcnt, u32 reqid,
+				u64 desc, void **desc_ptr, u32 *toggle,
+				void *start_desc, void *end_desc)
+{
+	u64 d;
+	u32 nhavail, _toggle, _startpkt, _endpkt, _bdcount;
+
+	/* Sanity check */
+	if (nhcnt <= nhpos)
+		return;
+
+	/*
+	 * Each request or packet starts with a HEADER descriptor followed
+	 * by one or more non-HEADER descriptors (SRC, SRCT, MSRC, DST,
+	 * DSTT, MDST, IMM, and IMMT). The number of non-HEADER descriptors
+	 * following a HEADER descriptor is represented by the BDCOUNT
+	 * field of the HEADER descriptor. The max value of the BDCOUNT
+	 * field is 31, which means we can only have 31 non-HEADER
+	 * descriptors following one HEADER descriptor.
+	 *
+	 * In general use, the number of non-HEADER descriptors can easily
+	 * go beyond 31. To tackle this situation, we have packet (or
+	 * request) extension bits (STARTPKT and ENDPKT) in the HEADER
+	 * descriptor.
+	 *
+	 * To use packet extension, the first HEADER descriptor of the
+	 * request (or packet) will have STARTPKT=1 and ENDPKT=0. The
+	 * intermediate HEADER descriptors will have STARTPKT=0 and
+	 * ENDPKT=0. The last HEADER descriptor will have STARTPKT=0 and
+	 * ENDPKT=1. Also, the TOGGLE bit of the first HEADER will be set
+	 * to an invalid state to ensure that FlexRM does not start
+	 * fetching descriptors until all descriptors are enqueued. The
+	 * user of this function will flip the TOGGLE bit of the first
+	 * HEADER after all descriptors are enqueued.
+	 */
+
+	if ((nhpos % HEADER_BDCOUNT_MAX == 0) && (nhcnt - nhpos)) {
+		/* Prepare the header descriptor */
+		nhavail = (nhcnt - nhpos);
+		_toggle = (nhpos == 0) ? !(*toggle) : (*toggle);
+		_startpkt = (nhpos == 0) ? 0x1 : 0x0;
+		_endpkt = (nhavail <= HEADER_BDCOUNT_MAX) ? 0x1 : 0x0;
+		_bdcount = (nhavail <= HEADER_BDCOUNT_MAX) ?
+				nhavail : HEADER_BDCOUNT_MAX;
+		d = flexrm_header_desc(_toggle, _startpkt, _endpkt,
+				       _bdcount, 0x0, reqid);
+
+		/* Write header descriptor */
+		flexrm_write_desc(*desc_ptr, d);
+
+		/* Point to next descriptor */
+		*desc_ptr += sizeof(desc);
+		if (*desc_ptr == end_desc)
+			*desc_ptr = start_desc;
+
+		/* Skip next pointer descriptors */
+		while (flexrm_is_next_table_desc(*desc_ptr)) {
+			*toggle = (*toggle) ? 0 : 1;
+			*desc_ptr += sizeof(desc);
+			if (*desc_ptr == end_desc)
+				*desc_ptr = start_desc;
+		}
+	}
+
+	/* Write desired descriptor */
+	flexrm_write_desc(*desc_ptr, desc);
+
+	/* Point to next descriptor */
+	*desc_ptr += sizeof(desc);
+	if (*desc_ptr == end_desc)
+		*desc_ptr = start_desc;
+
+	/* Skip next pointer descriptors */
+	while (flexrm_is_next_table_desc(*desc_ptr)) {
+		*toggle = (*toggle) ?
0 : 1; + *desc_ptr += sizeof(desc); + if (*desc_ptr == end_desc) + *desc_ptr = start_desc; + } +} + +static u64 flexrm_src_desc(dma_addr_t addr, unsigned int length) +{ + u64 desc = 0; + + DESC_ENC(desc, SRC_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK); + DESC_ENC(desc, length, SRC_LENGTH_SHIFT, SRC_LENGTH_MASK); + DESC_ENC(desc, addr, SRC_ADDR_SHIFT, SRC_ADDR_MASK); + + return desc; +} + +static u64 flexrm_msrc_desc(dma_addr_t addr, unsigned int length_div_16) +{ + u64 desc = 0; + + DESC_ENC(desc, MSRC_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK); + DESC_ENC(desc, length_div_16, MSRC_LENGTH_SHIFT, MSRC_LENGTH_MASK); + DESC_ENC(desc, addr, MSRC_ADDR_SHIFT, MSRC_ADDR_MASK); + + return desc; +} + +static u64 flexrm_dst_desc(dma_addr_t addr, unsigned int length) +{ + u64 desc = 0; + + DESC_ENC(desc, DST_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK); + DESC_ENC(desc, length, DST_LENGTH_SHIFT, DST_LENGTH_MASK); + DESC_ENC(desc, addr, DST_ADDR_SHIFT, DST_ADDR_MASK); + + return desc; +} + +static u64 flexrm_mdst_desc(dma_addr_t addr, unsigned int length_div_16) +{ + u64 desc = 0; + + DESC_ENC(desc, MDST_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK); + DESC_ENC(desc, length_div_16, MDST_LENGTH_SHIFT, MDST_LENGTH_MASK); + DESC_ENC(desc, addr, MDST_ADDR_SHIFT, MDST_ADDR_MASK); + + return desc; +} + +static u64 flexrm_imm_desc(u64 data) +{ + u64 desc = 0; + + DESC_ENC(desc, IMM_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK); + DESC_ENC(desc, data, IMM_DATA_SHIFT, IMM_DATA_MASK); + + return desc; +} + +static u64 flexrm_srct_desc(dma_addr_t addr, unsigned int length) +{ + u64 desc = 0; + + DESC_ENC(desc, SRCT_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK); + DESC_ENC(desc, length, SRCT_LENGTH_SHIFT, SRCT_LENGTH_MASK); + DESC_ENC(desc, addr, SRCT_ADDR_SHIFT, SRCT_ADDR_MASK); + + return desc; +} + +static u64 flexrm_dstt_desc(dma_addr_t addr, unsigned int length) +{ + u64 desc = 0; + + DESC_ENC(desc, DSTT_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK); + DESC_ENC(desc, length, DSTT_LENGTH_SHIFT, DSTT_LENGTH_MASK); + DESC_ENC(desc, addr, DSTT_ADDR_SHIFT, DSTT_ADDR_MASK); + + return desc; +} + +static u64 flexrm_immt_desc(u64 data) +{ + u64 desc = 0; + + DESC_ENC(desc, IMMT_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK); + DESC_ENC(desc, data, IMMT_DATA_SHIFT, IMMT_DATA_MASK); + + return desc; +} + +static bool flexrm_spu_sanity_check(struct brcm_message *msg) +{ + struct scatterlist *sg; + + if (!msg->spu.src || !msg->spu.dst) + return false; + for (sg = msg->spu.src; sg; sg = sg_next(sg)) { + if (sg->length & 0xf) { + if (sg->length > SRC_LENGTH_MASK) + return false; + } else { + if (sg->length > (MSRC_LENGTH_MASK * 16)) + return false; + } + } + for (sg = msg->spu.dst; sg; sg = sg_next(sg)) { + if (sg->length & 0xf) { + if (sg->length > DST_LENGTH_MASK) + return false; + } else { + if (sg->length > (MDST_LENGTH_MASK * 16)) + return false; + } + } + + return true; +} + +static u32 flexrm_spu_estimate_nonheader_desc_count(struct brcm_message *msg) +{ + u32 cnt = 0; + unsigned int dst_target = 0; + struct scatterlist *src_sg = msg->spu.src, *dst_sg = msg->spu.dst; + + while (src_sg || dst_sg) { + if (src_sg) { + cnt++; + dst_target = src_sg->length; + src_sg = sg_next(src_sg); + } else + dst_target = UINT_MAX; + + while (dst_target && dst_sg) { + cnt++; + if (dst_sg->length < dst_target) + dst_target -= dst_sg->length; + else + dst_target = 0; + dst_sg = sg_next(dst_sg); + } + } + + return cnt; +} + +static int flexrm_spu_dma_map(struct device *dev, struct brcm_message *msg) +{ + int rc; + + rc = dma_map_sg(dev, msg->spu.src, sg_nents(msg->spu.src), + 
DMA_TO_DEVICE); + if (rc < 0) + return rc; + + rc = dma_map_sg(dev, msg->spu.dst, sg_nents(msg->spu.dst), + DMA_FROM_DEVICE); + if (rc < 0) { + dma_unmap_sg(dev, msg->spu.src, sg_nents(msg->spu.src), + DMA_TO_DEVICE); + return rc; + } + + return 0; +} + +static void flexrm_spu_dma_unmap(struct device *dev, struct brcm_message *msg) +{ + dma_unmap_sg(dev, msg->spu.dst, sg_nents(msg->spu.dst), + DMA_FROM_DEVICE); + dma_unmap_sg(dev, msg->spu.src, sg_nents(msg->spu.src), + DMA_TO_DEVICE); +} + +static void *flexrm_spu_write_descs(struct brcm_message *msg, u32 nhcnt, + u32 reqid, void *desc_ptr, u32 toggle, + void *start_desc, void *end_desc) +{ + u64 d; + u32 nhpos = 0; + void *orig_desc_ptr = desc_ptr; + unsigned int dst_target = 0; + struct scatterlist *src_sg = msg->spu.src, *dst_sg = msg->spu.dst; + + while (src_sg || dst_sg) { + if (src_sg) { + if (sg_dma_len(src_sg) & 0xf) + d = flexrm_src_desc(sg_dma_address(src_sg), + sg_dma_len(src_sg)); + else + d = flexrm_msrc_desc(sg_dma_address(src_sg), + sg_dma_len(src_sg)/16); + flexrm_enqueue_desc(nhpos, nhcnt, reqid, + d, &desc_ptr, &toggle, + start_desc, end_desc); + nhpos++; + dst_target = sg_dma_len(src_sg); + src_sg = sg_next(src_sg); + } else + dst_target = UINT_MAX; + + while (dst_target && dst_sg) { + if (sg_dma_len(dst_sg) & 0xf) + d = flexrm_dst_desc(sg_dma_address(dst_sg), + sg_dma_len(dst_sg)); + else + d = flexrm_mdst_desc(sg_dma_address(dst_sg), + sg_dma_len(dst_sg)/16); + flexrm_enqueue_desc(nhpos, nhcnt, reqid, + d, &desc_ptr, &toggle, + start_desc, end_desc); + nhpos++; + if (sg_dma_len(dst_sg) < dst_target) + dst_target -= sg_dma_len(dst_sg); + else + dst_target = 0; + dst_sg = sg_next(dst_sg); + } + } + + /* Null descriptor with invalid toggle bit */ + flexrm_write_desc(desc_ptr, flexrm_null_desc(!toggle)); + + /* Ensure that descriptors have been written to memory */ + wmb(); + + /* Flip toggle bit in header */ + flexrm_flip_header_toogle(orig_desc_ptr); + + return desc_ptr; +} + +static bool flexrm_sba_sanity_check(struct brcm_message *msg) +{ + u32 i; + + if (!msg->sba.cmds || !msg->sba.cmds_count) + return false; + + for (i = 0; i < msg->sba.cmds_count; i++) { + if (((msg->sba.cmds[i].flags & BRCM_SBA_CMD_TYPE_B) || + (msg->sba.cmds[i].flags & BRCM_SBA_CMD_TYPE_C)) && + (msg->sba.cmds[i].flags & BRCM_SBA_CMD_HAS_OUTPUT)) + return false; + if ((msg->sba.cmds[i].flags & BRCM_SBA_CMD_TYPE_B) && + (msg->sba.cmds[i].data_len > SRCT_LENGTH_MASK)) + return false; + if ((msg->sba.cmds[i].flags & BRCM_SBA_CMD_TYPE_C) && + (msg->sba.cmds[i].data_len > SRCT_LENGTH_MASK)) + return false; + if ((msg->sba.cmds[i].flags & BRCM_SBA_CMD_HAS_RESP) && + (msg->sba.cmds[i].resp_len > DSTT_LENGTH_MASK)) + return false; + if ((msg->sba.cmds[i].flags & BRCM_SBA_CMD_HAS_OUTPUT) && + (msg->sba.cmds[i].data_len > DSTT_LENGTH_MASK)) + return false; + } + + return true; +} + +static u32 flexrm_sba_estimate_nonheader_desc_count(struct brcm_message *msg) +{ + u32 i, cnt; + + cnt = 0; + for (i = 0; i < msg->sba.cmds_count; i++) { + cnt++; + + if ((msg->sba.cmds[i].flags & BRCM_SBA_CMD_TYPE_B) || + (msg->sba.cmds[i].flags & BRCM_SBA_CMD_TYPE_C)) + cnt++; + + if (msg->sba.cmds[i].flags & BRCM_SBA_CMD_HAS_RESP) + cnt++; + + if (msg->sba.cmds[i].flags & BRCM_SBA_CMD_HAS_OUTPUT) + cnt++; + } + + return cnt; +} + +static void *flexrm_sba_write_descs(struct brcm_message *msg, u32 nhcnt, + u32 reqid, void *desc_ptr, u32 toggle, + void *start_desc, void *end_desc) +{ + u64 d; + u32 i, nhpos = 0; + struct brcm_sba_command *c; + void *orig_desc_ptr = desc_ptr; + 
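+	/*
+	 * For each SBA command, the loop below emits descriptors in this
+	 * order: an optional response descriptor (DST if an output buffer
+	 * follows, DSTT otherwise), an optional output descriptor (DSTT),
+	 * the command word itself (IMM for type-B commands, IMMT
+	 * otherwise), and an optional input data descriptor (SRCT, for
+	 * type-B and type-C commands). This mirrors the counting done in
+	 * flexrm_sba_estimate_nonheader_desc_count() above.
+	 */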
+ /* Convert SBA commands into descriptors */ + for (i = 0; i < msg->sba.cmds_count; i++) { + c = &msg->sba.cmds[i]; + + if ((c->flags & BRCM_SBA_CMD_HAS_RESP) && + (c->flags & BRCM_SBA_CMD_HAS_OUTPUT)) { + /* Destination response descriptor */ + d = flexrm_dst_desc(c->resp, c->resp_len); + flexrm_enqueue_desc(nhpos, nhcnt, reqid, + d, &desc_ptr, &toggle, + start_desc, end_desc); + nhpos++; + } else if (c->flags & BRCM_SBA_CMD_HAS_RESP) { + /* Destination response with tlast descriptor */ + d = flexrm_dstt_desc(c->resp, c->resp_len); + flexrm_enqueue_desc(nhpos, nhcnt, reqid, + d, &desc_ptr, &toggle, + start_desc, end_desc); + nhpos++; + } + + if (c->flags & BRCM_SBA_CMD_HAS_OUTPUT) { + /* Destination with tlast descriptor */ + d = flexrm_dstt_desc(c->data, c->data_len); + flexrm_enqueue_desc(nhpos, nhcnt, reqid, + d, &desc_ptr, &toggle, + start_desc, end_desc); + nhpos++; + } + + if (c->flags & BRCM_SBA_CMD_TYPE_B) { + /* Command as immediate descriptor */ + d = flexrm_imm_desc(c->cmd); + flexrm_enqueue_desc(nhpos, nhcnt, reqid, + d, &desc_ptr, &toggle, + start_desc, end_desc); + nhpos++; + } else { + /* Command as immediate descriptor with tlast */ + d = flexrm_immt_desc(c->cmd); + flexrm_enqueue_desc(nhpos, nhcnt, reqid, + d, &desc_ptr, &toggle, + start_desc, end_desc); + nhpos++; + } + + if ((c->flags & BRCM_SBA_CMD_TYPE_B) || + (c->flags & BRCM_SBA_CMD_TYPE_C)) { + /* Source with tlast descriptor */ + d = flexrm_srct_desc(c->data, c->data_len); + flexrm_enqueue_desc(nhpos, nhcnt, reqid, + d, &desc_ptr, &toggle, + start_desc, end_desc); + nhpos++; + } + } + + /* Null descriptor with invalid toggle bit */ + flexrm_write_desc(desc_ptr, flexrm_null_desc(!toggle)); + + /* Ensure that descriptors have been written to memory */ + wmb(); + + /* Flip toggle bit in header */ + flexrm_flip_header_toogle(orig_desc_ptr); + + return desc_ptr; +} + +static bool flexrm_sanity_check(struct brcm_message *msg) +{ + if (!msg) + return false; + + switch (msg->type) { + case BRCM_MESSAGE_SPU: + return flexrm_spu_sanity_check(msg); + case BRCM_MESSAGE_SBA: + return flexrm_sba_sanity_check(msg); + default: + return false; + }; +} + +static u32 flexrm_estimate_nonheader_desc_count(struct brcm_message *msg) +{ + if (!msg) + return 0; + + switch (msg->type) { + case BRCM_MESSAGE_SPU: + return flexrm_spu_estimate_nonheader_desc_count(msg); + case BRCM_MESSAGE_SBA: + return flexrm_sba_estimate_nonheader_desc_count(msg); + default: + return 0; + }; +} + +static int flexrm_dma_map(struct device *dev, struct brcm_message *msg) +{ + if (!dev || !msg) + return -EINVAL; + + switch (msg->type) { + case BRCM_MESSAGE_SPU: + return flexrm_spu_dma_map(dev, msg); + default: + break; + } + + return 0; +} + +static void flexrm_dma_unmap(struct device *dev, struct brcm_message *msg) +{ + if (!dev || !msg) + return; + + switch (msg->type) { + case BRCM_MESSAGE_SPU: + flexrm_spu_dma_unmap(dev, msg); + break; + default: + break; + } +} + +static void *flexrm_write_descs(struct brcm_message *msg, u32 nhcnt, + u32 reqid, void *desc_ptr, u32 toggle, + void *start_desc, void *end_desc) +{ + if (!msg || !desc_ptr || !start_desc || !end_desc) + return ERR_PTR(-ENOTSUPP); + + if ((desc_ptr < start_desc) || (end_desc <= desc_ptr)) + return ERR_PTR(-ERANGE); + + switch (msg->type) { + case BRCM_MESSAGE_SPU: + return flexrm_spu_write_descs(msg, nhcnt, reqid, + desc_ptr, toggle, + start_desc, end_desc); + case BRCM_MESSAGE_SBA: + return flexrm_sba_write_descs(msg, nhcnt, reqid, + desc_ptr, toggle, + start_desc, end_desc); + default: + return 
ERR_PTR(-ENOTSUPP);
+	};
+}
+
+/* ====== FlexRM driver helper routines ===== */
+
+static int flexrm_new_request(struct flexrm_ring *ring,
+			      struct brcm_message *batch_msg,
+			      struct brcm_message *msg)
+{
+	void *next;
+	unsigned long flags;
+	u32 val, count, nhcnt;
+	u32 read_offset, write_offset;
+	bool exit_cleanup = false;
+	int ret = 0, reqid;
+
+	/* Do sanity check on message */
+	if (!flexrm_sanity_check(msg))
+		return -EIO;
+	msg->error = 0;
+
+	/*
+	 * If no request id is available then park the message as the
+	 * last pending one; flexrm_process_completions() will re-queue
+	 * it once completions free up an id.
+	 */
+	reqid = ida_simple_get(&ring->requests_ida, 0,
+			       RING_MAX_REQ_COUNT, GFP_KERNEL);
+	if (reqid < 0) {
+		spin_lock_irqsave(&ring->lock, flags);
+		if (batch_msg)
+			ring->last_pending_msg = batch_msg;
+		else
+			ring->last_pending_msg = msg;
+		spin_unlock_irqrestore(&ring->lock, flags);
+		return 0;
+	}
+	ring->requests[reqid] = msg;
+
+	/* Do DMA mappings for the message */
+	ret = flexrm_dma_map(ring->mbox->dev, msg);
+	if (ret < 0) {
+		ring->requests[reqid] = NULL;
+		ida_simple_remove(&ring->requests_ida, reqid);
+		return ret;
+	}
+
+	/* If last_pending_msg is already set then goto done with error */
+	spin_lock_irqsave(&ring->lock, flags);
+	if (ring->last_pending_msg)
+		ret = -ENOSPC;
+	spin_unlock_irqrestore(&ring->lock, flags);
+	if (ret < 0) {
+		dev_warn(ring->mbox->dev, "no space in ring %d\n", ring->num);
+		exit_cleanup = true;
+		goto exit;
+	}
+
+	/* Determine current HW BD read offset */
+	read_offset = readl_relaxed(ring->regs + RING_BD_READ_PTR);
+	val = readl_relaxed(ring->regs + RING_BD_START_ADDR);
+	read_offset *= RING_DESC_SIZE;
+	read_offset += (u32)(BD_START_ADDR_DECODE(val) - ring->bd_dma_base);
+
+	/*
+	 * Number of required descriptors = number of non-header descriptors +
+	 *				    number of header descriptors +
+	 *				    1x null descriptor
+	 */
+	nhcnt = flexrm_estimate_nonheader_desc_count(msg);
+	count = flexrm_estimate_header_desc_count(nhcnt) + nhcnt + 1;
+
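+	/*
+	 * The BD write pointer is walked towards the hardware read
+	 * pointer below, decrementing 'count' for every usable slot and
+	 * skipping NPTR (next table pointer) descriptors, which occupy
+	 * slots but never carry request data. If 'count' is still
+	 * non-zero when the two pointers meet, the ring cannot hold this
+	 * request yet, so it is parked in last_pending_msg and retried
+	 * later.
+	 */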
+	/* Check for available descriptor space */
+	write_offset = ring->bd_write_offset;
+	while (count) {
+		if (!flexrm_is_next_table_desc(ring->bd_base + write_offset))
+			count--;
+		write_offset += RING_DESC_SIZE;
+		if (write_offset == RING_BD_SIZE)
+			write_offset = 0x0;
+		if (write_offset == read_offset)
+			break;
+	}
+	if (count) {
+		spin_lock_irqsave(&ring->lock, flags);
+		if (batch_msg)
+			ring->last_pending_msg = batch_msg;
+		else
+			ring->last_pending_msg = msg;
+		spin_unlock_irqrestore(&ring->lock, flags);
+		ret = 0;
+		exit_cleanup = true;
+		goto exit;
+	}
+
+	/* Write descriptors to ring */
+	next = flexrm_write_descs(msg, nhcnt, reqid,
+				  ring->bd_base + ring->bd_write_offset,
+				  RING_BD_TOGGLE_VALID(ring->bd_write_offset),
+				  ring->bd_base, ring->bd_base + RING_BD_SIZE);
+	if (IS_ERR(next)) {
+		ret = PTR_ERR(next);
+		exit_cleanup = true;
+		goto exit;
+	}
+
+	/* Save ring BD write offset */
+	ring->bd_write_offset = (unsigned long)(next - ring->bd_base);
+
+exit:
+	/* Update error status in message */
+	msg->error = ret;
+
+	/* Cleanup if we failed */
+	if (exit_cleanup) {
+		flexrm_dma_unmap(ring->mbox->dev, msg);
+		ring->requests[reqid] = NULL;
+		ida_simple_remove(&ring->requests_ida, reqid);
+	}
+
+	return ret;
+}
+
+static int flexrm_process_completions(struct flexrm_ring *ring)
+{
+	u64 desc;
+	int err, count = 0;
+	unsigned long flags;
+	struct brcm_message *msg = NULL;
+	u32 reqid, cmpl_read_offset, cmpl_write_offset;
+	struct mbox_chan *chan = &ring->mbox->controller.chans[ring->num];
+
+	spin_lock_irqsave(&ring->lock, flags);
+
+	/* Check last_pending_msg */
+	if (ring->last_pending_msg) {
+		msg = ring->last_pending_msg;
+		ring->last_pending_msg = NULL;
+	}
+
+	/*
+	 * Get current completion read and write offset
+	 *
+	 * Note: We should read the completion write pointer at least once
+	 * after we get an MSI interrupt because HW maintains internal
+	 * MSI status which will allow the next MSI interrupt only after
+	 * the completion write pointer is read.
+ */ + cmpl_write_offset = readl_relaxed(ring->regs + RING_CMPL_WRITE_PTR); + cmpl_write_offset *= RING_DESC_SIZE; + cmpl_read_offset = ring->cmpl_read_offset; + ring->cmpl_read_offset = cmpl_write_offset; + + spin_unlock_irqrestore(&ring->lock, flags); + + /* If last_pending_msg was set then queue it back */ + if (msg) + mbox_send_message(chan, msg); + + /* For each completed request notify mailbox clients */ + reqid = 0; + while (cmpl_read_offset != cmpl_write_offset) { + /* Dequeue next completion descriptor */ + desc = *((u64 *)(ring->cmpl_base + cmpl_read_offset)); + + /* Next read offset */ + cmpl_read_offset += RING_DESC_SIZE; + if (cmpl_read_offset == RING_CMPL_SIZE) + cmpl_read_offset = 0; + + /* Decode error from completion descriptor */ + err = flexrm_cmpl_desc_to_error(desc); + if (err < 0) { + dev_warn(ring->mbox->dev, + "got completion desc=0x%lx with error %d", + (unsigned long)desc, err); + } + + /* Determine request id from completion descriptor */ + reqid = flexrm_cmpl_desc_to_reqid(desc); + + /* Determine message pointer based on reqid */ + msg = ring->requests[reqid]; + if (!msg) { + dev_warn(ring->mbox->dev, + "null msg pointer for completion desc=0x%lx", + (unsigned long)desc); + continue; + } + + /* Release reqid for recycling */ + ring->requests[reqid] = NULL; + ida_simple_remove(&ring->requests_ida, reqid); + + /* Unmap DMA mappings */ + flexrm_dma_unmap(ring->mbox->dev, msg); + + /* Give-back message to mailbox client */ + msg->error = err; + mbox_chan_received_data(chan, msg); + + /* Increment number of completions processed */ + count++; + } + + return count; +} + +/* ====== FlexRM interrupt handler ===== */ + +static irqreturn_t flexrm_irq_event(int irq, void *dev_id) +{ + /* We only have MSI for completions so just wakeup IRQ thread */ + /* Ring related errors will be informed via completion descriptors */ + + return IRQ_WAKE_THREAD; +} + +static irqreturn_t flexrm_irq_thread(int irq, void *dev_id) +{ + flexrm_process_completions(dev_id); + + return IRQ_HANDLED; +} + +/* ====== FlexRM mailbox callbacks ===== */ + +static int flexrm_send_data(struct mbox_chan *chan, void *data) +{ + int i, rc; + struct flexrm_ring *ring = chan->con_priv; + struct brcm_message *msg = data; + + if (msg->type == BRCM_MESSAGE_BATCH) { + for (i = msg->batch.msgs_queued; + i < msg->batch.msgs_count; i++) { + rc = flexrm_new_request(ring, msg, + &msg->batch.msgs[i]); + if (rc) { + msg->error = rc; + return rc; + } + msg->batch.msgs_queued++; + } + return 0; + } + + return flexrm_new_request(ring, NULL, data); +} + +static bool flexrm_peek_data(struct mbox_chan *chan) +{ + int cnt = flexrm_process_completions(chan->con_priv); + + return (cnt > 0) ? 
true : false; +} + +static int flexrm_startup(struct mbox_chan *chan) +{ + u64 d; + u32 val, off; + int ret = 0; + dma_addr_t next_addr; + struct flexrm_ring *ring = chan->con_priv; + + /* Allocate BD memory */ + ring->bd_base = dma_pool_alloc(ring->mbox->bd_pool, + GFP_KERNEL, &ring->bd_dma_base); + if (!ring->bd_base) { + dev_err(ring->mbox->dev, "can't allocate BD memory\n"); + ret = -ENOMEM; + goto fail; + } + + /* Configure next table pointer entries in BD memory */ + for (off = 0; off < RING_BD_SIZE; off += RING_DESC_SIZE) { + next_addr = off + RING_DESC_SIZE; + if (next_addr == RING_BD_SIZE) + next_addr = 0; + next_addr += ring->bd_dma_base; + if (RING_BD_ALIGN_CHECK(next_addr)) + d = flexrm_next_table_desc(RING_BD_TOGGLE_VALID(off), + next_addr); + else + d = flexrm_null_desc(RING_BD_TOGGLE_INVALID(off)); + flexrm_write_desc(ring->bd_base + off, d); + } + + /* Allocate completion memory */ + ring->cmpl_base = dma_pool_alloc(ring->mbox->cmpl_pool, + GFP_KERNEL, &ring->cmpl_dma_base); + if (!ring->cmpl_base) { + dev_err(ring->mbox->dev, "can't allocate completion memory\n"); + ret = -ENOMEM; + goto fail_free_bd_memory; + } + memset(ring->cmpl_base, 0, RING_CMPL_SIZE); + + /* Request IRQ */ + if (ring->irq == UINT_MAX) { + dev_err(ring->mbox->dev, "ring IRQ not available\n"); + ret = -ENODEV; + goto fail_free_cmpl_memory; + } + ret = request_threaded_irq(ring->irq, + flexrm_irq_event, + flexrm_irq_thread, + 0, dev_name(ring->mbox->dev), ring); + if (ret) { + dev_err(ring->mbox->dev, "failed to request ring IRQ\n"); + goto fail_free_cmpl_memory; + } + ring->irq_requested = true; + + /* Disable/inactivate ring */ + writel_relaxed(0x0, ring->regs + RING_CONTROL); + + /* Program BD start address */ + val = BD_START_ADDR_VALUE(ring->bd_dma_base); + writel_relaxed(val, ring->regs + RING_BD_START_ADDR); + + /* BD write pointer will be same as HW write pointer */ + ring->bd_write_offset = + readl_relaxed(ring->regs + RING_BD_WRITE_PTR); + ring->bd_write_offset *= RING_DESC_SIZE; + + /* Program completion start address */ + val = CMPL_START_ADDR_VALUE(ring->cmpl_dma_base); + writel_relaxed(val, ring->regs + RING_CMPL_START_ADDR); + + /* Ensure last pending message is cleared */ + ring->last_pending_msg = NULL; + + /* Completion read pointer will be same as HW write pointer */ + ring->cmpl_read_offset = + readl_relaxed(ring->regs + RING_CMPL_WRITE_PTR); + ring->cmpl_read_offset *= RING_DESC_SIZE; + + /* Read ring Tx, Rx, and Outstanding counts to clear */ + readl_relaxed(ring->regs + RING_NUM_REQ_RECV_LS); + readl_relaxed(ring->regs + RING_NUM_REQ_RECV_MS); + readl_relaxed(ring->regs + RING_NUM_REQ_TRANS_LS); + readl_relaxed(ring->regs + RING_NUM_REQ_TRANS_MS); + readl_relaxed(ring->regs + RING_NUM_REQ_OUTSTAND); + + /* Configure RING_MSI_CONTROL */ + val = 0; + val |= (ring->msi_timer_val << MSI_TIMER_VAL_SHIFT); + val |= BIT(MSI_ENABLE_SHIFT); + val |= (ring->msi_count_threshold & MSI_COUNT_MASK) << MSI_COUNT_SHIFT; + writel_relaxed(val, ring->regs + RING_MSI_CONTROL); + + /* Enable/activate ring */ + val = BIT(CONTROL_ACTIVE_SHIFT); + writel_relaxed(val, ring->regs + RING_CONTROL); + + return 0; + +fail_free_cmpl_memory: + dma_pool_free(ring->mbox->cmpl_pool, + ring->cmpl_base, ring->cmpl_dma_base); + ring->cmpl_base = NULL; +fail_free_bd_memory: + dma_pool_free(ring->mbox->bd_pool, + ring->bd_base, ring->bd_dma_base); + ring->bd_base = NULL; +fail: + return ret; +} + +static void flexrm_shutdown(struct mbox_chan *chan) +{ + u32 reqid; + unsigned int timeout; + struct brcm_message *msg; + 
struct flexrm_ring *ring = chan->con_priv; + + /* Disable/inactivate ring */ + writel_relaxed(0x0, ring->regs + RING_CONTROL); + + /* Flush ring with timeout of 1s */ + timeout = 1000; + writel_relaxed(BIT(CONTROL_FLUSH_SHIFT), + ring->regs + RING_CONTROL); + do { + if (readl_relaxed(ring->regs + RING_FLUSH_DONE) & + FLUSH_DONE_MASK) + break; + mdelay(1); + } while (timeout--); + + /* Abort all in-flight requests */ + for (reqid = 0; reqid < RING_MAX_REQ_COUNT; reqid++) { + msg = ring->requests[reqid]; + if (!msg) + continue; + + /* Release reqid for recycling */ + ring->requests[reqid] = NULL; + ida_simple_remove(&ring->requests_ida, reqid); + + /* Unmap DMA mappings */ + flexrm_dma_unmap(ring->mbox->dev, msg); + + /* Give-back message to mailbox client */ + msg->error = -EIO; + mbox_chan_received_data(chan, msg); + } + + /* Release IRQ */ + if (ring->irq_requested) { + free_irq(ring->irq, ring); + ring->irq_requested = false; + } + + /* Free-up completion descriptor ring */ + if (ring->cmpl_base) { + dma_pool_free(ring->mbox->cmpl_pool, + ring->cmpl_base, ring->cmpl_dma_base); + ring->cmpl_base = NULL; + } + + /* Free-up BD descriptor ring */ + if (ring->bd_base) { + dma_pool_free(ring->mbox->bd_pool, + ring->bd_base, ring->bd_dma_base); + ring->bd_base = NULL; + } +} + +static bool flexrm_last_tx_done(struct mbox_chan *chan) +{ + bool ret; + unsigned long flags; + struct flexrm_ring *ring = chan->con_priv; + + spin_lock_irqsave(&ring->lock, flags); + ret = (ring->last_pending_msg) ? false : true; + spin_unlock_irqrestore(&ring->lock, flags); + + return ret; +} + +static const struct mbox_chan_ops flexrm_mbox_chan_ops = { + .send_data = flexrm_send_data, + .startup = flexrm_startup, + .shutdown = flexrm_shutdown, + .last_tx_done = flexrm_last_tx_done, + .peek_data = flexrm_peek_data, +}; + +static struct mbox_chan *flexrm_mbox_of_xlate(struct mbox_controller *cntlr, + const struct of_phandle_args *pa) +{ + struct mbox_chan *chan; + struct flexrm_ring *ring; + + if (pa->args_count < 3) + return ERR_PTR(-EINVAL); + + if (pa->args[0] >= cntlr->num_chans) + return ERR_PTR(-ENOENT); + + if (pa->args[1] > MSI_COUNT_MASK) + return ERR_PTR(-EINVAL); + + if (pa->args[2] > MSI_TIMER_VAL_MASK) + return ERR_PTR(-EINVAL); + + chan = &cntlr->chans[pa->args[0]]; + ring = chan->con_priv; + ring->msi_count_threshold = pa->args[1]; + ring->msi_timer_val = pa->args[2]; + + return chan; +} + +/* ====== FlexRM platform driver ===== */ + +static void flexrm_mbox_msi_write(struct msi_desc *desc, struct msi_msg *msg) +{ + struct device *dev = msi_desc_to_dev(desc); + struct flexrm_mbox *mbox = dev_get_drvdata(dev); + struct flexrm_ring *ring = &mbox->rings[desc->platform.msi_index]; + + /* Configure per-Ring MSI registers */ + writel_relaxed(msg->address_lo, ring->regs + RING_MSI_ADDR_LS); + writel_relaxed(msg->address_hi, ring->regs + RING_MSI_ADDR_MS); + writel_relaxed(msg->data, ring->regs + RING_MSI_DATA_VALUE); +} + +static int flexrm_mbox_probe(struct platform_device *pdev) +{ + int index, ret = 0; + void __iomem *regs; + void __iomem *regs_end; + struct msi_desc *desc; + struct resource *iomem; + struct flexrm_ring *ring; + struct flexrm_mbox *mbox; + struct device *dev = &pdev->dev; + + /* Allocate driver mailbox struct */ + mbox = devm_kzalloc(dev, sizeof(*mbox), GFP_KERNEL); + if (!mbox) { + ret = -ENOMEM; + goto fail; + } + mbox->dev = dev; + platform_set_drvdata(pdev, mbox); + + /* Get resource for registers */ + iomem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!iomem || 
(resource_size(iomem) < RING_REGS_SIZE)) { + ret = -ENODEV; + goto fail; + } + + /* Map registers of all rings */ + mbox->regs = devm_ioremap_resource(&pdev->dev, iomem); + if (IS_ERR(mbox->regs)) { + ret = PTR_ERR(mbox->regs); + dev_err(&pdev->dev, "Failed to remap mailbox regs: %d\n", ret); + goto fail; + } + regs_end = mbox->regs + resource_size(iomem); + + /* Scan and count available rings */ + mbox->num_rings = 0; + for (regs = mbox->regs; regs < regs_end; regs += RING_REGS_SIZE) { + if (readl_relaxed(regs + RING_VER) == RING_VER_MAGIC) + mbox->num_rings++; + } + if (!mbox->num_rings) { + ret = -ENODEV; + goto fail; + } + + /* Allocate driver ring structs */ + ring = devm_kcalloc(dev, mbox->num_rings, sizeof(*ring), GFP_KERNEL); + if (!ring) { + ret = -ENOMEM; + goto fail; + } + mbox->rings = ring; + + /* Initialize members of driver ring structs */ + regs = mbox->regs; + for (index = 0; index < mbox->num_rings; index++) { + ring = &mbox->rings[index]; + ring->num = index; + ring->mbox = mbox; + while ((regs < regs_end) && + (readl_relaxed(regs + RING_VER) != RING_VER_MAGIC)) + regs += RING_REGS_SIZE; + if (regs_end <= regs) { + ret = -ENODEV; + goto fail; + } + ring->regs = regs; + regs += RING_REGS_SIZE; + ring->irq = UINT_MAX; + ring->irq_requested = false; + ring->msi_timer_val = MSI_TIMER_VAL_MASK; + ring->msi_count_threshold = 0x1; + ida_init(&ring->requests_ida); + memset(ring->requests, 0, sizeof(ring->requests)); + ring->bd_base = NULL; + ring->bd_dma_base = 0; + ring->cmpl_base = NULL; + ring->cmpl_dma_base = 0; + spin_lock_init(&ring->lock); + ring->last_pending_msg = NULL; + ring->cmpl_read_offset = 0; + } + + /* FlexRM is capable of 40-bit physical addresses only */ + ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(40)); + if (ret) { + ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32)); + if (ret) + goto fail; + } + + /* Create DMA pool for ring BD memory */ + mbox->bd_pool = dma_pool_create("bd", dev, RING_BD_SIZE, + 1 << RING_BD_ALIGN_ORDER, 0); + if (!mbox->bd_pool) { + ret = -ENOMEM; + goto fail; + } + + /* Create DMA pool for ring completion memory */ + mbox->cmpl_pool = dma_pool_create("cmpl", dev, RING_CMPL_SIZE, + 1 << RING_CMPL_ALIGN_ORDER, 0); + if (!mbox->cmpl_pool) { + ret = -ENOMEM; + goto fail_destroy_bd_pool; + } + + /* Allocate platform MSIs for each ring */ + ret = platform_msi_domain_alloc_irqs(dev, mbox->num_rings, + flexrm_mbox_msi_write); + if (ret) + goto fail_destroy_cmpl_pool; + + /* Save alloced IRQ numbers for each ring */ + for_each_msi_entry(desc, dev) { + ring = &mbox->rings[desc->platform.msi_index]; + ring->irq = desc->irq; + } + + /* Initialize mailbox controller */ + mbox->controller.txdone_irq = false; + mbox->controller.txdone_poll = true; + mbox->controller.txpoll_period = 1; + mbox->controller.ops = &flexrm_mbox_chan_ops; + mbox->controller.dev = dev; + mbox->controller.num_chans = mbox->num_rings; + mbox->controller.of_xlate = flexrm_mbox_of_xlate; + mbox->controller.chans = devm_kcalloc(dev, mbox->num_rings, + sizeof(*mbox->controller.chans), GFP_KERNEL); + if (!mbox->controller.chans) { + ret = -ENOMEM; + goto fail_free_msis; + } + for (index = 0; index < mbox->num_rings; index++) + mbox->controller.chans[index].con_priv = &mbox->rings[index]; + + /* Register mailbox controller */ + ret = mbox_controller_register(&mbox->controller); + if (ret) + goto fail_free_msis; + + dev_info(dev, "registered flexrm mailbox with %d channels\n", + mbox->controller.num_chans); + + return 0; + +fail_free_msis: + 
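+	/* Unwind in the reverse order of allocation */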
platform_msi_domain_free_irqs(dev); +fail_destroy_cmpl_pool: + dma_pool_destroy(mbox->cmpl_pool); +fail_destroy_bd_pool: + dma_pool_destroy(mbox->bd_pool); +fail: + return ret; +} + +static int flexrm_mbox_remove(struct platform_device *pdev) +{ + int index; + struct device *dev = &pdev->dev; + struct flexrm_ring *ring; + struct flexrm_mbox *mbox = platform_get_drvdata(pdev); + + mbox_controller_unregister(&mbox->controller); + + platform_msi_domain_free_irqs(dev); + + dma_pool_destroy(mbox->cmpl_pool); + dma_pool_destroy(mbox->bd_pool); + + for (index = 0; index < mbox->num_rings; index++) { + ring = &mbox->rings[index]; + ida_destroy(&ring->requests_ida); + } + + return 0; +} + +static const struct of_device_id flexrm_mbox_of_match[] = { + { .compatible = "brcm,iproc-flexrm-mbox", }, + {}, +}; +MODULE_DEVICE_TABLE(of, flexrm_mbox_of_match); + +static struct platform_driver flexrm_mbox_driver = { + .driver = { + .name = "brcm-flexrm-mbox", + .of_match_table = flexrm_mbox_of_match, + }, + .probe = flexrm_mbox_probe, + .remove = flexrm_mbox_remove, +}; +module_platform_driver(flexrm_mbox_driver); + +MODULE_AUTHOR("Anup Patel "); +MODULE_DESCRIPTION("Broadcom FlexRM mailbox driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/mailbox/brcm-message.h b/include/linux/mailbox/brcm-message.h index 6b55c938b401..c20b4843fc2d 100644 --- a/include/linux/mailbox/brcm-message.h +++ b/include/linux/mailbox/brcm-message.h @@ -16,6 +16,7 @@ enum brcm_message_type { BRCM_MESSAGE_UNKNOWN = 0, + BRCM_MESSAGE_BATCH, BRCM_MESSAGE_SPU, BRCM_MESSAGE_SBA, BRCM_MESSAGE_MAX, @@ -23,23 +24,28 @@ enum brcm_message_type { struct brcm_sba_command { u64 cmd; + u64 *cmd_dma; + dma_addr_t cmd_dma_addr; #define BRCM_SBA_CMD_TYPE_A BIT(0) #define BRCM_SBA_CMD_TYPE_B BIT(1) #define BRCM_SBA_CMD_TYPE_C BIT(2) #define BRCM_SBA_CMD_HAS_RESP BIT(3) #define BRCM_SBA_CMD_HAS_OUTPUT BIT(4) u64 flags; - dma_addr_t input; - size_t input_len; dma_addr_t resp; size_t resp_len; - dma_addr_t output; - size_t output_len; + dma_addr_t data; + size_t data_len; }; struct brcm_message { enum brcm_message_type type; union { + struct { + struct brcm_message *msgs; + unsigned int msgs_queued; + unsigned int msgs_count; + } batch; struct { struct scatterlist *src; struct scatterlist *dst; -- cgit v1.2.3 From db68ce10c4f0a27c1ff9fa0e789e5c41f8c4ea63 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Mar 2017 21:08:07 -0400 Subject: new helper: uaccess_kernel() Signed-off-by: Al Viro --- arch/alpha/include/asm/uaccess.h | 2 +- arch/arc/include/asm/uaccess.h | 2 +- arch/arm/include/asm/uaccess.h | 2 +- arch/arm/lib/uaccess_with_memcpy.c | 4 ++-- arch/blackfin/kernel/process.c | 2 +- arch/c6x/kernel/sys_c6x.c | 2 +- arch/cris/include/asm/uaccess.h | 2 +- arch/m68k/include/asm/uaccess_mm.h | 2 +- arch/metag/include/asm/uaccess.h | 2 +- arch/mips/include/asm/checksum.h | 4 ++-- arch/mips/include/asm/r4kcache.h | 4 ++-- arch/mips/include/asm/uaccess.h | 2 +- arch/mips/kernel/unaligned.c | 10 +++++----- arch/openrisc/include/asm/uaccess.h | 2 +- arch/parisc/include/asm/futex.h | 2 +- arch/parisc/lib/memcpy.c | 2 +- arch/s390/include/asm/uaccess.h | 2 +- arch/sparc/include/asm/uaccess.h | 2 +- arch/sparc/include/asm/uaccess_32.h | 2 +- arch/um/include/asm/uaccess.h | 2 +- arch/um/kernel/skas/uaccess.c | 10 +++++----- arch/unicore32/include/asm/uaccess.h | 2 +- arch/unicore32/kernel/process.c | 2 +- arch/xtensa/include/asm/uaccess.h | 2 +- block/bsg.c | 2 +- drivers/scsi/sg.c | 2 +- include/linux/uaccess.h | 2 ++ include/rdma/ib.h | 2 +- 
kernel/trace/bpf_trace.c | 2 +- lib/iov_iter.c | 2 +- mm/memory.c | 2 +- security/tomoyo/network.c | 2 +- 32 files changed, 44 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/include/asm/uaccess.h b/arch/alpha/include/asm/uaccess.h index 77c55ce89936..b2f62b98e290 100644 --- a/arch/alpha/include/asm/uaccess.h +++ b/arch/alpha/include/asm/uaccess.h @@ -425,7 +425,7 @@ clear_user(void __user *to, long len) #undef __module_call #define user_addr_max() \ - (segment_eq(get_fs(), USER_DS) ? TASK_SIZE : ~0UL) + (uaccess_kernel() ? ~0UL : TASK_SIZE) extern long strncpy_from_user(char *dest, const char __user *src, long count); extern __must_check long strlen_user(const char __user *str); diff --git a/arch/arc/include/asm/uaccess.h b/arch/arc/include/asm/uaccess.h index d837a53c6e59..ffd14e630aca 100644 --- a/arch/arc/include/asm/uaccess.h +++ b/arch/arc/include/asm/uaccess.h @@ -27,7 +27,7 @@ #include /* for generic string functions */ -#define __kernel_ok (segment_eq(get_fs(), KERNEL_DS)) +#define __kernel_ok (uaccess_kernel()) /* * Algorithmically, for __user_ok() we want do: diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 9677a7cf7987..b63527359d52 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -266,7 +266,7 @@ static inline void set_fs(mm_segment_t fs) #define access_ok(type, addr, size) (__range_ok(addr, size) == 0) #define user_addr_max() \ - (segment_eq(get_fs(), KERNEL_DS) ? ~0UL : get_fs()) + (uaccess_kernel() ? ~0UL : get_fs()) /* * The "__xxx" versions of the user access functions do not verify the diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c index 6bd1089b07e0..9b4ed1728616 100644 --- a/arch/arm/lib/uaccess_with_memcpy.c +++ b/arch/arm/lib/uaccess_with_memcpy.c @@ -90,7 +90,7 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n) unsigned long ua_flags; int atomic; - if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + if (uaccess_kernel()) { memcpy((void *)to, from, n); return 0; } @@ -162,7 +162,7 @@ __clear_user_memset(void __user *addr, unsigned long n) { unsigned long ua_flags; - if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + if (uaccess_kernel()) { memset((void *)addr, 0, n); return 0; } diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 89d5162d4ca6..89814850b08b 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -370,7 +370,7 @@ int _access_ok(unsigned long addr, unsigned long size) /* Check that things do not wrap around */ if (addr > ULONG_MAX - size) return 0; - if (segment_eq(get_fs(), KERNEL_DS)) + if (uaccess_kernel()) return 1; #ifdef CONFIG_MTD_UCLINUX if (1) diff --git a/arch/c6x/kernel/sys_c6x.c b/arch/c6x/kernel/sys_c6x.c index 3e9bdfbee8ad..a742ae259239 100644 --- a/arch/c6x/kernel/sys_c6x.c +++ b/arch/c6x/kernel/sys_c6x.c @@ -23,7 +23,7 @@ int _access_ok(unsigned long addr, unsigned long size) if (!addr || addr > (0xffffffffUL - (size - 1))) goto _bad_access; - if (segment_eq(get_fs(), KERNEL_DS)) + if (uaccess_kernel()) return 1; if (memory_start <= addr && (addr + size - 1) < memory_end) diff --git a/arch/cris/include/asm/uaccess.h b/arch/cris/include/asm/uaccess.h index c2462ef04eaf..5f5b8f53d2d7 100644 --- a/arch/cris/include/asm/uaccess.h +++ b/arch/cris/include/asm/uaccess.h @@ -43,7 +43,7 @@ #define segment_eq(a, b) ((a).seg == (b).seg) -#define __kernel_ok (segment_eq(get_fs(), KERNEL_DS)) +#define __kernel_ok 
(uaccess_kernel()) #define __user_ok(addr, size) \ (((size) <= TASK_SIZE) && ((addr) <= TASK_SIZE-(size))) #define __access_ok(addr, size) (__kernel_ok || __user_ok((addr), (size))) diff --git a/arch/m68k/include/asm/uaccess_mm.h b/arch/m68k/include/asm/uaccess_mm.h index fb72b710759e..14054a4e4216 100644 --- a/arch/m68k/include/asm/uaccess_mm.h +++ b/arch/m68k/include/asm/uaccess_mm.h @@ -375,7 +375,7 @@ __constant_copy_to_user(void __user *to, const void *from, unsigned long n) #define copy_to_user(to, from, n) __copy_to_user(to, from, n) #define user_addr_max() \ - (segment_eq(get_fs(), USER_DS) ? TASK_SIZE : ~0UL) + (uaccess_kernel() ? ~0UL : TASK_SIZE) extern long strncpy_from_user(char *dst, const char __user *src, long count); extern __must_check long strlen_user(const char __user *str); diff --git a/arch/metag/include/asm/uaccess.h b/arch/metag/include/asm/uaccess.h index 7fc5277ae71f..83de7554d2b3 100644 --- a/arch/metag/include/asm/uaccess.h +++ b/arch/metag/include/asm/uaccess.h @@ -24,7 +24,7 @@ #define segment_eq(a, b) ((a).seg == (b).seg) -#define __kernel_ok (segment_eq(get_fs(), KERNEL_DS)) +#define __kernel_ok (uaccess_kernel()) /* * Explicitly allow NULL pointers here. Parts of the kernel such * as readv/writev use access_ok to validate pointers, but want diff --git a/arch/mips/include/asm/checksum.h b/arch/mips/include/asm/checksum.h index c8b574f7e0cc..77cad232a1c6 100644 --- a/arch/mips/include/asm/checksum.h +++ b/arch/mips/include/asm/checksum.h @@ -50,7 +50,7 @@ __wsum csum_partial_copy_from_user(const void __user *src, void *dst, int len, __wsum sum, int *err_ptr) { might_fault(); - if (segment_eq(get_fs(), get_ds())) + if (uaccess_kernel()) return __csum_partial_copy_kernel((__force void *)src, dst, len, sum, err_ptr); else @@ -82,7 +82,7 @@ __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, { might_fault(); if (access_ok(VERIFY_WRITE, dst, len)) { - if (segment_eq(get_fs(), get_ds())) + if (uaccess_kernel()) return __csum_partial_copy_kernel(src, (__force void *)dst, len, sum, err_ptr); diff --git a/arch/mips/include/asm/r4kcache.h b/arch/mips/include/asm/r4kcache.h index 55fd94e6cd0b..7f12d7e27c94 100644 --- a/arch/mips/include/asm/r4kcache.h +++ b/arch/mips/include/asm/r4kcache.h @@ -20,7 +20,7 @@ #include #include #include -#include /* for segment_eq() */ +#include /* for uaccess_kernel() */ extern void (*r4k_blast_dcache)(void); extern void (*r4k_blast_icache)(void); @@ -714,7 +714,7 @@ static inline void protected_blast_##pfx##cache##_range(unsigned long start,\ \ __##pfx##flush_prologue \ \ - if (segment_eq(get_fs(), USER_DS)) { \ + if (!uaccess_kernel()) { \ while (1) { \ protected_cachee_op(hitop, addr); \ if (addr == aend) \ diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h index dd25b312c973..70ca8eee166a 100644 --- a/arch/mips/include/asm/uaccess.h +++ b/arch/mips/include/asm/uaccess.h @@ -88,7 +88,7 @@ static inline bool eva_kernel_access(void) if (!IS_ENABLED(CONFIG_EVA)) return false; - return segment_eq(get_fs(), get_ds()); + return uaccess_kernel(); } /* diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c index 7ed98354fe9d..f806ee56e639 100644 --- a/arch/mips/kernel/unaligned.c +++ b/arch/mips/kernel/unaligned.c @@ -1026,7 +1026,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, goto sigbus; if (IS_ENABLED(CONFIG_EVA)) { - if (segment_eq(get_fs(), get_ds())) + if (uaccess_kernel()) LoadHW(addr, value, res); else LoadHWE(addr, value, res); @@ -1045,7 +1045,7 @@ 
static void emulate_load_store_insn(struct pt_regs *regs, goto sigbus; if (IS_ENABLED(CONFIG_EVA)) { - if (segment_eq(get_fs(), get_ds())) + if (uaccess_kernel()) LoadW(addr, value, res); else LoadWE(addr, value, res); @@ -1064,7 +1064,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, goto sigbus; if (IS_ENABLED(CONFIG_EVA)) { - if (segment_eq(get_fs(), get_ds())) + if (uaccess_kernel()) LoadHWU(addr, value, res); else LoadHWUE(addr, value, res); @@ -1132,7 +1132,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, value = regs->regs[insn.i_format.rt]; if (IS_ENABLED(CONFIG_EVA)) { - if (segment_eq(get_fs(), get_ds())) + if (uaccess_kernel()) StoreHW(addr, value, res); else StoreHWE(addr, value, res); @@ -1152,7 +1152,7 @@ static void emulate_load_store_insn(struct pt_regs *regs, value = regs->regs[insn.i_format.rt]; if (IS_ENABLED(CONFIG_EVA)) { - if (segment_eq(get_fs(), get_ds())) + if (uaccess_kernel()) StoreW(addr, value, res); else StoreWE(addr, value, res); diff --git a/arch/openrisc/include/asm/uaccess.h b/arch/openrisc/include/asm/uaccess.h index 0b0f60444b76..227af1acb8bd 100644 --- a/arch/openrisc/include/asm/uaccess.h +++ b/arch/openrisc/include/asm/uaccess.h @@ -292,7 +292,7 @@ clear_user(void *addr, unsigned long size) } #define user_addr_max() \ - (segment_eq(get_fs(), USER_DS) ? TASK_SIZE : ~0UL) + (uaccess_kernel() ? ~0UL : TASK_SIZE) extern long strncpy_from_user(char *dest, const char __user *src, long count); diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h index ac8bd586ace8..0ba14300cd8e 100644 --- a/arch/parisc/include/asm/futex.h +++ b/arch/parisc/include/asm/futex.h @@ -109,7 +109,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, /* futex.c wants to do a cmpxchg_inatomic on kernel NULL, which is * our gateway page, and causes no end of trouble... */ - if (segment_eq(KERNEL_DS, get_fs()) && !uaddr) + if (uaccess_kernel() && !uaddr) return -EFAULT; if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) diff --git a/arch/parisc/lib/memcpy.c b/arch/parisc/lib/memcpy.c index f82ff10ed974..66f5160136c2 100644 --- a/arch/parisc/lib/memcpy.c +++ b/arch/parisc/lib/memcpy.c @@ -76,7 +76,7 @@ DECLARE_PER_CPU(struct exception_data, exception_data); goto label; \ } while (0) -#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3)) +#define get_user_space() (uaccess_kernel() ? 0 : mfsp(3)) #define get_kernel_space() (0) #define MERGE(w0, sh_1, w1, sh_2) ({ \ diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 9e9a5e8d6cf6..7228ed8da67d 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -37,7 +37,7 @@ static inline void set_fs(mm_segment_t fs) { current->thread.mm_segment = fs; - if (segment_eq(fs, KERNEL_DS)) { + if (uaccess_kernel()) { set_cpu_flag(CIF_ASCE_SECONDARY); __ctl_load(S390_lowcore.kernel_asce, 7, 7); } else { diff --git a/arch/sparc/include/asm/uaccess.h b/arch/sparc/include/asm/uaccess.h index bd56c28fff9f..9e068bf9060a 100644 --- a/arch/sparc/include/asm/uaccess.h +++ b/arch/sparc/include/asm/uaccess.h @@ -7,7 +7,7 @@ #endif #define user_addr_max() \ - (segment_eq(get_fs(), USER_DS) ? TASK_SIZE : ~0UL) + (uaccess_kernel() ? 
~0UL : TASK_SIZE) long strncpy_from_user(char *dest, const char __user *src, long count); diff --git a/arch/sparc/include/asm/uaccess_32.h b/arch/sparc/include/asm/uaccess_32.h index 952d512a64f2..a59a1e81986d 100644 --- a/arch/sparc/include/asm/uaccess_32.h +++ b/arch/sparc/include/asm/uaccess_32.h @@ -36,7 +36,7 @@ * large size and address near to PAGE_OFFSET - a fault will break his intentions. */ #define __user_ok(addr, size) ({ (void)(size); (addr) < STACK_TOP; }) -#define __kernel_ok (segment_eq(get_fs(), KERNEL_DS)) +#define __kernel_ok (uaccess_kernel()) #define __access_ok(addr, size) (__user_ok((addr) & get_fs().seg, (size))) #define access_ok(type, addr, size) \ ({ (void)(type); __access_ok((unsigned long)(addr), size); }) diff --git a/arch/um/include/asm/uaccess.h b/arch/um/include/asm/uaccess.h index 076bdcb0c2ad..e992bb57da5a 100644 --- a/arch/um/include/asm/uaccess.h +++ b/arch/um/include/asm/uaccess.h @@ -45,7 +45,7 @@ static inline int __access_ok(unsigned long addr, unsigned long size) return __addr_range_nowrap(addr, size) && (__under_task_size(addr, size) || __access_ok_vsyscall(addr, size) || - segment_eq(get_fs(), KERNEL_DS)); + uaccess_kernel()); } #endif diff --git a/arch/um/kernel/skas/uaccess.c b/arch/um/kernel/skas/uaccess.c index 85ac8adb069b..22c9f79db8e6 100644 --- a/arch/um/kernel/skas/uaccess.c +++ b/arch/um/kernel/skas/uaccess.c @@ -141,7 +141,7 @@ static int copy_chunk_from_user(unsigned long from, int len, void *arg) long __copy_from_user(void *to, const void __user *from, unsigned long n) { - if (segment_eq(get_fs(), KERNEL_DS)) { + if (uaccess_kernel()) { memcpy(to, (__force void*)from, n); return 0; } @@ -161,7 +161,7 @@ static int copy_chunk_to_user(unsigned long to, int len, void *arg) long __copy_to_user(void __user *to, const void *from, unsigned long n) { - if (segment_eq(get_fs(), KERNEL_DS)) { + if (uaccess_kernel()) { memcpy((__force void *) to, from, n); return 0; } @@ -189,7 +189,7 @@ long __strncpy_from_user(char *dst, const char __user *src, long count) long n; char *ptr = dst; - if (segment_eq(get_fs(), KERNEL_DS)) { + if (uaccess_kernel()) { strncpy(dst, (__force void *) src, count); return strnlen(dst, count); } @@ -210,7 +210,7 @@ static int clear_chunk(unsigned long addr, int len, void *unused) unsigned long __clear_user(void __user *mem, unsigned long len) { - if (segment_eq(get_fs(), KERNEL_DS)) { + if (uaccess_kernel()) { memset((__force void*)mem, 0, len); return 0; } @@ -235,7 +235,7 @@ long __strnlen_user(const void __user *str, long len) { int count = 0, n; - if (segment_eq(get_fs(), KERNEL_DS)) + if (uaccess_kernel()) return strnlen((__force char*)str, len) + 1; n = buffer_op((unsigned long) str, len, 0, strnlen_chunk, &count); diff --git a/arch/unicore32/include/asm/uaccess.h b/arch/unicore32/include/asm/uaccess.h index f60fab718b59..1196c88c2f9b 100644 --- a/arch/unicore32/include/asm/uaccess.h +++ b/arch/unicore32/include/asm/uaccess.h @@ -20,7 +20,7 @@ #define __strnlen_user __strnlen_user #define __clear_user __clear_user -#define __kernel_ok (segment_eq(get_fs(), KERNEL_DS)) +#define __kernel_ok (uaccess_kernel()) #define __user_ok(addr, size) (((size) <= TASK_SIZE) \ && ((addr) <= TASK_SIZE - (size))) #define __access_ok(addr, size) (__kernel_ok || __user_ok((addr), (size))) diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c index d22c1dc7e39e..ddaf78ae6854 100644 --- a/arch/unicore32/kernel/process.c +++ b/arch/unicore32/kernel/process.c @@ -178,7 +178,7 @@ void __show_regs(struct pt_regs 
*regs) buf, interrupts_enabled(regs) ? "n" : "ff", fast_interrupts_enabled(regs) ? "n" : "ff", processor_modes[processor_mode(regs)], - segment_eq(get_fs(), get_ds()) ? "kernel" : "user"); + uaccess_kernel() ? "kernel" : "user"); { unsigned int ctrl; diff --git a/arch/xtensa/include/asm/uaccess.h b/arch/xtensa/include/asm/uaccess.h index bd8861c811ef..26512692e28f 100644 --- a/arch/xtensa/include/asm/uaccess.h +++ b/arch/xtensa/include/asm/uaccess.h @@ -37,7 +37,7 @@ #define segment_eq(a, b) ((a).seg == (b).seg) -#define __kernel_ok (segment_eq(get_fs(), KERNEL_DS)) +#define __kernel_ok (uaccess_kernel()) #define __user_ok(addr, size) \ (((size) <= TASK_SIZE)&&((addr) <= TASK_SIZE-(size))) #define __access_ok(addr, size) (__kernel_ok || __user_ok((addr), (size))) diff --git a/block/bsg.c b/block/bsg.c index 74835dbf0c47..69ccb7801a75 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -650,7 +650,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) dprintk("%s: write %zd bytes\n", bd->name, count); - if (unlikely(segment_eq(get_fs(), KERNEL_DS))) + if (unlikely(uaccess_kernel())) return -EINVAL; bsg_set_block(bd, file); diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 29b86505f796..5d9136a345ec 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -581,7 +581,7 @@ sg_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos) sg_io_hdr_t *hp; unsigned char cmnd[SG_MAX_CDB_SIZE]; - if (unlikely(segment_eq(get_fs(), KERNEL_DS))) + if (unlikely(uaccess_kernel())) return -EINVAL; if ((!(sfp = (Sg_fd *) filp->private_data)) || (!(sdp = sfp->parentdp))) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index b786ca2419b4..9c3ae8706e9d 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -7,6 +7,8 @@ #define VERIFY_READ 0 #define VERIFY_WRITE 1 +#define uaccess_kernel() segment_eq(get_fs(), KERNEL_DS) + #include static __always_inline void pagefault_disabled_inc(void) diff --git a/include/rdma/ib.h b/include/rdma/ib.h index 9b4c22a36931..66dbed0c146d 100644 --- a/include/rdma/ib.h +++ b/include/rdma/ib.h @@ -100,7 +100,7 @@ struct sockaddr_ib { */ static inline bool ib_safe_file_access(struct file *filp) { - return filp->f_cred == current_cred() && segment_eq(get_fs(), USER_DS); + return filp->f_cred == current_cred() && !uaccess_kernel(); } #endif /* _RDMA_IB_H */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cee9802cf3e0..f806dbd66de9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -96,7 +96,7 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, if (unlikely(in_interrupt() || current->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; - if (unlikely(segment_eq(get_fs(), KERNEL_DS))) + if (unlikely(uaccess_kernel())) return -EPERM; if (!access_ok(VERIFY_WRITE, unsafe_ptr, size)) return -EPERM; diff --git a/lib/iov_iter.c b/lib/iov_iter.c index e68604ae3ced..97db876c6862 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -413,7 +413,7 @@ void iov_iter_init(struct iov_iter *i, int direction, size_t count) { /* It will get better. Eventually... 
*/ - if (segment_eq(get_fs(), KERNEL_DS)) { + if (uaccess_kernel()) { direction |= ITER_KVEC; i->type = direction; i->kvec = (struct kvec *)iov; diff --git a/mm/memory.c b/mm/memory.c index a97a4cec2e1f..e8f4e10e770a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4136,7 +4136,7 @@ void __might_fault(const char *file, int line) * get paged out, therefore we'll never actually fault, and the * below annotations will generate false positives. */ - if (segment_eq(get_fs(), KERNEL_DS)) + if (uaccess_kernel()) return; if (pagefault_disabled()) return; diff --git a/security/tomoyo/network.c b/security/tomoyo/network.c index 97527710a72a..6c02ac478247 100644 --- a/security/tomoyo/network.c +++ b/security/tomoyo/network.c @@ -608,7 +608,7 @@ static int tomoyo_check_unix_address(struct sockaddr *addr, static bool tomoyo_kernel_service(void) { /* Nothing to do if I am a kernel service. */ - return segment_eq(get_fs(), KERNEL_DS); + return uaccess_kernel(); } /** -- cgit v1.2.3 From a2c680c6ce386e9ca6cdf362e8b01789126c9bf7 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Tue, 14 Mar 2017 11:18:03 -0400 Subject: firmware/qcom: add qcom_scm_restore_sec_cfg() Signed-off-by: Rob Clark Signed-off-by: Andy Gross --- drivers/firmware/qcom_scm-32.c | 6 ++++++ drivers/firmware/qcom_scm-64.c | 16 ++++++++++++++++ drivers/firmware/qcom_scm.c | 6 ++++++ drivers/firmware/qcom_scm.h | 5 +++++ include/linux/qcom_scm.h | 2 ++ 5 files changed, 35 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/qcom_scm-32.c b/drivers/firmware/qcom_scm-32.c index 8ad226c60374..722e65af588d 100644 --- a/drivers/firmware/qcom_scm-32.c +++ b/drivers/firmware/qcom_scm-32.c @@ -578,3 +578,9 @@ int __qcom_scm_set_remote_state(struct device *dev, u32 state, u32 id) return ret ? : le32_to_cpu(scm_ret); } + +int __qcom_scm_restore_sec_cfg(struct device *dev, u32 device_id, + u32 spare) +{ + return -ENODEV; +} diff --git a/drivers/firmware/qcom_scm-64.c b/drivers/firmware/qcom_scm-64.c index c9332590e8c6..550e3a34e260 100644 --- a/drivers/firmware/qcom_scm-64.c +++ b/drivers/firmware/qcom_scm-64.c @@ -381,3 +381,19 @@ int __qcom_scm_set_remote_state(struct device *dev, u32 state, u32 id) return ret ? : res.a1; } + +int __qcom_scm_restore_sec_cfg(struct device *dev, u32 device_id, u32 spare) +{ + struct qcom_scm_desc desc = {0}; + struct arm_smccc_res res; + int ret; + + desc.args[0] = device_id; + desc.args[1] = spare; + desc.arginfo = QCOM_SCM_ARGS(2); + + ret = qcom_scm_call(dev, QCOM_SCM_SVC_MP, QCOM_SCM_RESTORE_SEC_CFG, + &desc, &res); + + return ret ? 
: res.a1; +} diff --git a/drivers/firmware/qcom_scm.c b/drivers/firmware/qcom_scm.c index d987bcc7489d..ae1f4732e060 100644 --- a/drivers/firmware/qcom_scm.c +++ b/drivers/firmware/qcom_scm.c @@ -315,6 +315,12 @@ static const struct reset_control_ops qcom_scm_pas_reset_ops = { .deassert = qcom_scm_pas_reset_deassert, }; +int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare) +{ + return __qcom_scm_restore_sec_cfg(__scm->dev, device_id, spare); +} +EXPORT_SYMBOL(qcom_scm_restore_sec_cfg); + /** * qcom_scm_is_available() - Checks if SCM is available */ diff --git a/drivers/firmware/qcom_scm.h b/drivers/firmware/qcom_scm.h index 6a0f15469344..31fc732960ca 100644 --- a/drivers/firmware/qcom_scm.h +++ b/drivers/firmware/qcom_scm.h @@ -85,4 +85,9 @@ static inline int qcom_scm_remap_error(int err) return -EINVAL; } +#define QCOM_SCM_SVC_MP 0xc +#define QCOM_SCM_RESTORE_SEC_CFG 2 +extern int __qcom_scm_restore_sec_cfg(struct device *dev, u32 device_id, + u32 spare); + #endif diff --git a/include/linux/qcom_scm.h b/include/linux/qcom_scm.h index d32f6f1a5225..22017f5d17e0 100644 --- a/include/linux/qcom_scm.h +++ b/include/linux/qcom_scm.h @@ -40,6 +40,7 @@ extern int qcom_scm_pas_shutdown(u32 peripheral); extern void qcom_scm_cpu_power_down(u32 flags); extern u32 qcom_scm_get_version(void); extern int qcom_scm_set_remote_state(u32 state, u32 id); +extern int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare); #else static inline int qcom_scm_set_cold_boot_addr(void *entry, const cpumask_t *cpus) @@ -67,5 +68,6 @@ static inline void qcom_scm_cpu_power_down(u32 flags) {} static inline u32 qcom_scm_get_version(void) { return 0; } static inline u32 qcom_scm_set_remote_state(u32 state,u32 id) { return -ENODEV; } +static inline int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare) { return -ENODEV; } #endif #endif -- cgit v1.2.3 From b182cc4d597a6e73ff04ee1b7fb4f1a28f56ae3d Mon Sep 17 00:00:00 2001 From: Stanimir Varbanov Date: Tue, 14 Mar 2017 11:18:04 -0400 Subject: firmware: qcom_scm: add two scm calls for iommu secure page table These two new SCM calls are needed by the qcom-iommu driver in order to initialize the secure IOMMU page table. Signed-off-by: Stanimir Varbanov Signed-off-by: Rob Clark Signed-off-by: Andy Gross --- drivers/firmware/qcom_scm-32.c | 12 ++++++++++++ drivers/firmware/qcom_scm-64.c | 42 ++++++++++++++++++++++++++++++++++++++++++ drivers/firmware/qcom_scm.c | 12 ++++++++++++ drivers/firmware/qcom_scm.h | 6 ++++++ include/linux/qcom_scm.h | 4 ++++ 5 files changed, 76 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/qcom_scm-32.c b/drivers/firmware/qcom_scm-32.c index 722e65af588d..93e3b96b6dfa 100644 --- a/drivers/firmware/qcom_scm-32.c +++ b/drivers/firmware/qcom_scm-32.c @@ -584,3 +584,15 @@ int __qcom_scm_restore_sec_cfg(struct device *dev, u32 device_id, { return -ENODEV; } + +int __qcom_scm_iommu_secure_ptbl_size(struct device *dev, u32 spare, + size_t *size) +{ + return -ENODEV; +} + +int __qcom_scm_iommu_secure_ptbl_init(struct device *dev, u64 addr, u32 size, + u32 spare) +{ + return -ENODEV; +} diff --git a/drivers/firmware/qcom_scm-64.c b/drivers/firmware/qcom_scm-64.c index 550e3a34e260..6e6d561708e2 100644 --- a/drivers/firmware/qcom_scm-64.c +++ b/drivers/firmware/qcom_scm-64.c @@ -397,3 +397,45 @@ int __qcom_scm_restore_sec_cfg(struct device *dev, u32 device_id, u32 spare) return ret ?
: res.a1; } + +int __qcom_scm_iommu_secure_ptbl_size(struct device *dev, u32 spare, + size_t *size) +{ + struct qcom_scm_desc desc = {0}; + struct arm_smccc_res res; + int ret; + + desc.args[0] = spare; + desc.arginfo = QCOM_SCM_ARGS(1); + + ret = qcom_scm_call(dev, QCOM_SCM_SVC_MP, + QCOM_SCM_IOMMU_SECURE_PTBL_SIZE, &desc, &res); + + if (size) + *size = res.a1; + + return ret ? : res.a2; +} + +int __qcom_scm_iommu_secure_ptbl_init(struct device *dev, u64 addr, u32 size, + u32 spare) +{ + struct qcom_scm_desc desc = {0}; + struct arm_smccc_res res; + int ret; + + desc.args[0] = addr; + desc.args[1] = size; + desc.args[2] = spare; + desc.arginfo = QCOM_SCM_ARGS(3, QCOM_SCM_RW, QCOM_SCM_VAL, + QCOM_SCM_VAL); + + ret = qcom_scm_call(dev, QCOM_SCM_SVC_MP, + QCOM_SCM_IOMMU_SECURE_PTBL_INIT, &desc, &res); + + /* the pg table has been initialized already, ignore the error */ + if (ret == -EPERM) + ret = 0; + + return ret; +} diff --git a/drivers/firmware/qcom_scm.c b/drivers/firmware/qcom_scm.c index ae1f4732e060..bb16510d75ba 100644 --- a/drivers/firmware/qcom_scm.c +++ b/drivers/firmware/qcom_scm.c @@ -321,6 +321,18 @@ int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare) } EXPORT_SYMBOL(qcom_scm_restore_sec_cfg); +int qcom_scm_iommu_secure_ptbl_size(u32 spare, size_t *size) +{ + return __qcom_scm_iommu_secure_ptbl_size(__scm->dev, spare, size); +} +EXPORT_SYMBOL(qcom_scm_iommu_secure_ptbl_size); + +int qcom_scm_iommu_secure_ptbl_init(u64 addr, u32 size, u32 spare) +{ + return __qcom_scm_iommu_secure_ptbl_init(__scm->dev, addr, size, spare); +} +EXPORT_SYMBOL(qcom_scm_iommu_secure_ptbl_init); + /** * qcom_scm_is_available() - Checks if SCM is available */ diff --git a/drivers/firmware/qcom_scm.h b/drivers/firmware/qcom_scm.h index 31fc732960ca..9bea691f30fb 100644 --- a/drivers/firmware/qcom_scm.h +++ b/drivers/firmware/qcom_scm.h @@ -89,5 +89,11 @@ static inline int qcom_scm_remap_error(int err) #define QCOM_SCM_RESTORE_SEC_CFG 2 extern int __qcom_scm_restore_sec_cfg(struct device *dev, u32 device_id, u32 spare); +#define QCOM_SCM_IOMMU_SECURE_PTBL_SIZE 3 +#define QCOM_SCM_IOMMU_SECURE_PTBL_INIT 4 +extern int __qcom_scm_iommu_secure_ptbl_size(struct device *dev, u32 spare, + size_t *size); +extern int __qcom_scm_iommu_secure_ptbl_init(struct device *dev, u64 addr, + u32 size, u32 spare); #endif diff --git a/include/linux/qcom_scm.h b/include/linux/qcom_scm.h index 22017f5d17e0..e5380471c2cd 100644 --- a/include/linux/qcom_scm.h +++ b/include/linux/qcom_scm.h @@ -41,6 +41,8 @@ extern void qcom_scm_cpu_power_down(u32 flags); extern u32 qcom_scm_get_version(void); extern int qcom_scm_set_remote_state(u32 state, u32 id); extern int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare); +extern int qcom_scm_iommu_secure_ptbl_size(u32 spare, size_t *size); +extern int qcom_scm_iommu_secure_ptbl_init(u64 addr, u32 size, u32 spare); #else static inline int qcom_scm_set_cold_boot_addr(void *entry, const cpumask_t *cpus) @@ -69,5 +71,7 @@ static inline u32 qcom_scm_get_version(void) { return 0; } static inline u32 qcom_scm_set_remote_state(u32 state,u32 id) { return -ENODEV; } static inline int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare) { return -ENODEV; } +static inline int qcom_scm_iommu_secure_ptbl_size(u32 spare, size_t *size) { return -ENODEV; } +static inline int qcom_scm_iommu_secure_ptbl_init(u64 addr, u32 size, u32 spare) { return -ENODEV; } #endif #endif -- cgit v1.2.3 From 7fc6b87a9ff537e7df32b1278118ce9c5bcd6788 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Thu, 9 Mar 2017 
00:05:31 -0800 Subject: blkcg: allocate struct blkcg_gq outside request queue spinlock blkg_conf_prep() currently calls blkg_lookup_create() while holding request queue spinlock. This means allocating memory for struct blkcg_gq has to be made non-blocking. This causes occasional -ENOMEM failures in call paths like below: pcpu_alloc+0x68f/0x710 __alloc_percpu_gfp+0xd/0x10 __percpu_counter_init+0x55/0xc0 cfq_pd_alloc+0x3b2/0x4e0 blkg_alloc+0x187/0x230 blkg_create+0x489/0x670 blkg_lookup_create+0x9a/0x230 blkg_conf_prep+0x1fb/0x240 __cfqg_set_weight_device.isra.105+0x5c/0x180 cfq_set_weight_on_dfl+0x69/0xc0 cgroup_file_write+0x39/0x1c0 kernfs_fop_write+0x13f/0x1d0 __vfs_write+0x23/0x120 vfs_write+0xc2/0x1f0 SyS_write+0x44/0xb0 entry_SYSCALL_64_fastpath+0x18/0xad In the code path above, percpu allocator cannot call vmalloc() due to queue spinlock. A failure in this call path gives grief to tools which are trying to configure io weights. We see occasional failures happen shortly after reboots even when system is not under any memory pressure. Machines with a lot of cpus are more vulnerable to this condition. Update blkg_create() function to temporarily drop the rcu and queue locks when it is allowed by gfp mask. Suggested-by: Tejun Heo Signed-off-by: Tahsin Erdogan Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 138 ++++++++++++++++++++++++++++----------------- include/linux/blk-cgroup.h | 6 +- 2 files changed, 91 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index bbe7ee00bd3d..bdf87f0c1b1b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -165,16 +165,18 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); /* - * If @new_blkg is %NULL, this function tries to allocate a new one as - * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. + * If gfp mask allows blocking, this function temporarily drops rcu and queue + * locks to allocate memory. 
*/ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, - struct request_queue *q, - struct blkcg_gq *new_blkg) + struct request_queue *q, gfp_t gfp, + const struct blkcg_policy *pol) { - struct blkcg_gq *blkg; + struct blkcg_gq *blkg = NULL; struct bdi_writeback_congested *wb_congested; int i, ret; + const bool drop_locks = gfpflags_allow_blocking(gfp); + bool preloaded = false; WARN_ON_ONCE(!rcu_read_lock_held()); lockdep_assert_held(q->queue_lock); @@ -185,31 +187,53 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, goto err_free_blkg; } + if (drop_locks) { + spin_unlock_irq(q->queue_lock); + rcu_read_unlock(); + } + wb_congested = wb_congested_get_create(q->backing_dev_info, - blkcg->css.id, - GFP_NOWAIT | __GFP_NOWARN); - if (!wb_congested) { + blkcg->css.id, gfp); + blkg = blkg_alloc(blkcg, q, gfp); + + if (drop_locks) { + preloaded = !radix_tree_preload(gfp); + rcu_read_lock(); + spin_lock_irq(q->queue_lock); + } + + if (unlikely(!wb_congested || !blkg)) { ret = -ENOMEM; - goto err_put_css; + goto err_put; } - /* allocate */ - if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN); - if (unlikely(!new_blkg)) { - ret = -ENOMEM; - goto err_put_congested; + blkg->wb_congested = wb_congested; + + if (pol) { + WARN_ON(!drop_locks); + + if (!blkcg_policy_enabled(q, pol)) { + ret = -EOPNOTSUPP; + goto err_put; + } + + /* + * This could be the first entry point of blkcg implementation + * and we shouldn't allow anything to go through for a bypassing + * queue. + */ + if (unlikely(blk_queue_bypass(q))) { + ret = blk_queue_dying(q) ? -ENODEV : -EBUSY; + goto err_put; } } - blkg = new_blkg; - blkg->wb_congested = wb_congested; /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); if (WARN_ON_ONCE(!blkg->parent)) { ret = -ENODEV; - goto err_put_congested; + goto err_put; } blkg_get(blkg->parent); } @@ -236,6 +260,9 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, pol->pd_online_fn(blkg->pd[i]); } } + + if (preloaded) + radix_tree_preload_end(); blkg->online = true; spin_unlock(&blkcg->lock); @@ -246,44 +273,45 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_put(blkg); return ERR_PTR(ret); -err_put_congested: - wb_congested_put(wb_congested); -err_put_css: +err_put: + if (preloaded) + radix_tree_preload_end(); + if (wb_congested) + wb_congested_put(wb_congested); css_put(&blkcg->css); err_free_blkg: - blkg_free(new_blkg); + blkg_free(blkg); return ERR_PTR(ret); } /** - * blkg_lookup_create - lookup blkg, try to create one if not there + * __blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest * @q: request_queue of interest + * @gfp: gfp mask + * @pol: blkcg policy (optional) * * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to * create one. blkg creation is performed recursively from blkcg_root such * that all non-root blkg's have access to the parent blkg. This function * should be called under RCU read lock and @q->queue_lock. * + * When gfp mask allows blocking, rcu and queue locks may be dropped for + * allocating memory. In this case, the locks will be reacquired on return. + * * Returns pointer to the looked up or created blkg on success, ERR_PTR() * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not * dead and bypassing, returns ERR_PTR(-EBUSY). 
*/ -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) +struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q, gfp_t gfp, + const struct blkcg_policy *pol) { struct blkcg_gq *blkg; WARN_ON_ONCE(!rcu_read_lock_held()); lockdep_assert_held(q->queue_lock); - /* - * This could be the first entry point of blkcg implementation and - * we shouldn't allow anything to go through for a bypassing queue. - */ - if (unlikely(blk_queue_bypass(q))) - return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); - blkg = __blkg_lookup(blkcg, q, true); if (blkg) return blkg; @@ -301,12 +329,35 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, parent = blkcg_parent(parent); } - blkg = blkg_create(pos, q, NULL); + blkg = blkg_create(pos, q, gfp, pol); if (pos == blkcg || IS_ERR(blkg)) return blkg; } } +/** + * blkg_lookup_create - lookup blkg, try to create one if not there + * + * Performs an initial queue bypass check and then passes control to + * __blkg_lookup_create(). + */ +struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q, gfp_t gfp, + const struct blkcg_policy *pol) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + lockdep_assert_held(q->queue_lock); + + /* + * This could be the first entry point of blkcg implementation and + * we shouldn't allow anything to go through for a bypassing queue. + */ + if (unlikely(blk_queue_bypass(q))) + return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); + + return __blkg_lookup_create(blkcg, q, gfp, pol); +} + static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; @@ -817,7 +868,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, spin_lock_irq(disk->queue->queue_lock); if (blkcg_policy_enabled(disk->queue, pol)) - blkg = blkg_lookup_create(blkcg, disk->queue); + blkg = blkg_lookup_create(blkcg, disk->queue, GFP_KERNEL, pol); else blkg = ERR_PTR(-EOPNOTSUPP); @@ -1056,30 +1107,15 @@ free_blkcg: */ int blkcg_init_queue(struct request_queue *q) { - struct blkcg_gq *new_blkg, *blkg; - bool preloaded; + struct blkcg_gq *blkg; int ret; - new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); - if (!new_blkg) - return -ENOMEM; - - preloaded = !radix_tree_preload(GFP_KERNEL); - - /* - * Make sure the root blkg exists and count the existing blkgs. As - * @q is bypassing at this point, blkg_lookup_create() can't be - * used. Open code insertion. 
- */ rcu_read_lock(); spin_lock_irq(q->queue_lock); - blkg = blkg_create(&blkcg_root, q, new_blkg); + blkg = __blkg_lookup_create(&blkcg_root, q, GFP_KERNEL, NULL); spin_unlock_irq(q->queue_lock); rcu_read_unlock(); - if (preloaded) - radix_tree_preload_end(); - if (IS_ERR(blkg)) return PTR_ERR(blkg); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 01b62e7bac74..955903a8f6cb 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -172,7 +172,8 @@ extern struct cgroup_subsys_state * const blkcg_root_css; struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, struct request_queue *q, bool update_hint); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q); + struct request_queue *q, gfp_t gfp, + const struct blkcg_policy *pol); int blkcg_init_queue(struct request_queue *q); void blkcg_drain_queue(struct request_queue *q); void blkcg_exit_queue(struct request_queue *q); @@ -694,7 +695,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, blkg = blkg_lookup(blkcg, q); if (unlikely(!blkg)) { spin_lock_irq(q->queue_lock); - blkg = blkg_lookup_create(blkcg, q); + blkg = blkg_lookup_create(blkcg, q, GFP_NOWAIT | __GFP_NOWARN, + NULL); if (IS_ERR(blkg)) blkg = NULL; spin_unlock_irq(q->queue_lock); -- cgit v1.2.3 From dfa672fbc0d9e83ff0dc1a75f1f5d0e59a30706b Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 28 Mar 2017 10:52:16 +0300 Subject: ACPI / property: Add possibility to retrieve parent firmware node Sometimes it is useful to be able to navigate the firmware node hierarchy upwards toward parent nodes. ACPI device nodes are pretty much already supported because ACPICA provides acpi_get_parent(). ACPI data nodes, however, are all below the same parent ACPI device. Their hierarchy is created by "linking" each other using references in the value field. Add a parent pointer to each data node while we create it, so it is easy to navigate the hierarchy backwards. We use this parent pointer in a new function acpi_node_get_parent() that is able to extract the parent of both ACPI firmware node types. Signed-off-by: Mika Westerberg Signed-off-by: Rafael J.
Wysocki --- drivers/acpi/property.c | 72 ++++++++++++++++++++++++++++++++++++++----------- include/acpi/acpi_bus.h | 1 + include/linux/acpi.h | 7 +++++ 3 files changed, 65 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 3afddcd834ef..587c9d000f0e 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -37,14 +37,16 @@ static const u8 ads_uuid[16] = { static bool acpi_enumerate_nondev_subnodes(acpi_handle scope, const union acpi_object *desc, - struct acpi_device_data *data); + struct acpi_device_data *data, + struct fwnode_handle *parent); static bool acpi_extract_properties(const union acpi_object *desc, struct acpi_device_data *data); static bool acpi_nondev_subnode_extract(const union acpi_object *desc, acpi_handle handle, const union acpi_object *link, - struct list_head *list) + struct list_head *list, + struct fwnode_handle *parent) { struct acpi_data_node *dn; bool result; @@ -55,6 +57,7 @@ static bool acpi_nondev_subnode_extract(const union acpi_object *desc, dn->name = link->package.elements[0].string.pointer; dn->fwnode.type = FWNODE_ACPI_DATA; + dn->parent = parent; INIT_LIST_HEAD(&dn->data.subnodes); result = acpi_extract_properties(desc, &dn->data); @@ -71,9 +74,11 @@ static bool acpi_nondev_subnode_extract(const union acpi_object *desc, */ status = acpi_get_parent(handle, &scope); if (ACPI_SUCCESS(status) - && acpi_enumerate_nondev_subnodes(scope, desc, &dn->data)) + && acpi_enumerate_nondev_subnodes(scope, desc, &dn->data, + &dn->fwnode)) result = true; - } else if (acpi_enumerate_nondev_subnodes(NULL, desc, &dn->data)) { + } else if (acpi_enumerate_nondev_subnodes(NULL, desc, &dn->data, + &dn->fwnode)) { result = true; } @@ -91,7 +96,8 @@ static bool acpi_nondev_subnode_extract(const union acpi_object *desc, static bool acpi_nondev_subnode_data_ok(acpi_handle handle, const union acpi_object *link, - struct list_head *list) + struct list_head *list, + struct fwnode_handle *parent) { struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER }; acpi_status status; @@ -101,7 +107,8 @@ static bool acpi_nondev_subnode_data_ok(acpi_handle handle, if (ACPI_FAILURE(status)) return false; - if (acpi_nondev_subnode_extract(buf.pointer, handle, link, list)) + if (acpi_nondev_subnode_extract(buf.pointer, handle, link, list, + parent)) return true; ACPI_FREE(buf.pointer); @@ -110,7 +117,8 @@ static bool acpi_nondev_subnode_data_ok(acpi_handle handle, static bool acpi_nondev_subnode_ok(acpi_handle scope, const union acpi_object *link, - struct list_head *list) + struct list_head *list, + struct fwnode_handle *parent) { acpi_handle handle; acpi_status status; @@ -123,12 +131,13 @@ static bool acpi_nondev_subnode_ok(acpi_handle scope, if (ACPI_FAILURE(status)) return false; - return acpi_nondev_subnode_data_ok(handle, link, list); + return acpi_nondev_subnode_data_ok(handle, link, list, parent); } static int acpi_add_nondev_subnodes(acpi_handle scope, const union acpi_object *links, - struct list_head *list) + struct list_head *list, + struct fwnode_handle *parent) { bool ret = false; int i; @@ -150,15 +159,18 @@ static int acpi_add_nondev_subnodes(acpi_handle scope, /* The second one may be a string, a reference or a package. 
*/ switch (link->package.elements[1].type) { case ACPI_TYPE_STRING: - result = acpi_nondev_subnode_ok(scope, link, list); + result = acpi_nondev_subnode_ok(scope, link, list, + parent); break; case ACPI_TYPE_LOCAL_REFERENCE: handle = link->package.elements[1].reference.handle; - result = acpi_nondev_subnode_data_ok(handle, link, list); + result = acpi_nondev_subnode_data_ok(handle, link, list, + parent); break; case ACPI_TYPE_PACKAGE: desc = &link->package.elements[1]; - result = acpi_nondev_subnode_extract(desc, NULL, link, list); + result = acpi_nondev_subnode_extract(desc, NULL, link, + list, parent); break; default: result = false; @@ -172,7 +184,8 @@ static int acpi_add_nondev_subnodes(acpi_handle scope, static bool acpi_enumerate_nondev_subnodes(acpi_handle scope, const union acpi_object *desc, - struct acpi_device_data *data) + struct acpi_device_data *data, + struct fwnode_handle *parent) { int i; @@ -194,7 +207,8 @@ static bool acpi_enumerate_nondev_subnodes(acpi_handle scope, if (memcmp(uuid->buffer.pointer, ads_uuid, sizeof(ads_uuid))) continue; - return acpi_add_nondev_subnodes(scope, links, &data->subnodes); + return acpi_add_nondev_subnodes(scope, links, &data->subnodes, + parent); } return false; @@ -345,7 +359,8 @@ void acpi_init_properties(struct acpi_device *adev) if (acpi_of) acpi_init_of_compatible(adev); } - if (acpi_enumerate_nondev_subnodes(adev->handle, buf.pointer, &adev->data)) + if (acpi_enumerate_nondev_subnodes(adev->handle, buf.pointer, + &adev->data, acpi_fwnode_handle(adev))) adev->data.pointer = buf.pointer; if (!adev->data.pointer) { @@ -920,3 +935,30 @@ struct fwnode_handle *acpi_get_next_subnode(struct device *dev, } return NULL; } + +/** + * acpi_node_get_parent - Return parent fwnode of this fwnode + * @fwnode: Firmware node whose parent to get + * + * Returns parent node of an ACPI device or data firmware node or %NULL if + * not available. 
+ */ +struct fwnode_handle *acpi_node_get_parent(struct fwnode_handle *fwnode) +{ + if (is_acpi_data_node(fwnode)) { + /* All data nodes have parent pointer so just return that */ + return to_acpi_data_node(fwnode)->parent; + } else if (is_acpi_device_node(fwnode)) { + acpi_handle handle, parent_handle; + + handle = to_acpi_device_node(fwnode)->handle; + if (ACPI_SUCCESS(acpi_get_parent(handle, &parent_handle))) { + struct acpi_device *adev; + + if (!acpi_bus_get_device(parent_handle, &adev)) + return acpi_fwnode_handle(adev); + } + } + + return NULL; +} diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index ef0ae8aaa567..49cca52b214b 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -386,6 +386,7 @@ struct acpi_data_node { const char *name; acpi_handle handle; struct fwnode_handle fwnode; + struct fwnode_handle *parent; struct acpi_device_data data; struct list_head sibling; struct kobject kobj; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 9b05886f9773..e74e8bdbb6af 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -999,6 +999,7 @@ int acpi_dev_prop_read(struct acpi_device *adev, const char *propname, struct fwnode_handle *acpi_get_next_subnode(struct device *dev, struct fwnode_handle *subnode); +struct fwnode_handle *acpi_node_get_parent(struct fwnode_handle *fwnode); struct acpi_probe_entry; typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *, struct acpi_probe_entry *); @@ -1121,6 +1122,12 @@ static inline struct fwnode_handle *acpi_get_next_subnode(struct device *dev, struct fwnode_handle *subnode) { return NULL; } +static inline struct fwnode_handle * +acpi_node_get_parent(struct fwnode_handle *fwnode) +{ + return NULL; +} + #define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, valid, data, fn) \ static const void * __acpi_table_##name[] \ __attribute__((unused)) \ -- cgit v1.2.3 From afaf26fd8458be29949ae5a52c65a464a1b0cbb6 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 28 Mar 2017 10:52:17 +0300 Subject: device property: Add fwnode_get_parent() Now that ACPI has support for returning the parent firmware node for both types of nodes, we can expose this to others as well. This adds a new function fwnode_get_parent() that can be used for DT and ACPI nodes to retrieve the parent firmware node. Signed-off-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/base/property.c | 25 +++++++++++++++++++++++++ include/linux/property.h | 2 ++ 2 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/property.c b/drivers/base/property.c index c458c63e353f..a25233430d18 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -931,6 +931,31 @@ int device_add_properties(struct device *dev, } EXPORT_SYMBOL_GPL(device_add_properties); +/** + * fwnode_get_parent - Return parent firmware node + * @fwnode: Firmware node whose parent is retrieved + * + * Return parent firmware node of the given node if possible or %NULL if no + * parent was available. + */ +struct fwnode_handle *fwnode_get_parent(struct fwnode_handle *fwnode) +{ + struct fwnode_handle *parent = NULL; + + if (is_of_node(fwnode)) { + struct device_node *node; + + node = of_get_parent(to_of_node(fwnode)); + if (node) + parent = &node->fwnode; + } else if (is_acpi_node(fwnode)) { + parent = acpi_node_get_parent(fwnode); + } + + return parent; +} +EXPORT_SYMBOL_GPL(fwnode_get_parent); + /** * device_get_next_child_node - Return the next child node handle for a device * @dev: Device to find the next child node for.
diff --git a/include/linux/property.h b/include/linux/property.h index 64e3a9c6d95f..ab0a8160cef6 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -70,6 +70,8 @@ int fwnode_property_read_string(struct fwnode_handle *fwnode, int fwnode_property_match_string(struct fwnode_handle *fwnode, const char *propname, const char *string); +struct fwnode_handle *fwnode_get_parent(struct fwnode_handle *fwnode); + struct fwnode_handle *device_get_next_child_node(struct device *dev, struct fwnode_handle *child); -- cgit v1.2.3 From 34055190b19d7c634caf738c8ca195cad06550cd Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 28 Mar 2017 10:52:18 +0300 Subject: ACPI / property: Add fwnode_get_next_child_node() The ACPI _DSD hierarchical data extension makes it possible to have hierarchies deeper than one level, in a similar way to what DT allows. These "subsubnodes" have not been accessible because the device property implementation only provides device_get_next_child_node(), which is limited to direct descendants of a device. We need this ability in order to support things like remote endpoints, currently supported in DT with the of_graph_* APIs. Modify acpi_get_next_subnode() to accept a fwnode handle instead and update callers accordingly. Also add a new function fwnode_get_next_child_node() that works directly with fwnodes and modify device_get_next_child_node() to call it directly. While there, add a macro fwnode_for_each_child_node() analogous to the current device_for_each_child_node() but working with fwnodes instead of devices. Link: http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.pdf Signed-off-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/acpi/property.c | 27 +++++++++++++++++---------- drivers/base/property.c | 38 ++++++++++++++++++++++++++++++-------- include/linux/acpi.h | 8 ++++---- include/linux/property.h | 6 ++++++ 4 files changed, 57 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 587c9d000f0e..8730ce745b43 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -880,21 +880,22 @@ int acpi_node_prop_read(struct fwnode_handle *fwnode, const char *propname, } /** - * acpi_get_next_subnode - Return the next child node handle for a device. - * @dev: Device to find the next child node for. + * acpi_get_next_subnode - Return the next child node handle for a fwnode + * @fwnode: Firmware node to find the next child node for. * @child: Handle to one of the device's child nodes or a null handle.
*/ -struct fwnode_handle *acpi_get_next_subnode(struct device *dev, +struct fwnode_handle *acpi_get_next_subnode(struct fwnode_handle *fwnode, struct fwnode_handle *child) { - struct acpi_device *adev = ACPI_COMPANION(dev); + struct acpi_device *adev = to_acpi_device_node(fwnode); struct list_head *head, *next; - if (!adev) - return NULL; - if (!child || child->type == FWNODE_ACPI) { - head = &adev->children; + if (adev) + head = &adev->children; + else + goto nondev; + if (list_empty(head)) goto nondev; @@ -903,7 +904,6 @@ struct fwnode_handle *acpi_get_next_subnode(struct device *dev, next = adev->node.next; if (next == head) { child = NULL; - adev = ACPI_COMPANION(dev); goto nondev; } adev = list_entry(next, struct acpi_device, node); @@ -915,9 +915,16 @@ struct fwnode_handle *acpi_get_next_subnode(struct device *dev, nondev: if (!child || child->type == FWNODE_ACPI_DATA) { + struct acpi_data_node *data = to_acpi_data_node(fwnode); struct acpi_data_node *dn; - head = &adev->data.subnodes; + if (adev) + head = &adev->data.subnodes; + else if (data) + head = &data->data.subnodes; + else + return NULL; + if (list_empty(head)) return NULL; diff --git a/drivers/base/property.c b/drivers/base/property.c index a25233430d18..bc0f07ac48f6 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -957,24 +957,46 @@ struct fwnode_handle *fwnode_get_parent(struct fwnode_handle *fwnode) EXPORT_SYMBOL_GPL(fwnode_get_parent); /** - * device_get_next_child_node - Return the next child node handle for a device - * @dev: Device to find the next child node for. - * @child: Handle to one of the device's child nodes or a null handle. + * fwnode_get_next_child_node - Return the next child node handle for a node + * @fwnode: Firmware node to find the next child node for. + * @child: Handle to one of the node's child nodes or a %NULL handle. */ -struct fwnode_handle *device_get_next_child_node(struct device *dev, +struct fwnode_handle *fwnode_get_next_child_node(struct fwnode_handle *fwnode, struct fwnode_handle *child) { - if (IS_ENABLED(CONFIG_OF) && dev->of_node) { + if (is_of_node(fwnode)) { struct device_node *node; - node = of_get_next_available_child(dev->of_node, to_of_node(child)); + node = of_get_next_available_child(to_of_node(fwnode), + to_of_node(child)); if (node) return &node->fwnode; - } else if (IS_ENABLED(CONFIG_ACPI)) { - return acpi_get_next_subnode(dev, child); + } else if (is_acpi_node(fwnode)) { + return acpi_get_next_subnode(fwnode, child); } + return NULL; } +EXPORT_SYMBOL_GPL(fwnode_get_next_child_node); + +/** + * device_get_next_child_node - Return the next child node handle for a device + * @dev: Device to find the next child node for. + * @child: Handle to one of the device's child nodes or a null handle. 
+ */ +struct fwnode_handle *device_get_next_child_node(struct device *dev, + struct fwnode_handle *child) +{ + struct acpi_device *adev = ACPI_COMPANION(dev); + struct fwnode_handle *fwnode = NULL; + + if (dev->of_node) + fwnode = &dev->of_node->fwnode; + else if (adev) + fwnode = acpi_fwnode_handle(adev); + + return fwnode_get_next_child_node(fwnode, child); +} EXPORT_SYMBOL_GPL(device_get_next_child_node); /** diff --git a/include/linux/acpi.h b/include/linux/acpi.h index e74e8bdbb6af..4eb1f5941ede 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -997,8 +997,8 @@ int acpi_node_prop_read(struct fwnode_handle *fwnode, const char *propname, int acpi_dev_prop_read(struct acpi_device *adev, const char *propname, enum dev_prop_type proptype, void *val, size_t nval); -struct fwnode_handle *acpi_get_next_subnode(struct device *dev, - struct fwnode_handle *subnode); +struct fwnode_handle *acpi_get_next_subnode(struct fwnode_handle *fwnode, + struct fwnode_handle *child); struct fwnode_handle *acpi_node_get_parent(struct fwnode_handle *fwnode); struct acpi_probe_entry; @@ -1116,8 +1116,8 @@ static inline int acpi_dev_prop_read(struct acpi_device *adev, return -ENXIO; } -static inline struct fwnode_handle *acpi_get_next_subnode(struct device *dev, - struct fwnode_handle *subnode) +static inline struct fwnode_handle * +acpi_get_next_subnode(struct fwnode_handle *fwnode, struct fwnode_handle *child) { return NULL; } diff --git a/include/linux/property.h b/include/linux/property.h index ab0a8160cef6..f4786a8655f1 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -71,6 +71,12 @@ int fwnode_property_match_string(struct fwnode_handle *fwnode, const char *propname, const char *string); struct fwnode_handle *fwnode_get_parent(struct fwnode_handle *fwnode); +struct fwnode_handle *fwnode_get_next_child_node(struct fwnode_handle *fwnode, + struct fwnode_handle *child); + +#define fwnode_for_each_child_node(fwnode, child) \ + for (child = fwnode_get_next_child_node(fwnode, NULL); child; \ + child = fwnode_get_next_child_node(fwnode, child)) struct fwnode_handle *device_get_next_child_node(struct device *dev, struct fwnode_handle *child); -- cgit v1.2.3 From 21ea73f54c6d77f35381c79870160496c9e78b60 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 28 Mar 2017 10:52:19 +0300 Subject: device property: Add fwnode_get_named_child_node() Now that we have the means to enumerate all children of any fwnode, even in ACPI, we can implement fwnode_get_named_child_node(). This is similar to device_get_named_child_node() with the exception that it can be called on any fwnode handle. Make device_get_named_child_node() call this new function directly. This is useful in cases where we need to be able to find child nodes which are not direct descendants of the parent device. Signed-off-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/base/property.c | 22 +++++++++++++++++----- include/linux/property.h | 2 ++ 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/property.c b/drivers/base/property.c index bc0f07ac48f6..538b248d5dcf 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -1000,20 +1000,20 @@ struct fwnode_handle *device_get_next_child_node(struct device *dev, EXPORT_SYMBOL_GPL(device_get_next_child_node); /** - * device_get_named_child_node - Return first matching named child node handle - * @dev: Device to find the named child node for.
+ * fwnode_get_named_child_node - Return first matching named child node handle + * @fwnode: Firmware node to find the named child node for. * @childname: String to match child node name against. */ -struct fwnode_handle *device_get_named_child_node(struct device *dev, +struct fwnode_handle *fwnode_get_named_child_node(struct fwnode_handle *fwnode, const char *childname) { struct fwnode_handle *child; /* - * Find first matching named child node of this device. + * Find first matching named child node of this fwnode. * For ACPI this will be a data only sub-node. */ - device_for_each_child_node(dev, child) { + fwnode_for_each_child_node(fwnode, child) { if (is_of_node(child)) { if (!of_node_cmp(to_of_node(child)->name, childname)) return child; @@ -1025,6 +1025,18 @@ struct fwnode_handle *device_get_named_child_node(struct device *dev, return NULL; } +EXPORT_SYMBOL_GPL(fwnode_get_named_child_node); + +/** + * device_get_named_child_node - Return first matching named child node handle + * @dev: Device to find the named child node for. + * @childname: String to match child node name against. + */ +struct fwnode_handle *device_get_named_child_node(struct device *dev, + const char *childname) +{ + return fwnode_get_named_child_node(dev_fwnode(dev), childname); +} EXPORT_SYMBOL_GPL(device_get_named_child_node); /** diff --git a/include/linux/property.h b/include/linux/property.h index f4786a8655f1..514b19559fbe 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -85,6 +85,8 @@ struct fwnode_handle *device_get_next_child_node(struct device *dev, for (child = device_get_next_child_node(dev, NULL); child; \ child = device_get_next_child_node(dev, child)) +struct fwnode_handle *fwnode_get_named_child_node(struct fwnode_handle *fwnode, + const char *childname); struct fwnode_handle *device_get_named_child_node(struct device *dev, const char *childname); -- cgit v1.2.3 From 79389a83bc3888a900191e3990cda5c76f2ca1ec Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 28 Mar 2017 10:52:20 +0300 Subject: ACPI / property: Add support for remote endpoints DT has had the concept of remote endpoints for some time already. It makes it possible to reference another firmware node through a property called remote-endpoint. This is already used by some subsystems, like v4l2, for parsing hardware properties related to cameras. This patch adds ACPI support for remote endpoints utilizing _DSD hierarchical data extensions. Signed-off-by: Mika Westerberg Signed-off-by: Sakari Ailus Signed-off-by: Rafael J. Wysocki --- drivers/acpi/property.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/acpi.h | 23 ++++++++ 2 files changed, 161 insertions(+) (limited to 'include/linux') diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 8730ce745b43..9eb7c13c5bb7 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -969,3 +969,141 @@ struct fwnode_handle *acpi_node_get_parent(struct fwnode_handle *fwnode) return NULL; } + +/** + * acpi_graph_get_next_endpoint - Get next endpoint ACPI firmware node + * @fwnode: Pointer to the parent firmware node + * @prev: Previous endpoint node or %NULL to get the first + * + * Looks up next endpoint ACPI firmware node below a given @fwnode. Returns + * %NULL if there is no next endpoint, ERR_PTR() in case of error. In case + * of success the next endpoint is returned.
+ */ +struct fwnode_handle *acpi_graph_get_next_endpoint(struct fwnode_handle *fwnode, + struct fwnode_handle *prev) +{ + struct fwnode_handle *port = NULL; + struct fwnode_handle *endpoint; + + if (!prev) { + do { + port = fwnode_get_next_child_node(fwnode, port); + /* Ports must have port property */ + if (fwnode_property_present(port, "port")) + break; + } while (port); + } else { + port = fwnode_get_parent(prev); + } + + if (!port) + return NULL; + + endpoint = fwnode_get_next_child_node(port, prev); + while (!endpoint) { + port = fwnode_get_next_child_node(fwnode, port); + if (!port) + break; + if (fwnode_property_present(port, "port")) + endpoint = fwnode_get_next_child_node(port, NULL); + } + + if (endpoint) { + /* Endpoints must have "endpoint" property */ + if (!fwnode_property_present(endpoint, "endpoint")) + return ERR_PTR(-EPROTO); + } + + return endpoint; +} + +/** + * acpi_graph_get_child_prop_value - Return a child with a given property value + * @fwnode: device fwnode + * @prop_name: The name of the property to look for + * @val: the desired property value + * + * Return the first child node that has the property @prop_name set to @val. + * Returns the child node on success, NULL otherwise. + */ +static struct fwnode_handle *acpi_graph_get_child_prop_value( + struct fwnode_handle *fwnode, const char *prop_name, unsigned int val) +{ + struct fwnode_handle *child; + + fwnode_for_each_child_node(fwnode, child) { + u32 nr; + + if (fwnode_property_read_u32(child, prop_name, &nr)) + continue; + + if (val == nr) + return child; + } + + return NULL; +} + + +/** + * acpi_graph_get_remote_endpoint - Parses and returns remote end of an endpoint + * @fwnode: Endpoint firmware node pointing to a remote device + * @parent: Firmware node of remote port parent is filled here if not %NULL + * @port: Firmware node of remote port is filled here if not %NULL + * @endpoint: Firmware node of remote endpoint is filled here if not %NULL + * + * The function parses the remote end of an ACPI firmware remote endpoint and + * fills in the fields requested by the caller. Returns %0 in case of success + * and negative errno otherwise. + */ +int acpi_graph_get_remote_endpoint(struct fwnode_handle *fwnode, + struct fwnode_handle **parent, + struct fwnode_handle **port, + struct fwnode_handle **endpoint) +{ + unsigned int port_nr, endpoint_nr; + struct acpi_reference_args args; + int ret; + + memset(&args, 0, sizeof(args)); + ret = acpi_node_get_property_reference(fwnode, "remote-endpoint", 0, + &args); + if (ret) + return ret; + + /* + * Always require two arguments with the reference: port and + * endpoint indices.
+ */ + if (args.nargs != 2) + return -EPROTO; + + fwnode = acpi_fwnode_handle(args.adev); + port_nr = args.args[0]; + endpoint_nr = args.args[1]; + + if (parent) + *parent = fwnode; + + if (!port && !endpoint) + return 0; + + fwnode = acpi_graph_get_child_prop_value(fwnode, "port", port_nr); + if (!fwnode) + return -EPROTO; + + if (port) + *port = fwnode; + + if (!endpoint) + return 0; + + fwnode = acpi_graph_get_child_prop_value(fwnode, "endpoint", + endpoint_nr); + if (!fwnode) + return -EPROTO; + + *endpoint = fwnode; + + return 0; +} diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4eb1f5941ede..add8a96e1977 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1001,6 +1001,13 @@ struct fwnode_handle *acpi_get_next_subnode(struct fwnode_handle *fwnode, struct fwnode_handle *child); struct fwnode_handle *acpi_node_get_parent(struct fwnode_handle *fwnode); +struct fwnode_handle *acpi_graph_get_next_endpoint(struct fwnode_handle *fwnode, + struct fwnode_handle *prev); +int acpi_graph_get_remote_endpoint(struct fwnode_handle *fwnode, + struct fwnode_handle **remote, + struct fwnode_handle **port, + struct fwnode_handle **endpoint); + struct acpi_probe_entry; typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *, struct acpi_probe_entry *); @@ -1128,6 +1135,22 @@ acpi_node_get_parent(struct fwnode_handle *fwnode) return NULL; } +static inline struct fwnode_handle * +acpi_graph_get_next_endpoint(struct fwnode_handle *fwnode, + struct fwnode_handle *prev) +{ + return ERR_PTR(-ENXIO); +} + +static inline int +acpi_graph_get_remote_endpoint(struct fwnode_handle *fwnode, + struct fwnode_handle **remote, + struct fwnode_handle **port, + struct fwnode_handle **endpoint) +{ + return -ENXIO; +} + #define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, valid, data, fn) \ static const void * __acpi_table_##name[] \ __attribute__((unused)) \ -- cgit v1.2.3 From 07bb80d40b0e6a43aafb422296d33baed255569a Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 28 Mar 2017 10:52:21 +0300 Subject: device property: Add support for remote endpoints This follows the DT implementation of the of_graph_* APIs, but we call them fwnode_graph_* instead. For DT nodes the existing of_graph_* implementation will be used. For ACPI we use the new ACPI graph implementation instead. Signed-off-by: Mika Westerberg Signed-off-by: Sakari Ailus Signed-off-by: Rafael J. Wysocki --- drivers/base/property.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/property.h | 9 ++++ 2 files changed, 132 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/property.c b/drivers/base/property.c index 538b248d5dcf..4e98a6fad33f 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -15,6 +15,7 @@ #include #include #include +#include <linux/of_graph.h> #include #include #include @@ -1176,3 +1177,125 @@ void *device_get_mac_address(struct device *dev, char *addr, int alen) return device_get_mac_addr(dev, "address", addr, alen); } EXPORT_SYMBOL(device_get_mac_address); + +/** + * fwnode_graph_get_next_endpoint - Get next endpoint firmware node + * @fwnode: Pointer to the parent firmware node + * @prev: Previous endpoint node or %NULL to get the first + * + * Returns an endpoint firmware node pointer or %NULL if no more endpoints + * are available.
+ */ +struct fwnode_handle * +fwnode_graph_get_next_endpoint(struct fwnode_handle *fwnode, + struct fwnode_handle *prev) +{ + struct fwnode_handle *endpoint = NULL; + + if (is_of_node(fwnode)) { + struct device_node *node; + + node = of_graph_get_next_endpoint(to_of_node(fwnode), + to_of_node(prev)); + + if (node) + endpoint = &node->fwnode; + } else if (is_acpi_node(fwnode)) { + endpoint = acpi_graph_get_next_endpoint(fwnode, prev); + if (IS_ERR(endpoint)) + endpoint = NULL; + } + + return endpoint; + +} +EXPORT_SYMBOL_GPL(fwnode_graph_get_next_endpoint); + +/** + * fwnode_graph_get_remote_port_parent - Return fwnode of a remote device + * @fwnode: Endpoint firmware node pointing to the remote endpoint + * + * Extracts firmware node of a remote device the @fwnode points to. + */ +struct fwnode_handle * +fwnode_graph_get_remote_port_parent(struct fwnode_handle *fwnode) +{ + struct fwnode_handle *parent = NULL; + + if (is_of_node(fwnode)) { + struct device_node *node; + + node = of_graph_get_remote_port_parent(to_of_node(fwnode)); + if (node) + parent = &node->fwnode; + } else if (is_acpi_node(fwnode)) { + int ret; + + ret = acpi_graph_get_remote_endpoint(fwnode, &parent, NULL, + NULL); + if (ret) + return NULL; + } + + return parent; +} +EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port_parent); + +/** + * fwnode_graph_get_remote_port - Return fwnode of a remote port + * @fwnode: Endpoint firmware node pointing to the remote endpoint + * + * Extracts firmware node of a remote port the @fwnode points to. + */ +struct fwnode_handle *fwnode_graph_get_remote_port(struct fwnode_handle *fwnode) +{ + struct fwnode_handle *port = NULL; + + if (is_of_node(fwnode)) { + struct device_node *node; + + node = of_graph_get_remote_port(to_of_node(fwnode)); + if (node) + port = &node->fwnode; + } else if (is_acpi_node(fwnode)) { + int ret; + + ret = acpi_graph_get_remote_endpoint(fwnode, NULL, &port, NULL); + if (ret) + return NULL; + } + + return port; +} +EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port); + +/** + * fwnode_graph_get_remote_endpoint - Return fwnode of a remote endpoint + * @fwnode: Endpoint firmware node pointing to the remote endpoint + * + * Extracts firmware node of a remote endpoint the @fwnode points to. 
+ */ +struct fwnode_handle * +fwnode_graph_get_remote_endpoint(struct fwnode_handle *fwnode) +{ + struct fwnode_handle *endpoint = NULL; + + if (is_of_node(fwnode)) { + struct device_node *node; + + node = of_parse_phandle(to_of_node(fwnode), "remote-endpoint", + 0); + if (node) + endpoint = &node->fwnode; + } else if (is_acpi_node(fwnode)) { + int ret; + + ret = acpi_graph_get_remote_endpoint(fwnode, NULL, NULL, + &endpoint); + if (ret) + return NULL; + } + + return endpoint; +} +EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_endpoint); diff --git a/include/linux/property.h b/include/linux/property.h index 514b19559fbe..8d7809c2c42d 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -268,4 +268,13 @@ int device_get_phy_mode(struct device *dev); void *device_get_mac_address(struct device *dev, char *addr, int alen); +struct fwnode_handle *fwnode_graph_get_next_endpoint( + struct fwnode_handle *fwnode, struct fwnode_handle *prev); +struct fwnode_handle *fwnode_graph_get_remote_port_parent( + struct fwnode_handle *fwnode); +struct fwnode_handle *fwnode_graph_get_remote_port( + struct fwnode_handle *fwnode); +struct fwnode_handle *fwnode_graph_get_remote_endpoint( + struct fwnode_handle *fwnode); + #endif /* _LINUX_PROPERTY_H_ */ -- cgit v1.2.3 From e7887c284969a23a98fe1aff2f631c5ccdcd1757 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 28 Mar 2017 10:52:22 +0300 Subject: device property: Add fwnode_handle_get() fwnode_handle_get() is used to obtain a reference to a fwnode_handle container. In this case this is the OF-specific struct device_node. This complements fwnode_handle_put() which is already implemented. Signed-off-by: Sakari Ailus Reviewed-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/base/property.c | 11 +++++++++++ include/linux/property.h | 1 + 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/property.c b/drivers/base/property.c index 4e98a6fad33f..23514bf67933 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -1040,6 +1040,17 @@ struct fwnode_handle *device_get_named_child_node(struct device *dev, } EXPORT_SYMBOL_GPL(device_get_named_child_node); +/** + * fwnode_handle_get - Obtain a reference to a device node + * @fwnode: Pointer to the device node to obtain the reference to. + */ +void fwnode_handle_get(struct fwnode_handle *fwnode) +{ + if (is_of_node(fwnode)) + of_node_get(to_of_node(fwnode)); +} +EXPORT_SYMBOL_GPL(fwnode_handle_get); + /** * fwnode_handle_put - Drop reference to a device node * @fwnode: Pointer to the device node to drop the reference to. diff --git a/include/linux/property.h b/include/linux/property.h index 8d7809c2c42d..0ae7d209f6c2 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -90,6 +90,7 @@ struct fwnode_handle *fwnode_get_named_child_node(struct fwnode_handle *fwnode, struct fwnode_handle *device_get_named_child_node(struct device *dev, const char *childname); +void fwnode_handle_get(struct fwnode_handle *fwnode); void fwnode_handle_put(struct fwnode_handle *fwnode); unsigned int device_get_child_node_count(struct device *dev); -- cgit v1.2.3 From 67831837e0b192fe0b8ee8b5e502d95ad2c497c0 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 28 Mar 2017 10:52:23 +0300 Subject: of: Add of_fwnode_handle() to convert device nodes to fwnode_handle of_fwnode_handle() returns the struct fwnode_handle of a struct device_node. This may be used with the fwnode property API.
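As a minimal sketch of the intended use (a hypothetical caller, not part of this patch; the property name and helper are made up, and dev->of_node is assumed valid):

	struct fwnode_handle *fwnode = of_fwnode_handle(dev->of_node);

	/* Firmware-agnostic from here on: the same calls work for ACPI. */
	if (fwnode_property_present(fwnode, "example-enable"))
		example_setup();
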
Use a macro instead of a function in order to support const and non-const arguments. Signed-off-by: Sakari Ailus Reviewed-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- include/linux/of.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 21e6323de0f3..e5d4225fda35 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -159,6 +159,8 @@ static inline struct device_node *to_of_node(struct fwnode_handle *fwnode) container_of(fwnode, struct device_node, fwnode) : NULL; } +#define of_fwnode_handle(node) (&(node)->fwnode) + static inline bool of_have_populated_dt(void) { return of_root != NULL; @@ -602,6 +604,8 @@ static inline struct device_node *of_find_node_with_property( return NULL; } +#define of_fwnode_handle(node) NULL + static inline bool of_have_populated_dt(void) { return false; -- cgit v1.2.3 From e44bb0cbdc88686c21e2175a990b40bf6db5d005 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 28 Mar 2017 10:52:24 +0300 Subject: device property: Make dev_fwnode() public The function to obtain a fwnode related to a struct device is useful for drivers that use the fwnode property API: it allows drivers to remain unaware of the underlying firmware implementation. Signed-off-by: Sakari Ailus Reviewed-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/base/property.c | 3 ++- include/linux/property.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/base/property.c b/drivers/base/property.c index 23514bf67933..22849a89a8b5 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -183,11 +183,12 @@ static int pset_prop_read_string(struct property_set *pset, return 0; } -static inline struct fwnode_handle *dev_fwnode(struct device *dev) +struct fwnode_handle *dev_fwnode(struct device *dev) { return IS_ENABLED(CONFIG_OF) && dev->of_node ? &dev->of_node->fwnode : dev->fwnode; } +EXPORT_SYMBOL_GPL(dev_fwnode); /** * device_property_present - check if a property of a device is present diff --git a/include/linux/property.h b/include/linux/property.h index 0ae7d209f6c2..6e20a12a2eec 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -33,6 +33,8 @@ enum dev_dma_attr { DEV_DMA_COHERENT, }; +struct fwnode_handle *dev_fwnode(struct device *dev); + bool device_property_present(struct device *dev, const char *propname); int device_property_read_u8_array(struct device *dev, const char *propname, u8 *val, size_t nval); -- cgit v1.2.3 From 2bd5452d46df46d99b869b59a1532647e2981d75 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 28 Mar 2017 10:52:25 +0300 Subject: device property: Add support for fwnode endpoints Similar to OF endpoints, endpoint type nodes can also be supported on ACPI. In order to make it possible for drivers to ignore the matter, add a type for fwnode_endpoint and a function to parse them. On ACPI, find the child node index instead of relying on the "endpoint" property. Signed-off-by: Sakari Ailus Reviewed-by: Mika Westerberg Signed-off-by: Rafael J.
Wysocki --- drivers/base/property.c | 32 ++++++++++++++++++++++++++++++++ include/linux/fwnode.h | 12 ++++++++++++ include/linux/property.h | 3 +++ 3 files changed, 47 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/property.c b/drivers/base/property.c index 22849a89a8b5..8a53b8f1db66 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -1311,3 +1311,35 @@ fwnode_graph_get_remote_endpoint(struct fwnode_handle *fwnode) return endpoint; } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_endpoint); + +/** + * fwnode_graph_parse_endpoint - parse common endpoint node properties + * @fwnode: pointer to endpoint fwnode_handle + * @endpoint: pointer to the fwnode endpoint data structure + * + * Parse @fwnode representing a graph endpoint node and store the + * information in @endpoint. The caller must hold a reference to + * @fwnode. + */ +int fwnode_graph_parse_endpoint(struct fwnode_handle *fwnode, + struct fwnode_endpoint *endpoint) +{ + struct fwnode_handle *port_fwnode = fwnode_get_parent(fwnode); + + memset(endpoint, 0, sizeof(*endpoint)); + + endpoint->local_fwnode = fwnode; + + if (is_acpi_node(port_fwnode)) { + fwnode_property_read_u32(port_fwnode, "port", &endpoint->port); + fwnode_property_read_u32(fwnode, "endpoint", &endpoint->id); + } else { + fwnode_property_read_u32(port_fwnode, "reg", &endpoint->port); + fwnode_property_read_u32(fwnode, "reg", &endpoint->id); + } + + fwnode_handle_put(port_fwnode); + + return 0; +} +EXPORT_SYMBOL(fwnode_graph_parse_endpoint); diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 8bd28ce6d76e..3dff2398a5f0 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -27,4 +27,16 @@ struct fwnode_handle { struct fwnode_handle *secondary; }; +/** + * struct fwnode_endpoint - Fwnode graph endpoint + * @port: Port number + * @id: Endpoint id + * @local_fwnode: reference to the related fwnode + */ +struct fwnode_endpoint { + unsigned int port; + unsigned int id; + const struct fwnode_handle *local_fwnode; +}; + #endif diff --git a/include/linux/property.h b/include/linux/property.h index 6e20a12a2eec..3a4e43599e01 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -280,4 +280,7 @@ struct fwnode_handle *fwnode_graph_get_remote_port( struct fwnode_handle *fwnode_graph_get_remote_endpoint( struct fwnode_handle *fwnode); +int fwnode_graph_parse_endpoint(struct fwnode_handle *fwnode, + struct fwnode_endpoint *endpoint); + #endif /* _LINUX_PROPERTY_H_ */ -- cgit v1.2.3 From 233872585de1cf26c3c3da5859ffb3aba45bd486 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Tue, 28 Mar 2017 10:52:26 +0300 Subject: device property: Add fwnode_get_next_parent() In order to differentiate the functionality between dropping a reference to the node (or not) for the benefit of OF, introduce fwnode_get_next_parent(). Signed-off-by: Sakari Ailus Reviewed-by: Mika Westerberg Signed-off-by: Rafael J. 
Wysocki --- drivers/base/property.c | 21 +++++++++++++++++++++ include/linux/property.h | 1 + 2 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/property.c b/drivers/base/property.c index 8a53b8f1db66..627ebc9b570d 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -933,6 +933,27 @@ int device_add_properties(struct device *dev, } EXPORT_SYMBOL_GPL(device_add_properties); +/** + * fwnode_get_next_parent - Iterate to the node's parent + * @fwnode: Firmware node whose parent is retrieved + * + * This is like fwnode_get_parent() except that it drops the refcount + * on the passed node, making it suitable for iterating through a + * node's parents. + * + * Returns a node pointer with refcount incremented, use + * fwnode_handle_put() on it when done. + */ +struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode) +{ + struct fwnode_handle *parent = fwnode_get_parent(fwnode); + + fwnode_handle_put(fwnode); + + return parent; +} +EXPORT_SYMBOL_GPL(fwnode_get_next_parent); + /** * fwnode_get_parent - Return parent firmware node * @fwnode: Firmware node whose parent is retrieved diff --git a/include/linux/property.h b/include/linux/property.h index 3a4e43599e01..2f482616a2f2 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -73,6 +73,7 @@ int fwnode_property_match_string(struct fwnode_handle *fwnode, const char *propname, const char *string); struct fwnode_handle *fwnode_get_parent(struct fwnode_handle *fwnode); +struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_get_next_child_node(struct fwnode_handle *fwnode, struct fwnode_handle *child); -- cgit v1.2.3 From ffaa42e8a40b7f1041e36b022cd28b7c45e2b564 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Mon, 20 Mar 2017 11:19:21 +0100 Subject: PM / Domains: Enable users of genpd to specify always on PM domains The current way to implement an always on PM domain consists of returning -EBUSY from the ->power_off() callback. This is a bit different compared to using the always on genpd governor, which prevents the PM domain from being powered off via runtime suspend, but not via system suspend. The approach to return -EBUSY from the ->power_off() callback to support always on PM domains in genpd is suboptimal. That is because it requires genpd to follow the regular execution path of the power off sequence, which ends by invoking the ->power_off() callback. To enable genpd to abort the power off sequence early for always on PM domains, it needs static information about these configurations. Therefore let's add a new genpd configuration flag, GENPD_FLAG_ALWAYS_ON. Users of the new GENPD_FLAG_ALWAYS_ON flag are required by genpd to make sure the PM domain is powered on before calling pm_genpd_init(). Moreover, users don't need to implement the ->power_off() callback, as genpd doesn't ever invoke it. Signed-off-by: Ulf Hansson Reviewed-by: Viresh Kumar Reviewed-by: Geert Uytterhoeven Reviewed-by: Bartlomiej Zolnierkiewicz Signed-off-by: Rafael J.
Wysocki --- drivers/base/power/domain.c | 14 ++++++++++++-- include/linux/pm_domain.h | 1 + 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 792fbab3dfc4..c71a7ef08b05 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -123,6 +123,7 @@ static const struct genpd_lock_ops genpd_spin_ops = { #define genpd_status_on(genpd) (genpd->status == GPD_STATE_ACTIVE) #define genpd_is_irq_safe(genpd) (genpd->flags & GENPD_FLAG_IRQ_SAFE) +#define genpd_is_always_on(genpd) (genpd->flags & GENPD_FLAG_ALWAYS_ON) static inline bool irq_safe_dev_in_no_sleep_domain(struct device *dev, struct generic_pm_domain *genpd) @@ -300,7 +301,12 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool one_dev_on, if (!genpd_status_on(genpd) || genpd->prepared_count > 0) return 0; - if (atomic_read(&genpd->sd_count) > 0) + /* + * Abort power off for the PM domain in the following situations: + * (1) The domain is configured as always on. + * (2) When the domain has a subdomain being powered on. + */ + if (genpd_is_always_on(genpd) || atomic_read(&genpd->sd_count) > 0) return -EBUSY; list_for_each_entry(pdd, &genpd->dev_list, list_node) { @@ -752,7 +758,7 @@ static void genpd_sync_power_off(struct generic_pm_domain *genpd, bool use_lock, { struct gpd_link *link; - if (!genpd_status_on(genpd)) + if (!genpd_status_on(genpd) || genpd_is_always_on(genpd)) return; if (genpd->suspended_count != genpd->device_count @@ -1491,6 +1497,10 @@ int pm_genpd_init(struct generic_pm_domain *genpd, genpd->dev_ops.start = pm_clk_resume; } + /* Always-on domains must be powered on at initialization. */ + if (genpd_is_always_on(genpd) && !genpd_status_on(genpd)) + return -EINVAL; + /* Use only one "off" state if there were no states declared */ if (genpd->state_count == 0) { ret = genpd_set_default_power_state(genpd); diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 5339ed5bd6f9..9b6abe632587 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -20,6 +20,7 @@ /* Defines used for the flags field in the struct generic_pm_domain */ #define GENPD_FLAG_PM_CLK (1U << 0) /* PM domain uses PM clk */ #define GENPD_FLAG_IRQ_SAFE (1U << 1) /* PM domain operates in atomic */ +#define GENPD_FLAG_ALWAYS_ON (1U << 2) /* PM domain is always powered on */ enum gpd_status { GPD_STATE_ACTIVE = 0, /* PM domain is active */ -- cgit v1.2.3 From d597580d373774b1bdab84b3d26ff0b55162b916 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Mar 2017 21:56:06 -0400 Subject: generic ...copy_..._user primitives provide raw_copy_..._user() and select ARCH_HAS_RAW_COPY_USER to use those. 
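By way of illustration (this sketch is not part of the series, and __arch_memcpy_mayfault() is a hypothetical helper standing in for an architecture's exception-table-backed memcpy), a minimal conversion could look roughly like this in an arch's asm/uaccess.h:

static __always_inline unsigned long
raw_copy_from_user(void *to, const void __user *from, unsigned long n)
{
	/* hypothetical arch helper: copies n bytes and returns how many
	 * bytes could not be copied; 0 means complete success */
	return __arch_memcpy_mayfault(to, (__force const void *)from, n);
}

static __always_inline unsigned long
raw_copy_to_user(void __user *to, const void *from, unsigned long n)
{
	/* no zero-padding and no KASAN/object-size checks at this level;
	 * the generic wrappers in linux/uaccess.h supply those */
	return __arch_memcpy_mayfault((__force void *)to, from, n);
}

The architecture then selects ARCH_HAS_RAW_COPY_USER in its Kconfig and deletes its private copy_{to,from}_user() implementations, leaving access_ok() checking and zero-padding to the generic layer.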
Signed-off-by: Al Viro --- arch/Kconfig | 3 + include/asm-generic/uaccess.h | 11 +++ include/linux/uaccess.h | 187 ++++++++++++++++++++++++++++++++++++++++++ lib/Makefile | 2 + lib/usercopy.c | 26 ++++++ 5 files changed, 229 insertions(+) create mode 100644 lib/usercopy.c (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index cd211a14a88f..315d37626ddc 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -847,4 +847,7 @@ config STRICT_MODULE_RWX config ARCH_WANT_RELAX_ORDER bool +config ARCH_HAS_RAW_COPY_USER + bool + source "kernel/gcov/Kconfig" diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h index bd7a05e9582b..d65c311eb128 100644 --- a/include/asm-generic/uaccess.h +++ b/include/asm-generic/uaccess.h @@ -86,7 +86,11 @@ static inline int __access_ok(unsigned long addr, unsigned long size) static inline int __put_user_fn(size_t size, void __user *ptr, void *x) { +#ifdef CONFIG_ARCH_HAS_RAW_COPY_USER + return unlikely(raw_copy_to_user(ptr, x, size)) ? -EFAULT : 0; +#else return unlikely(__copy_to_user(ptr, x, size)) ? -EFAULT : 0; +#endif } #define __put_user_fn(sz, u, k) __put_user_fn(sz, u, k) @@ -147,7 +151,11 @@ extern int __put_user_bad(void) __attribute__((noreturn)); #ifndef __get_user_fn static inline int __get_user_fn(size_t size, const void __user *ptr, void *x) { +#ifdef CONFIG_ARCH_HAS_RAW_COPY_USER + return unlikely(raw_copy_from_user(x, ptr, size)) ? -EFAULT : 0; +#else return unlikely(__copy_from_user(x, ptr, size)) ? -EFAULT : 0; +#endif } #define __get_user_fn(sz, u, k) __get_user_fn(sz, u, k) @@ -156,6 +164,8 @@ static inline int __get_user_fn(size_t size, const void __user *ptr, void *x) extern int __get_user_bad(void) __attribute__((noreturn)); +#ifndef CONFIG_ARCH_HAS_RAW_COPY_USER + #ifndef __copy_from_user_inatomic #define __copy_from_user_inatomic __copy_from_user #endif @@ -185,6 +195,7 @@ static inline long copy_to_user(void __user *to, else return n; } +#endif /* * Copy a null terminated string from userspace. diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 9c3ae8706e9d..5f76bc995d96 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -3,6 +3,7 @@ #include #include +#include #define VERIFY_READ 0 #define VERIFY_WRITE 1 @@ -11,6 +12,192 @@ #include +#ifdef CONFIG_ARCH_HAS_RAW_COPY_USER +/* + * Architectures should provide two primitives (raw_copy_{to,from}_user()), + * select ARCH_HAS_RAW_COPY_USER and get rid of their private instances + * of copy_{to,from}_user() and __copy_{to,from}_user{,_inatomic}(). Once + * all of them switch, this part of linux/uaccess.h will become unconditional. + * + * raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and + * return the amount left to copy. They should assume that access_ok() has + * already been checked (and succeeded); they should *not* zero-pad anything. + * No KASAN or object size checks either - those belong here. + * + * Both of these functions should attempt to copy size bytes starting at from + * into the area starting at to. They must not fetch or store anything + * outside of those areas. Return value must be between 0 (everything + * copied successfully) and size (nothing copied). + * + * If raw_copy_{to,from}_user(to, from, size) returns N, size - N bytes starting + * at to must become equal to the bytes fetched from the corresponding area + * starting at from. All data past to + size - N must be left unmodified. + * + * If copying succeeds, the return value must be 0.
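+ * For example: if raw_copy_from_user(to, from, 128) returns 32, the first + * 96 bytes starting at to must hold exactly the bytes fetched from userland, + * while the last 32 bytes of to must be left unmodified.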
If some data cannot be + * fetched, it is permitted to copy less than had been fetched; the only + * hard requirement is that not storing anything at all (i.e. returning size) + * should happen only when nothing could be copied. In other words, you don't + * have to squeeze as much as possible - it is allowed, but not necessary. + * + * For raw_copy_from_user(), to always points to kernel memory and no faults + * on store should happen. Interpretation of from is affected by set_fs(). + * For raw_copy_to_user() it's the other way round. + * + * Both can be inlined - it's up to the architecture whether it wants to bother + * with that. They should not be used directly; they are used to implement + * the 6 functions (copy_{to,from}_user(), __copy_{to,from}_user{,_inatomic}()) + * that are used instead. Out of those, __... ones are inlined. Plain + * copy_{to,from}_user() might or might not be inlined. If you want them + * inlined, have asm/uaccess.h define INLINE_COPY_{TO,FROM}_USER. + * + * NOTE: only copy_from_user() zero-pads the destination in case of short copy. + * Neither __copy_from_user() nor __copy_from_user_inatomic() zero anything + * at all; their callers absolutely must check the return value. + * + * Biarch ones should also provide raw_copy_in_user() - similar to the above, + * but both source and destination are __user pointers (affected by set_fs() + * as usual) and both source and destination can trigger faults. + */ + +static __always_inline unsigned long +__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) +{ + kasan_check_write(to, n); + check_object_size(to, n, false); + return raw_copy_from_user(to, from, n); +} + +static __always_inline unsigned long +__copy_from_user(void *to, const void __user *from, unsigned long n) +{ + might_fault(); + kasan_check_write(to, n); + check_object_size(to, n, false); + return raw_copy_from_user(to, from, n); +} + +/** + * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking. + * @to: Destination address, in user space. + * @from: Source address, in kernel space. + * @n: Number of bytes to copy. + * + * Context: User context only. + * + * Copy data from kernel space to user space. Caller must check + * the specified block with access_ok() before calling this function. + * The caller should also make sure the user space address is pinned, + * so that we don't take a page fault and sleep.
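+ * (In practice that means calling it with page faults disabled - e.g. + * between pagefault_disable() and pagefault_enable() - after the target + * page has been pinned, for instance via get_user_pages().)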
+ */ +static __always_inline unsigned long +__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) +{ + kasan_check_read(from, n); + check_object_size(from, n, true); + return raw_copy_to_user(to, from, n); +} + +static __always_inline unsigned long +__copy_to_user(void __user *to, const void *from, unsigned long n) +{ + might_fault(); + kasan_check_read(from, n); + check_object_size(from, n, true); + return raw_copy_to_user(to, from, n); +} + +#ifdef INLINE_COPY_FROM_USER +static inline unsigned long +_copy_from_user(void *to, const void __user *from, unsigned long n) +{ + unsigned long res = n; + if (likely(access_ok(VERIFY_READ, from, n))) + res = raw_copy_from_user(to, from, n); + if (unlikely(res)) + memset(to + (n - res), 0, res); + return res; +} +#else +extern unsigned long +_copy_from_user(void *, const void __user *, unsigned long); +#endif + +#ifdef INLINE_COPY_TO_USER +static inline unsigned long +_copy_to_user(void __user *to, const void *from, unsigned long n) +{ + if (access_ok(VERIFY_WRITE, to, n)) + n = raw_copy_to_user(to, from, n); + return n; +} +#else +extern unsigned long +_copy_to_user(void __user *, const void *, unsigned long); +#endif + +extern void __compiletime_error("usercopy buffer size is too small") +__bad_copy_user(void); + +static inline void copy_user_overflow(int size, unsigned long count) +{ + WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); +} + +static __always_inline unsigned long __must_check +copy_from_user(void *to, const void __user *from, unsigned long n) +{ + int sz = __compiletime_object_size(to); + + might_fault(); + kasan_check_write(to, n); + + if (likely(sz < 0 || sz >= n)) { + check_object_size(to, n, false); + n = _copy_from_user(to, from, n); + } else if (!__builtin_constant_p(n)) + copy_user_overflow(sz, n); + else + __bad_copy_user(); + + return n; +} + +static __always_inline unsigned long __must_check +copy_to_user(void __user *to, const void *from, unsigned long n) +{ + int sz = __compiletime_object_size(from); + + kasan_check_read(from, n); + might_fault(); + + if (likely(sz < 0 || sz >= n)) { + check_object_size(from, n, true); + n = _copy_to_user(to, from, n); + } else if (!__builtin_constant_p(n)) + copy_user_overflow(sz, n); + else + __bad_copy_user(); + + return n; +} +#ifdef CONFIG_COMPAT +static __always_inline unsigned long __must_check +__copy_in_user(void __user *to, const void *from, unsigned long n) +{ + might_fault(); + return raw_copy_in_user(to, from, n); +} +static __always_inline unsigned long __must_check +copy_in_user(void __user *to, const void *from, unsigned long n) +{ + might_fault(); + if (access_ok(VERIFY_WRITE, to, n) && access_ok(VERIFY_READ, from, n)) + n = raw_copy_in_user(to, from, n); + return n; +} +#endif +#endif + static __always_inline void pagefault_disabled_inc(void) { current->pagefault_disabled++; diff --git a/lib/Makefile b/lib/Makefile index 320ac46a8725..7d875c389172 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -242,3 +242,5 @@ UBSAN_SANITIZE_ubsan.o := n obj-$(CONFIG_SBITMAP) += sbitmap.o obj-$(CONFIG_PARMAN) += parman.o + +obj-$(CONFIG_ARCH_HAS_RAW_COPY_USER) += usercopy.o diff --git a/lib/usercopy.c b/lib/usercopy.c new file mode 100644 index 000000000000..1b6010a3beb8 --- /dev/null +++ b/lib/usercopy.c @@ -0,0 +1,26 @@ +#include + +/* out-of-line parts */ + +#ifndef INLINE_COPY_FROM_USER +unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) +{ + unsigned long res = n; + if (likely(access_ok(VERIFY_READ, from, n))) + 
res = raw_copy_from_user(to, from, n); + if (unlikely(res)) + memset(to + (n - res), 0, res); + return res; +} +EXPORT_SYMBOL(_copy_from_user); +#endif + +#ifndef INLINE_COPY_TO_USER +unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) +{ + if (likely(access_ok(VERIFY_WRITE, to, n))) + n = raw_copy_to_user(to, from, n); + return n; +} +EXPORT_SYMBOL(_copy_to_user); +#endif -- cgit v1.2.3 From 3f763453e6f27d82fa0ac58f8e1ac4094c1fb1f8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 25 Mar 2017 18:47:28 -0400 Subject: kill __copy_from_user_nocache() Signed-off-by: Al Viro --- arch/x86/include/asm/uaccess_32.h | 30 ---------- arch/x86/include/asm/uaccess_64.h | 8 --- arch/x86/lib/usercopy_32.c | 118 -------------------------------- include/linux/uaccess.h | 6 -- lib/iov_iter.c | 4 +- 5 files changed, 2 insertions(+), 164 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 5268ecceea96..19e6c050c438 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -14,8 +14,6 @@ unsigned long __must_check __copy_from_user_ll (void *to, const void __user *from, unsigned long n); unsigned long __must_check __copy_from_user_ll_nozero (void *to, const void __user *from, unsigned long n); -unsigned long __must_check __copy_from_user_ll_nocache - (void *to, const void __user *from, unsigned long n); unsigned long __must_check __copy_from_user_ll_nocache_nozero (void *to, const void __user *from, unsigned long n); @@ -119,34 +117,6 @@ __copy_from_user(void *to, const void __user *from, unsigned long n) return __copy_from_user_ll(to, from, n); } -static __always_inline unsigned long __copy_from_user_nocache(void *to, - const void __user *from, unsigned long n) -{ - might_fault(); - if (__builtin_constant_p(n)) { - unsigned long ret; - - switch (n) { - case 1: - __uaccess_begin(); - __get_user_size(*(u8 *)to, from, 1, ret, 1); - __uaccess_end(); - return ret; - case 2: - __uaccess_begin(); - __get_user_size(*(u16 *)to, from, 2, ret, 2); - __uaccess_end(); - return ret; - case 4: - __uaccess_begin(); - __get_user_size(*(u32 *)to, from, 4, ret, 4); - __uaccess_end(); - return ret; - } - } - return __copy_from_user_ll_nocache(to, from, n); -} - static __always_inline unsigned long __copy_from_user_inatomic_nocache(void *to, const void __user *from, unsigned long n) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 142f0f1230be..242936b0cb4b 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -260,14 +260,6 @@ __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size, int zerorest); -static inline int -__copy_from_user_nocache(void *dst, const void __user *src, unsigned size) -{ - might_fault(); - kasan_check_write(dst, size); - return __copy_user_nocache(dst, src, size, 1); -} - static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size) diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 1f65ff6540f0..02aa7aa8b9f3 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -293,105 +293,6 @@ __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size) return size; } -/* - * Non Temporal Hint version of __copy_user_zeroing_intel. It is cache aware.
- * hyoshiok@miraclelinux.com - */ - -static unsigned long __copy_user_zeroing_intel_nocache(void *to, - const void __user *from, unsigned long size) -{ - int d0, d1; - - __asm__ __volatile__( - " .align 2,0x90\n" - "0: movl 32(%4), %%eax\n" - " cmpl $67, %0\n" - " jbe 2f\n" - "1: movl 64(%4), %%eax\n" - " .align 2,0x90\n" - "2: movl 0(%4), %%eax\n" - "21: movl 4(%4), %%edx\n" - " movnti %%eax, 0(%3)\n" - " movnti %%edx, 4(%3)\n" - "3: movl 8(%4), %%eax\n" - "31: movl 12(%4),%%edx\n" - " movnti %%eax, 8(%3)\n" - " movnti %%edx, 12(%3)\n" - "4: movl 16(%4), %%eax\n" - "41: movl 20(%4), %%edx\n" - " movnti %%eax, 16(%3)\n" - " movnti %%edx, 20(%3)\n" - "10: movl 24(%4), %%eax\n" - "51: movl 28(%4), %%edx\n" - " movnti %%eax, 24(%3)\n" - " movnti %%edx, 28(%3)\n" - "11: movl 32(%4), %%eax\n" - "61: movl 36(%4), %%edx\n" - " movnti %%eax, 32(%3)\n" - " movnti %%edx, 36(%3)\n" - "12: movl 40(%4), %%eax\n" - "71: movl 44(%4), %%edx\n" - " movnti %%eax, 40(%3)\n" - " movnti %%edx, 44(%3)\n" - "13: movl 48(%4), %%eax\n" - "81: movl 52(%4), %%edx\n" - " movnti %%eax, 48(%3)\n" - " movnti %%edx, 52(%3)\n" - "14: movl 56(%4), %%eax\n" - "91: movl 60(%4), %%edx\n" - " movnti %%eax, 56(%3)\n" - " movnti %%edx, 60(%3)\n" - " addl $-64, %0\n" - " addl $64, %4\n" - " addl $64, %3\n" - " cmpl $63, %0\n" - " ja 0b\n" - " sfence \n" - "5: movl %0, %%eax\n" - " shrl $2, %0\n" - " andl $3, %%eax\n" - " cld\n" - "6: rep; movsl\n" - " movl %%eax,%0\n" - "7: rep; movsb\n" - "8:\n" - ".section .fixup,\"ax\"\n" - "9: lea 0(%%eax,%0,4),%0\n" - "16: pushl %0\n" - " pushl %%eax\n" - " xorl %%eax,%%eax\n" - " rep; stosb\n" - " popl %%eax\n" - " popl %0\n" - " jmp 8b\n" - ".previous\n" - _ASM_EXTABLE(0b,16b) - _ASM_EXTABLE(1b,16b) - _ASM_EXTABLE(2b,16b) - _ASM_EXTABLE(21b,16b) - _ASM_EXTABLE(3b,16b) - _ASM_EXTABLE(31b,16b) - _ASM_EXTABLE(4b,16b) - _ASM_EXTABLE(41b,16b) - _ASM_EXTABLE(10b,16b) - _ASM_EXTABLE(51b,16b) - _ASM_EXTABLE(11b,16b) - _ASM_EXTABLE(61b,16b) - _ASM_EXTABLE(12b,16b) - _ASM_EXTABLE(71b,16b) - _ASM_EXTABLE(13b,16b) - _ASM_EXTABLE(81b,16b) - _ASM_EXTABLE(14b,16b) - _ASM_EXTABLE(91b,16b) - _ASM_EXTABLE(6b,9b) - _ASM_EXTABLE(7b,16b) - : "=&c"(size), "=&D" (d0), "=&S" (d1) - : "1"(to), "2"(from), "0"(size) - : "eax", "edx", "memory"); - return size; -} - static unsigned long __copy_user_intel_nocache(void *to, const void __user *from, unsigned long size) { @@ -490,8 +391,6 @@ unsigned long __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size); unsigned long __copy_user_intel(void __user *to, const void *from, unsigned long size); -unsigned long __copy_user_zeroing_intel_nocache(void *to, - const void __user *from, unsigned long size); #endif /* CONFIG_X86_INTEL_USERCOPY */ /* Generic arbitrary sized copy. 
*/ @@ -607,23 +506,6 @@ unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from, } EXPORT_SYMBOL(__copy_from_user_ll_nozero); -unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from, - unsigned long n) -{ - stac(); -#ifdef CONFIG_X86_INTEL_USERCOPY - if (n > 64 && static_cpu_has(X86_FEATURE_XMM2)) - n = __copy_user_zeroing_intel_nocache(to, from, n); - else - __copy_user_zeroing(to, from, n); -#else - __copy_user_zeroing(to, from, n); -#endif - clac(); - return n; -} -EXPORT_SYMBOL(__copy_from_user_ll_nocache); - unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from, unsigned long n) { diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 5f76bc995d96..7fc2104b88bc 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -261,12 +261,6 @@ static inline unsigned long __copy_from_user_inatomic_nocache(void *to, return __copy_from_user_inatomic(to, from, n); } -static inline unsigned long __copy_from_user_nocache(void *to, - const void __user *from, unsigned long n) -{ - return __copy_from_user(to, from, n); -} - #endif /* ARCH_HAS_NOCACHE_UACCESS */ /* diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 97db876c6862..672c32f9f960 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -604,7 +604,7 @@ size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) return 0; } iterate_and_advance(i, bytes, v, - __copy_from_user_nocache((to += v.iov_len) - v.iov_len, + __copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, v.bv_offset, v.bv_len), @@ -625,7 +625,7 @@ bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) if (unlikely(i->count < bytes)) return false; iterate_all_kinds(i, bytes, v, ({ - if (__copy_from_user_nocache((to += v.iov_len) - v.iov_len, + if (__copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)) return false; 0;}), -- cgit v1.2.3 From 5052de8deff5619a9b7071f00084fd0264b58e17 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Mon, 27 Mar 2017 22:26:33 -0700 Subject: soc: qcom: smd: Transition client drivers from smd to rpmsg By moving these client drivers to use RPMSG instead of the direct SMD API we can reuse them on top of the newly added GLINK wire-protocol support found in the 820 and 835 Qualcomm platforms. As the new (RPMSG-based) and old SMD implementations are mutually exclusive, we have to change all client drivers in one commit, to make sure we have a working system before and after this transition. Acked-by: Andy Gross Acked-by: Kalle Valo Acked-by: Marcel Holtmann Signed-off-by: Bjorn Andersson Signed-off-by: David S.
Miller --- drivers/bluetooth/Kconfig | 2 +- drivers/bluetooth/btqcomsmd.c | 32 +++++++++---------- drivers/net/wireless/ath/wcn36xx/Kconfig | 2 +- drivers/net/wireless/ath/wcn36xx/main.c | 6 ++-- drivers/net/wireless/ath/wcn36xx/smd.c | 10 +++--- drivers/net/wireless/ath/wcn36xx/smd.h | 6 ++-- drivers/net/wireless/ath/wcn36xx/wcn36xx.h | 2 +- drivers/soc/qcom/Kconfig | 6 ++-- drivers/soc/qcom/smd-rpm.c | 43 +++++++++++++------------ drivers/soc/qcom/wcnss_ctrl.c | 50 +++++++++++++++++------------- include/linux/soc/qcom/wcnss_ctrl.h | 11 ++++--- net/qrtr/Kconfig | 2 +- net/qrtr/smd.c | 42 ++++++++++++------------- 13 files changed, 110 insertions(+), 104 deletions(-) (limited to 'include/linux') diff --git a/drivers/bluetooth/Kconfig b/drivers/bluetooth/Kconfig index 08e054507d0b..a6a9dd4d0eef 100644 --- a/drivers/bluetooth/Kconfig +++ b/drivers/bluetooth/Kconfig @@ -344,7 +344,7 @@ config BT_WILINK config BT_QCOMSMD tristate "Qualcomm SMD based HCI support" - depends on QCOM_SMD || (COMPILE_TEST && QCOM_SMD=n) + depends on RPMSG || (COMPILE_TEST && RPMSG=n) depends on QCOM_WCNSS_CTRL || (COMPILE_TEST && QCOM_WCNSS_CTRL=n) select BT_QCA help diff --git a/drivers/bluetooth/btqcomsmd.c b/drivers/bluetooth/btqcomsmd.c index 8d4868af9bbd..ef730c173d4b 100644 --- a/drivers/bluetooth/btqcomsmd.c +++ b/drivers/bluetooth/btqcomsmd.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include @@ -26,8 +26,8 @@ struct btqcomsmd { struct hci_dev *hdev; - struct qcom_smd_channel *acl_channel; - struct qcom_smd_channel *cmd_channel; + struct rpmsg_endpoint *acl_channel; + struct rpmsg_endpoint *cmd_channel; }; static int btqcomsmd_recv(struct hci_dev *hdev, unsigned int type, @@ -48,19 +48,19 @@ static int btqcomsmd_recv(struct hci_dev *hdev, unsigned int type, return hci_recv_frame(hdev, skb); } -static int btqcomsmd_acl_callback(struct qcom_smd_channel *channel, - const void *data, size_t count) +static int btqcomsmd_acl_callback(struct rpmsg_device *rpdev, void *data, + int count, void *priv, u32 addr) { - struct btqcomsmd *btq = qcom_smd_get_drvdata(channel); + struct btqcomsmd *btq = priv; btq->hdev->stat.byte_rx += count; return btqcomsmd_recv(btq->hdev, HCI_ACLDATA_PKT, data, count); } -static int btqcomsmd_cmd_callback(struct qcom_smd_channel *channel, - const void *data, size_t count) +static int btqcomsmd_cmd_callback(struct rpmsg_device *rpdev, void *data, + int count, void *priv, u32 addr) { - struct btqcomsmd *btq = qcom_smd_get_drvdata(channel); + struct btqcomsmd *btq = priv; return btqcomsmd_recv(btq->hdev, HCI_EVENT_PKT, data, count); } @@ -72,12 +72,12 @@ static int btqcomsmd_send(struct hci_dev *hdev, struct sk_buff *skb) switch (hci_skb_pkt_type(skb)) { case HCI_ACLDATA_PKT: - ret = qcom_smd_send(btq->acl_channel, skb->data, skb->len); + ret = rpmsg_send(btq->acl_channel, skb->data, skb->len); hdev->stat.acl_tx++; hdev->stat.byte_tx += skb->len; break; case HCI_COMMAND_PKT: - ret = qcom_smd_send(btq->cmd_channel, skb->data, skb->len); + ret = rpmsg_send(btq->cmd_channel, skb->data, skb->len); hdev->stat.cmd_tx++; break; default: @@ -114,18 +114,15 @@ static int btqcomsmd_probe(struct platform_device *pdev) wcnss = dev_get_drvdata(pdev->dev.parent); btq->acl_channel = qcom_wcnss_open_channel(wcnss, "APPS_RIVA_BT_ACL", - btqcomsmd_acl_callback); + btqcomsmd_acl_callback, btq); if (IS_ERR(btq->acl_channel)) return PTR_ERR(btq->acl_channel); btq->cmd_channel = qcom_wcnss_open_channel(wcnss, "APPS_RIVA_BT_CMD", - btqcomsmd_cmd_callback); + btqcomsmd_cmd_callback, btq); if 
(IS_ERR(btq->cmd_channel)) return PTR_ERR(btq->cmd_channel); - qcom_smd_set_drvdata(btq->acl_channel, btq); - qcom_smd_set_drvdata(btq->cmd_channel, btq); - hdev = hci_alloc_dev(); if (!hdev) return -ENOMEM; @@ -158,6 +155,9 @@ static int btqcomsmd_remove(struct platform_device *pdev) hci_unregister_dev(btq->hdev); hci_free_dev(btq->hdev); + rpmsg_destroy_ept(btq->cmd_channel); + rpmsg_destroy_ept(btq->acl_channel); + return 0; } diff --git a/drivers/net/wireless/ath/wcn36xx/Kconfig b/drivers/net/wireless/ath/wcn36xx/Kconfig index 4b83e87f0b94..20bf967a70b9 100644 --- a/drivers/net/wireless/ath/wcn36xx/Kconfig +++ b/drivers/net/wireless/ath/wcn36xx/Kconfig @@ -2,7 +2,7 @@ config WCN36XX tristate "Qualcomm Atheros WCN3660/3680 support" depends on MAC80211 && HAS_DMA depends on QCOM_WCNSS_CTRL || QCOM_WCNSS_CTRL=n - depends on QCOM_SMD || QCOM_SMD=n + depends on RPMSG || RPMSG=n ---help--- This module adds support for wireless adapters based on Qualcomm Atheros WCN3660 and WCN3680 mobile chipsets. diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c index 7a0c2e7da7f6..bb7110f7fc86 100644 --- a/drivers/net/wireless/ath/wcn36xx/main.c +++ b/drivers/net/wireless/ath/wcn36xx/main.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include "wcn36xx.h" @@ -1218,15 +1218,13 @@ static int wcn36xx_probe(struct platform_device *pdev) INIT_WORK(&wcn->scan_work, wcn36xx_hw_scan_worker); - wcn->smd_channel = qcom_wcnss_open_channel(wcnss, "WLAN_CTRL", wcn36xx_smd_rsp_process); + wcn->smd_channel = qcom_wcnss_open_channel(wcnss, "WLAN_CTRL", wcn36xx_smd_rsp_process, hw); if (IS_ERR(wcn->smd_channel)) { wcn36xx_err("failed to open WLAN_CTRL channel\n"); ret = PTR_ERR(wcn->smd_channel); goto out_wq; } - qcom_smd_set_drvdata(wcn->smd_channel, hw); - addr = of_get_property(pdev->dev.of_node, "local-mac-address", &ret); if (addr && ret != ETH_ALEN) { wcn36xx_err("invalid local-mac-address\n"); diff --git a/drivers/net/wireless/ath/wcn36xx/smd.c b/drivers/net/wireless/ath/wcn36xx/smd.c index 1c2966f7db7a..9c6590d5348a 100644 --- a/drivers/net/wireless/ath/wcn36xx/smd.c +++ b/drivers/net/wireless/ath/wcn36xx/smd.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include "smd.h" struct wcn36xx_cfg_val { @@ -254,7 +254,7 @@ static int wcn36xx_smd_send_and_wait(struct wcn36xx *wcn, size_t len) init_completion(&wcn->hal_rsp_compl); start = jiffies; - ret = qcom_smd_send(wcn->smd_channel, wcn->hal_buf, len); + ret = rpmsg_send(wcn->smd_channel, wcn->hal_buf, len); if (ret) { wcn36xx_err("HAL TX failed\n"); goto out; @@ -2205,11 +2205,11 @@ out: return ret; } -int wcn36xx_smd_rsp_process(struct qcom_smd_channel *channel, - const void *buf, size_t len) +int wcn36xx_smd_rsp_process(struct rpmsg_device *rpdev, + void *buf, int len, void *priv, u32 addr) { const struct wcn36xx_hal_msg_header *msg_header = buf; - struct ieee80211_hw *hw = qcom_smd_get_drvdata(channel); + struct ieee80211_hw *hw = priv; struct wcn36xx *wcn = hw->priv; struct wcn36xx_hal_ind_msg *msg_ind; wcn36xx_dbg_dump(WCN36XX_DBG_SMD_DUMP, "SMD <<< ", buf, len); diff --git a/drivers/net/wireless/ath/wcn36xx/smd.h b/drivers/net/wireless/ath/wcn36xx/smd.h index 8892ccd67b14..013fc9546f56 100644 --- a/drivers/net/wireless/ath/wcn36xx/smd.h +++ b/drivers/net/wireless/ath/wcn36xx/smd.h @@ -51,7 +51,7 @@ struct wcn36xx_hal_ind_msg { }; struct wcn36xx; -struct qcom_smd_channel; +struct rpmsg_device; int wcn36xx_smd_open(struct wcn36xx *wcn); void 
wcn36xx_smd_close(struct wcn36xx *wcn); @@ -129,8 +129,8 @@ int wcn36xx_smd_trigger_ba(struct wcn36xx *wcn, u8 sta_index); int wcn36xx_smd_update_cfg(struct wcn36xx *wcn, u32 cfg_id, u32 value); -int wcn36xx_smd_rsp_process(struct qcom_smd_channel *channel, - const void *buf, size_t len); +int wcn36xx_smd_rsp_process(struct rpmsg_device *rpdev, + void *buf, int len, void *priv, u32 addr); int wcn36xx_smd_set_mc_list(struct wcn36xx *wcn, struct ieee80211_vif *vif, diff --git a/drivers/net/wireless/ath/wcn36xx/wcn36xx.h b/drivers/net/wireless/ath/wcn36xx/wcn36xx.h index 7423998ddeb4..b52b4da9a967 100644 --- a/drivers/net/wireless/ath/wcn36xx/wcn36xx.h +++ b/drivers/net/wireless/ath/wcn36xx/wcn36xx.h @@ -195,7 +195,7 @@ struct wcn36xx { void __iomem *ccu_base; void __iomem *dxe_base; - struct qcom_smd_channel *smd_channel; + struct rpmsg_endpoint *smd_channel; struct qcom_smem_state *tx_enable_state; unsigned tx_enable_state_bit; diff --git a/drivers/soc/qcom/Kconfig b/drivers/soc/qcom/Kconfig index 78b1bb7bcf20..4e090c697eb6 100644 --- a/drivers/soc/qcom/Kconfig +++ b/drivers/soc/qcom/Kconfig @@ -43,7 +43,8 @@ config QCOM_SMD config QCOM_SMD_RPM tristate "Qualcomm Resource Power Manager (RPM) over SMD" - depends on QCOM_SMD && OF + depends on ARCH_QCOM + depends on RPMSG && OF help If you say yes to this option, support will be included for the Resource Power Manager system found in the Qualcomm 8974 based @@ -76,7 +77,8 @@ config QCOM_SMSM config QCOM_WCNSS_CTRL tristate "Qualcomm WCNSS control driver" - depends on QCOM_SMD + depends on ARCH_QCOM + depends on RPMSG help Client driver for the WCNSS_CTRL SMD channel, used to download nv firmware to a newly booted WCNSS chip. diff --git a/drivers/soc/qcom/smd-rpm.c b/drivers/soc/qcom/smd-rpm.c index 6609d7e0edb0..0dcf1bf33126 100644 --- a/drivers/soc/qcom/smd-rpm.c +++ b/drivers/soc/qcom/smd-rpm.c @@ -19,7 +19,7 @@ #include #include -#include +#include #include #define RPM_REQUEST_TIMEOUT (5 * HZ) @@ -32,7 +32,7 @@ * @ack_status: result of the rpm request */ struct qcom_smd_rpm { - struct qcom_smd_channel *rpm_channel; + struct rpmsg_endpoint *rpm_channel; struct device *dev; struct completion ack; @@ -133,7 +133,7 @@ int qcom_rpm_smd_write(struct qcom_smd_rpm *rpm, pkt->req.data_len = cpu_to_le32(count); memcpy(pkt->payload, buf, count); - ret = qcom_smd_send(rpm->rpm_channel, pkt, size); + ret = rpmsg_send(rpm->rpm_channel, pkt, size); if (ret) goto out; @@ -150,14 +150,16 @@ out: } EXPORT_SYMBOL(qcom_rpm_smd_write); -static int qcom_smd_rpm_callback(struct qcom_smd_channel *channel, - const void *data, - size_t count) +static int qcom_smd_rpm_callback(struct rpmsg_device *rpdev, + void *data, + int count, + void *priv, + u32 addr) { const struct qcom_rpm_header *hdr = data; size_t hdr_length = le32_to_cpu(hdr->length); const struct qcom_rpm_message *msg; - struct qcom_smd_rpm *rpm = qcom_smd_get_drvdata(channel); + struct qcom_smd_rpm *rpm = dev_get_drvdata(&rpdev->dev); const u8 *buf = data + sizeof(struct qcom_rpm_header); const u8 *end = buf + hdr_length; char msgbuf[32]; @@ -196,29 +198,27 @@ static int qcom_smd_rpm_callback(struct qcom_smd_channel *channel, return 0; } -static int qcom_smd_rpm_probe(struct qcom_smd_device *sdev) +static int qcom_smd_rpm_probe(struct rpmsg_device *rpdev) { struct qcom_smd_rpm *rpm; - rpm = devm_kzalloc(&sdev->dev, sizeof(*rpm), GFP_KERNEL); + rpm = devm_kzalloc(&rpdev->dev, sizeof(*rpm), GFP_KERNEL); if (!rpm) return -ENOMEM; mutex_init(&rpm->lock); init_completion(&rpm->ack); - rpm->dev = &sdev->dev; - 
rpm->rpm_channel = sdev->channel; - qcom_smd_set_drvdata(sdev->channel, rpm); + rpm->dev = &rpdev->dev; + rpm->rpm_channel = rpdev->ept; + dev_set_drvdata(&rpdev->dev, rpm); - dev_set_drvdata(&sdev->dev, rpm); - - return of_platform_populate(sdev->dev.of_node, NULL, NULL, &sdev->dev); + return of_platform_populate(rpdev->dev.of_node, NULL, NULL, &rpdev->dev); } -static void qcom_smd_rpm_remove(struct qcom_smd_device *sdev) +static void qcom_smd_rpm_remove(struct rpmsg_device *rpdev) { - of_platform_depopulate(&sdev->dev); + of_platform_depopulate(&rpdev->dev); } static const struct of_device_id qcom_smd_rpm_of_match[] = { @@ -229,26 +229,25 @@ static const struct of_device_id qcom_smd_rpm_of_match[] = { }; MODULE_DEVICE_TABLE(of, qcom_smd_rpm_of_match); -static struct qcom_smd_driver qcom_smd_rpm_driver = { +static struct rpmsg_driver qcom_smd_rpm_driver = { .probe = qcom_smd_rpm_probe, .remove = qcom_smd_rpm_remove, .callback = qcom_smd_rpm_callback, - .driver = { + .drv = { .name = "qcom_smd_rpm", - .owner = THIS_MODULE, .of_match_table = qcom_smd_rpm_of_match, }, }; static int __init qcom_smd_rpm_init(void) { - return qcom_smd_driver_register(&qcom_smd_rpm_driver); + return register_rpmsg_driver(&qcom_smd_rpm_driver); } arch_initcall(qcom_smd_rpm_init); static void __exit qcom_smd_rpm_exit(void) { - qcom_smd_driver_unregister(&qcom_smd_rpm_driver); + unregister_rpmsg_driver(&qcom_smd_rpm_driver); } module_exit(qcom_smd_rpm_exit); diff --git a/drivers/soc/qcom/wcnss_ctrl.c b/drivers/soc/qcom/wcnss_ctrl.c index 520aedd29965..b9069184df19 100644 --- a/drivers/soc/qcom/wcnss_ctrl.c +++ b/drivers/soc/qcom/wcnss_ctrl.c @@ -14,10 +14,10 @@ #include #include #include -#include #include #include #include +#include #include #define WCNSS_REQUEST_TIMEOUT (5 * HZ) @@ -40,7 +40,7 @@ */ struct wcnss_ctrl { struct device *dev; - struct qcom_smd_channel *channel; + struct rpmsg_endpoint *channel; struct completion ack; struct completion cbc; @@ -122,11 +122,13 @@ struct wcnss_download_nv_resp { * * Handles any incoming packets from the remote WCNSS_CTRL service. 
*/ -static int wcnss_ctrl_smd_callback(struct qcom_smd_channel *channel, - const void *data, - size_t count) +static int wcnss_ctrl_smd_callback(struct rpmsg_device *rpdev, + void *data, + int count, + void *priv, + u32 addr) { - struct wcnss_ctrl *wcnss = qcom_smd_get_drvdata(channel); + struct wcnss_ctrl *wcnss = dev_get_drvdata(&rpdev->dev); const struct wcnss_download_nv_resp *nvresp; const struct wcnss_version_resp *version; const struct wcnss_msg_hdr *hdr = data; @@ -180,7 +182,7 @@ static int wcnss_request_version(struct wcnss_ctrl *wcnss) msg.type = WCNSS_VERSION_REQ; msg.len = sizeof(msg); - ret = qcom_smd_send(wcnss->channel, &msg, sizeof(msg)); + ret = rpmsg_send(wcnss->channel, &msg, sizeof(msg)); if (ret < 0) return ret; @@ -238,7 +240,7 @@ static int wcnss_download_nv(struct wcnss_ctrl *wcnss, bool *expect_cbc) memcpy(req->fragment, data, req->frag_size); - ret = qcom_smd_send(wcnss->channel, req, req->hdr.len); + ret = rpmsg_send(wcnss->channel, req, req->hdr.len); if (ret < 0) { dev_err(wcnss->dev, "failed to send smd packet\n"); goto release_fw; @@ -274,11 +276,16 @@ free_req: * @name: SMD channel name * @cb: callback to handle incoming data on the channel */ -struct qcom_smd_channel *qcom_wcnss_open_channel(void *wcnss, const char *name, qcom_smd_cb_t cb) +struct rpmsg_endpoint *qcom_wcnss_open_channel(void *wcnss, const char *name, rpmsg_rx_cb_t cb, void *priv) { + struct rpmsg_channel_info chinfo; struct wcnss_ctrl *_wcnss = wcnss; - return qcom_smd_open_channel(_wcnss->channel, name, cb); + strncpy(chinfo.name, name, sizeof(chinfo.name)); + chinfo.src = RPMSG_ADDR_ANY; + chinfo.dst = RPMSG_ADDR_ANY; + + return rpmsg_create_ept(_wcnss->channel->rpdev, cb, priv, chinfo); } EXPORT_SYMBOL(qcom_wcnss_open_channel); @@ -306,35 +313,34 @@ static void wcnss_async_probe(struct work_struct *work) of_platform_populate(wcnss->dev->of_node, NULL, NULL, wcnss->dev); } -static int wcnss_ctrl_probe(struct qcom_smd_device *sdev) +static int wcnss_ctrl_probe(struct rpmsg_device *rpdev) { struct wcnss_ctrl *wcnss; - wcnss = devm_kzalloc(&sdev->dev, sizeof(*wcnss), GFP_KERNEL); + wcnss = devm_kzalloc(&rpdev->dev, sizeof(*wcnss), GFP_KERNEL); if (!wcnss) return -ENOMEM; - wcnss->dev = &sdev->dev; - wcnss->channel = sdev->channel; + wcnss->dev = &rpdev->dev; + wcnss->channel = rpdev->ept; init_completion(&wcnss->ack); init_completion(&wcnss->cbc); INIT_WORK(&wcnss->probe_work, wcnss_async_probe); - qcom_smd_set_drvdata(sdev->channel, wcnss); - dev_set_drvdata(&sdev->dev, wcnss); + dev_set_drvdata(&rpdev->dev, wcnss); schedule_work(&wcnss->probe_work); return 0; } -static void wcnss_ctrl_remove(struct qcom_smd_device *sdev) +static void wcnss_ctrl_remove(struct rpmsg_device *rpdev) { - struct wcnss_ctrl *wcnss = qcom_smd_get_drvdata(sdev->channel); + struct wcnss_ctrl *wcnss = dev_get_drvdata(&rpdev->dev); cancel_work_sync(&wcnss->probe_work); - of_platform_depopulate(&sdev->dev); + of_platform_depopulate(&rpdev->dev); } static const struct of_device_id wcnss_ctrl_of_match[] = { @@ -342,18 +348,18 @@ static const struct of_device_id wcnss_ctrl_of_match[] = { {} }; -static struct qcom_smd_driver wcnss_ctrl_driver = { +static struct rpmsg_driver wcnss_ctrl_driver = { .probe = wcnss_ctrl_probe, .remove = wcnss_ctrl_remove, .callback = wcnss_ctrl_smd_callback, - .driver = { + .drv = { .name = "qcom_wcnss_ctrl", .owner = THIS_MODULE, .of_match_table = wcnss_ctrl_of_match, }, }; -module_qcom_smd_driver(wcnss_ctrl_driver); +module_rpmsg_driver(wcnss_ctrl_driver); MODULE_DESCRIPTION("Qualcomm WCNSS 
control driver"); MODULE_LICENSE("GPL v2"); diff --git a/include/linux/soc/qcom/wcnss_ctrl.h b/include/linux/soc/qcom/wcnss_ctrl.h index eab64976a73b..a4dd4d7c711d 100644 --- a/include/linux/soc/qcom/wcnss_ctrl.h +++ b/include/linux/soc/qcom/wcnss_ctrl.h @@ -1,16 +1,19 @@ #ifndef __WCNSS_CTRL_H__ #define __WCNSS_CTRL_H__ -#include +#include #if IS_ENABLED(CONFIG_QCOM_WCNSS_CTRL) -struct qcom_smd_channel *qcom_wcnss_open_channel(void *wcnss, const char *name, qcom_smd_cb_t cb); +struct rpmsg_endpoint *qcom_wcnss_open_channel(void *wcnss, const char *name, + rpmsg_rx_cb_t cb, void *priv); #else -static inline struct qcom_smd_channel* -qcom_wcnss_open_channel(void *wcnss, const char *name, qcom_smd_cb_t cb) +static struct rpmsg_endpoint *qcom_wcnss_open_channel(void *wcnss, + const char *name, + rpmsg_rx_cb_t cb, + void *priv) { WARN_ON(1); return ERR_PTR(-ENXIO); diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig index b83c6807a5ae..326fd97444f5 100644 --- a/net/qrtr/Kconfig +++ b/net/qrtr/Kconfig @@ -16,7 +16,7 @@ if QRTR config QRTR_SMD tristate "SMD IPC Router channels" - depends on QCOM_SMD || (COMPILE_TEST && QCOM_SMD=n) + depends on RPMSG || (COMPILE_TEST && RPMSG=n) ---help--- Say Y here to support SMD based ipcrouter channels. SMD is the most common transport for IPC Router. diff --git a/net/qrtr/smd.c b/net/qrtr/smd.c index 0d11132b3370..50615d5efac1 100644 --- a/net/qrtr/smd.c +++ b/net/qrtr/smd.c @@ -14,21 +14,21 @@ #include #include -#include +#include #include "qrtr.h" struct qrtr_smd_dev { struct qrtr_endpoint ep; - struct qcom_smd_channel *channel; + struct rpmsg_endpoint *channel; struct device *dev; }; /* from smd to qrtr */ -static int qcom_smd_qrtr_callback(struct qcom_smd_channel *channel, - const void *data, size_t len) +static int qcom_smd_qrtr_callback(struct rpmsg_device *rpdev, + void *data, int len, void *priv, u32 addr) { - struct qrtr_smd_dev *qdev = qcom_smd_get_drvdata(channel); + struct qrtr_smd_dev *qdev = dev_get_drvdata(&rpdev->dev); int rc; if (!qdev) @@ -54,7 +54,7 @@ static int qcom_smd_qrtr_send(struct qrtr_endpoint *ep, struct sk_buff *skb) if (rc) goto out; - rc = qcom_smd_send(qdev->channel, skb->data, skb->len); + rc = rpmsg_send(qdev->channel, skb->data, skb->len); out: if (rc) @@ -64,57 +64,55 @@ out: return rc; } -static int qcom_smd_qrtr_probe(struct qcom_smd_device *sdev) +static int qcom_smd_qrtr_probe(struct rpmsg_device *rpdev) { struct qrtr_smd_dev *qdev; int rc; - qdev = devm_kzalloc(&sdev->dev, sizeof(*qdev), GFP_KERNEL); + qdev = devm_kzalloc(&rpdev->dev, sizeof(*qdev), GFP_KERNEL); if (!qdev) return -ENOMEM; - qdev->channel = sdev->channel; - qdev->dev = &sdev->dev; + qdev->channel = rpdev->ept; + qdev->dev = &rpdev->dev; qdev->ep.xmit = qcom_smd_qrtr_send; rc = qrtr_endpoint_register(&qdev->ep, QRTR_EP_NID_AUTO); if (rc) return rc; - qcom_smd_set_drvdata(sdev->channel, qdev); - dev_set_drvdata(&sdev->dev, qdev); + dev_set_drvdata(&rpdev->dev, qdev); - dev_dbg(&sdev->dev, "Qualcomm SMD QRTR driver probed\n"); + dev_dbg(&rpdev->dev, "Qualcomm SMD QRTR driver probed\n"); return 0; } -static void qcom_smd_qrtr_remove(struct qcom_smd_device *sdev) +static void qcom_smd_qrtr_remove(struct rpmsg_device *rpdev) { - struct qrtr_smd_dev *qdev = dev_get_drvdata(&sdev->dev); + struct qrtr_smd_dev *qdev = dev_get_drvdata(&rpdev->dev); qrtr_endpoint_unregister(&qdev->ep); - dev_set_drvdata(&sdev->dev, NULL); + dev_set_drvdata(&rpdev->dev, NULL); } -static const struct qcom_smd_id qcom_smd_qrtr_smd_match[] = { +static const struct rpmsg_device_id 
qcom_smd_qrtr_smd_match[] = { { "IPCRTR" }, {} }; -static struct qcom_smd_driver qcom_smd_qrtr_driver = { +static struct rpmsg_driver qcom_smd_qrtr_driver = { .probe = qcom_smd_qrtr_probe, .remove = qcom_smd_qrtr_remove, .callback = qcom_smd_qrtr_callback, - .smd_match_table = qcom_smd_qrtr_smd_match, - .driver = { + .id_table = qcom_smd_qrtr_smd_match, + .drv = { .name = "qcom_smd_qrtr", - .owner = THIS_MODULE, }, }; -module_qcom_smd_driver(qcom_smd_qrtr_driver); +module_rpmsg_driver(qcom_smd_qrtr_driver); MODULE_DESCRIPTION("Qualcomm IPC-Router SMD interface driver"); MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 395a48053af6c5e0f0217b610dcb7225ea3e3e42 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Mon, 27 Mar 2017 22:26:34 -0700 Subject: soc: qcom: smd: Remove standalone driver Remove the standalone SMD implementation as we have transitioned the client drivers to use the RPMSG based one. Also remove all dependencies on QCOM_SMD from Kconfig files, in order to keep them selectable in the absence of the removed symbol. Acked-by: Andy Gross Signed-off-by: Bjorn Andersson Signed-off-by: David S. Miller --- drivers/remoteproc/Kconfig | 6 +- drivers/rpmsg/Kconfig | 1 - drivers/soc/qcom/Kconfig | 8 - drivers/soc/qcom/Makefile | 1 - drivers/soc/qcom/smd.c | 1560 ---------------------------------------- include/linux/rpmsg/qcom_smd.h | 2 +- include/linux/soc/qcom/smd.h | 139 ---- 7 files changed, 4 insertions(+), 1713 deletions(-) delete mode 100644 drivers/soc/qcom/smd.c delete mode 100644 include/linux/soc/qcom/smd.h (limited to 'include/linux') diff --git a/drivers/remoteproc/Kconfig b/drivers/remoteproc/Kconfig index 1dc43fc5f65f..faad69a1a597 100644 --- a/drivers/remoteproc/Kconfig +++ b/drivers/remoteproc/Kconfig @@ -76,7 +76,7 @@ config QCOM_ADSP_PIL depends on OF && ARCH_QCOM depends on REMOTEPROC depends on QCOM_SMEM - depends on RPMSG_QCOM_SMD || QCOM_SMD || (COMPILE_TEST && QCOM_SMD=n && RPMSG_QCOM_SMD=n) + depends on RPMSG_QCOM_SMD || (COMPILE_TEST && RPMSG_QCOM_SMD=n) select MFD_SYSCON select QCOM_MDT_LOADER select QCOM_RPROC_COMMON @@ -93,7 +93,7 @@ config QCOM_Q6V5_PIL depends on OF && ARCH_QCOM depends on QCOM_SMEM depends on REMOTEPROC - depends on RPMSG_QCOM_SMD || QCOM_SMD || (COMPILE_TEST && QCOM_SMD=n && RPMSG_QCOM_SMD=n) + depends on RPMSG_QCOM_SMD || (COMPILE_TEST && RPMSG_QCOM_SMD=n) select MFD_SYSCON select QCOM_RPROC_COMMON select QCOM_SCM @@ -104,7 +104,7 @@ config QCOM_Q6V5_PIL config QCOM_WCNSS_PIL tristate "Qualcomm WCNSS Peripheral Image Loader" depends on OF && ARCH_QCOM - depends on RPMSG_QCOM_SMD || QCOM_SMD || (COMPILE_TEST && QCOM_SMD=n && RPMSG_QCOM_SMD=n) + depends on RPMSG_QCOM_SMD || (COMPILE_TEST && RPMSG_QCOM_SMD=n) depends on QCOM_SMEM depends on REMOTEPROC select QCOM_MDT_LOADER diff --git a/drivers/rpmsg/Kconfig b/drivers/rpmsg/Kconfig index f12ac0b28263..edc008f55663 100644 --- a/drivers/rpmsg/Kconfig +++ b/drivers/rpmsg/Kconfig @@ -16,7 +16,6 @@ config RPMSG_CHAR config RPMSG_QCOM_SMD tristate "Qualcomm Shared Memory Driver (SMD)" depends on QCOM_SMEM - depends on QCOM_SMD=n select RPMSG help Say y here to enable support for the Qualcomm Shared Memory Driver diff --git a/drivers/soc/qcom/Kconfig b/drivers/soc/qcom/Kconfig index 4e090c697eb6..9fca977ef18d 100644 --- a/drivers/soc/qcom/Kconfig +++ b/drivers/soc/qcom/Kconfig @@ -33,14 +33,6 @@ config QCOM_SMEM The driver provides an interface to items in a heap shared among all processors in a Qualcomm platform. 
-config QCOM_SMD - tristate "Qualcomm Shared Memory Driver (SMD)" - depends on QCOM_SMEM - help - Say y here to enable support for the Qualcomm Shared Memory Driver - providing communication channels to remote processors in Qualcomm - platforms. - config QCOM_SMD_RPM tristate "Qualcomm Resource Power Manager (RPM) over SMD" depends on ARCH_QCOM diff --git a/drivers/soc/qcom/Makefile b/drivers/soc/qcom/Makefile index 1f30260b06b8..414f0de274fa 100644 --- a/drivers/soc/qcom/Makefile +++ b/drivers/soc/qcom/Makefile @@ -1,7 +1,6 @@ obj-$(CONFIG_QCOM_GSBI) += qcom_gsbi.o obj-$(CONFIG_QCOM_MDT_LOADER) += mdt_loader.o obj-$(CONFIG_QCOM_PM) += spm.o -obj-$(CONFIG_QCOM_SMD) += smd.o obj-$(CONFIG_QCOM_SMD_RPM) += smd-rpm.o obj-$(CONFIG_QCOM_SMEM) += smem.o obj-$(CONFIG_QCOM_SMEM_STATE) += smem_state.o diff --git a/drivers/soc/qcom/smd.c b/drivers/soc/qcom/smd.c deleted file mode 100644 index 322034ab9d37..000000000000 --- a/drivers/soc/qcom/smd.c +++ /dev/null @@ -1,1560 +0,0 @@ -/* - * Copyright (c) 2015, Sony Mobile Communications AB. - * Copyright (c) 2012-2013, The Linux Foundation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 and - * only version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * The Qualcomm Shared Memory communication solution provides point-to-point - * channels for clients to send and receive streaming or packet based data. - * - * Each channel consists of a control item (channel info) and a ring buffer - * pair. The channel info carry information related to channel state, flow - * control and the offsets within the ring buffer. - * - * All allocated channels are listed in an allocation table, identifying the - * pair of items by name, type and remote processor. - * - * Upon creating a new channel the remote processor allocates channel info and - * ring buffer items from the smem heap and populate the allocation table. An - * interrupt is sent to the other end of the channel and a scan for new - * channels should be done. A channel never goes away, it will only change - * state. - * - * The remote processor signals it intent for bring up the communication - * channel by setting the state of its end of the channel to "opening" and - * sends out an interrupt. We detect this change and register a smd device to - * consume the channel. Upon finding a consumer we finish the handshake and the - * channel is up. - * - * Upon closing a channel, the remote processor will update the state of its - * end of the channel and signal us, we will then unregister any attached - * device and close our end of the channel. - * - * Devices attached to a channel can use the qcom_smd_send function to push - * data to the channel, this is done by copying the data into the tx ring - * buffer, updating the pointers in the channel info and signaling the remote - * processor. - * - * The remote processor does the equivalent when it transfer data and upon - * receiving the interrupt we check the channel info for new data and delivers - * this to the attached device. 
If the device is not ready to receive the data - * we leave it in the ring buffer for now. - */ - -struct smd_channel_info; -struct smd_channel_info_pair; -struct smd_channel_info_word; -struct smd_channel_info_word_pair; - -#define SMD_ALLOC_TBL_COUNT 2 -#define SMD_ALLOC_TBL_SIZE 64 - -/* - * This lists the various smem heap items relevant for the allocation table and - * smd channel entries. - */ -static const struct { - unsigned alloc_tbl_id; - unsigned info_base_id; - unsigned fifo_base_id; -} smem_items[SMD_ALLOC_TBL_COUNT] = { - { - .alloc_tbl_id = 13, - .info_base_id = 14, - .fifo_base_id = 338 - }, - { - .alloc_tbl_id = 266, - .info_base_id = 138, - .fifo_base_id = 202, - }, -}; - -/** - * struct qcom_smd_edge - representing a remote processor - * @dev: device for this edge - * @of_node: of_node handle for information related to this edge - * @edge_id: identifier of this edge - * @remote_pid: identifier of remote processor - * @irq: interrupt for signals on this edge - * @ipc_regmap: regmap handle holding the outgoing ipc register - * @ipc_offset: offset within @ipc_regmap of the register for ipc - * @ipc_bit: bit in the register at @ipc_offset of @ipc_regmap - * @channels: list of all channels detected on this edge - * @channels_lock: guard for modifications of @channels - * @allocated: array of bitmaps representing already allocated channels - * @smem_available: last available amount of smem triggering a channel scan - * @scan_work: work item for discovering new channels - * @state_work: work item for edge state changes - */ -struct qcom_smd_edge { - struct device dev; - - struct device_node *of_node; - unsigned edge_id; - unsigned remote_pid; - - int irq; - - struct regmap *ipc_regmap; - int ipc_offset; - int ipc_bit; - - struct list_head channels; - spinlock_t channels_lock; - - DECLARE_BITMAP(allocated[SMD_ALLOC_TBL_COUNT], SMD_ALLOC_TBL_SIZE); - - unsigned smem_available; - - wait_queue_head_t new_channel_event; - - struct work_struct scan_work; - struct work_struct state_work; -}; - -#define to_smd_edge(d) container_of(d, struct qcom_smd_edge, dev) - -/* - * SMD channel states. 
- */ -enum smd_channel_state { - SMD_CHANNEL_CLOSED, - SMD_CHANNEL_OPENING, - SMD_CHANNEL_OPENED, - SMD_CHANNEL_FLUSHING, - SMD_CHANNEL_CLOSING, - SMD_CHANNEL_RESET, - SMD_CHANNEL_RESET_OPENING -}; - -/** - * struct qcom_smd_channel - smd channel struct - * @edge: qcom_smd_edge this channel is living on - * @qsdev: reference to a associated smd client device - * @name: name of the channel - * @state: local state of the channel - * @remote_state: remote state of the channel - * @info: byte aligned outgoing/incoming channel info - * @info_word: word aligned outgoing/incoming channel info - * @tx_lock: lock to make writes to the channel mutually exclusive - * @fblockread_event: wakeup event tied to tx fBLOCKREADINTR - * @tx_fifo: pointer to the outgoing ring buffer - * @rx_fifo: pointer to the incoming ring buffer - * @fifo_size: size of each ring buffer - * @bounce_buffer: bounce buffer for reading wrapped packets - * @cb: callback function registered for this channel - * @recv_lock: guard for rx info modifications and cb pointer - * @pkt_size: size of the currently handled packet - * @list: lite entry for @channels in qcom_smd_edge - */ -struct qcom_smd_channel { - struct qcom_smd_edge *edge; - - struct qcom_smd_device *qsdev; - - char *name; - enum smd_channel_state state; - enum smd_channel_state remote_state; - - struct smd_channel_info_pair *info; - struct smd_channel_info_word_pair *info_word; - - struct mutex tx_lock; - wait_queue_head_t fblockread_event; - - void *tx_fifo; - void *rx_fifo; - int fifo_size; - - void *bounce_buffer; - qcom_smd_cb_t cb; - - spinlock_t recv_lock; - - int pkt_size; - - void *drvdata; - - struct list_head list; -}; - -/* - * Format of the smd_info smem items, for byte aligned channels. - */ -struct smd_channel_info { - __le32 state; - u8 fDSR; - u8 fCTS; - u8 fCD; - u8 fRI; - u8 fHEAD; - u8 fTAIL; - u8 fSTATE; - u8 fBLOCKREADINTR; - __le32 tail; - __le32 head; -}; - -struct smd_channel_info_pair { - struct smd_channel_info tx; - struct smd_channel_info rx; -}; - -/* - * Format of the smd_info smem items, for word aligned channels. - */ -struct smd_channel_info_word { - __le32 state; - __le32 fDSR; - __le32 fCTS; - __le32 fCD; - __le32 fRI; - __le32 fHEAD; - __le32 fTAIL; - __le32 fSTATE; - __le32 fBLOCKREADINTR; - __le32 tail; - __le32 head; -}; - -struct smd_channel_info_word_pair { - struct smd_channel_info_word tx; - struct smd_channel_info_word rx; -}; - -#define GET_RX_CHANNEL_FLAG(channel, param) \ - ({ \ - BUILD_BUG_ON(sizeof(channel->info->rx.param) != sizeof(u8)); \ - channel->info_word ? \ - le32_to_cpu(channel->info_word->rx.param) : \ - channel->info->rx.param; \ - }) - -#define GET_RX_CHANNEL_INFO(channel, param) \ - ({ \ - BUILD_BUG_ON(sizeof(channel->info->rx.param) != sizeof(u32)); \ - le32_to_cpu(channel->info_word ? 
\ - channel->info_word->rx.param : \ - channel->info->rx.param); \ - }) - -#define SET_RX_CHANNEL_FLAG(channel, param, value) \ - ({ \ - BUILD_BUG_ON(sizeof(channel->info->rx.param) != sizeof(u8)); \ - if (channel->info_word) \ - channel->info_word->rx.param = cpu_to_le32(value); \ - else \ - channel->info->rx.param = value; \ - }) - -#define SET_RX_CHANNEL_INFO(channel, param, value) \ - ({ \ - BUILD_BUG_ON(sizeof(channel->info->rx.param) != sizeof(u32)); \ - if (channel->info_word) \ - channel->info_word->rx.param = cpu_to_le32(value); \ - else \ - channel->info->rx.param = cpu_to_le32(value); \ - }) - -#define GET_TX_CHANNEL_FLAG(channel, param) \ - ({ \ - BUILD_BUG_ON(sizeof(channel->info->tx.param) != sizeof(u8)); \ - channel->info_word ? \ - le32_to_cpu(channel->info_word->tx.param) : \ - channel->info->tx.param; \ - }) - -#define GET_TX_CHANNEL_INFO(channel, param) \ - ({ \ - BUILD_BUG_ON(sizeof(channel->info->tx.param) != sizeof(u32)); \ - le32_to_cpu(channel->info_word ? \ - channel->info_word->tx.param : \ - channel->info->tx.param); \ - }) - -#define SET_TX_CHANNEL_FLAG(channel, param, value) \ - ({ \ - BUILD_BUG_ON(sizeof(channel->info->tx.param) != sizeof(u8)); \ - if (channel->info_word) \ - channel->info_word->tx.param = cpu_to_le32(value); \ - else \ - channel->info->tx.param = value; \ - }) - -#define SET_TX_CHANNEL_INFO(channel, param, value) \ - ({ \ - BUILD_BUG_ON(sizeof(channel->info->tx.param) != sizeof(u32)); \ - if (channel->info_word) \ - channel->info_word->tx.param = cpu_to_le32(value); \ - else \ - channel->info->tx.param = cpu_to_le32(value); \ - }) - -/** - * struct qcom_smd_alloc_entry - channel allocation entry - * @name: channel name - * @cid: channel index - * @flags: channel flags and edge id - * @ref_count: reference count of the channel - */ -struct qcom_smd_alloc_entry { - u8 name[20]; - __le32 cid; - __le32 flags; - __le32 ref_count; -} __packed; - -#define SMD_CHANNEL_FLAGS_EDGE_MASK 0xff -#define SMD_CHANNEL_FLAGS_STREAM BIT(8) -#define SMD_CHANNEL_FLAGS_PACKET BIT(9) - -/* - * Each smd packet contains a 20 byte header, with the first 4 being the length - * of the packet. - */ -#define SMD_PACKET_HEADER_LEN 20 - -/* - * Signal the remote processor associated with 'channel'. 
- */ -static void qcom_smd_signal_channel(struct qcom_smd_channel *channel) -{ - struct qcom_smd_edge *edge = channel->edge; - - regmap_write(edge->ipc_regmap, edge->ipc_offset, BIT(edge->ipc_bit)); -} - -/* - * Initialize the tx channel info - */ -static void qcom_smd_channel_reset(struct qcom_smd_channel *channel) -{ - SET_TX_CHANNEL_INFO(channel, state, SMD_CHANNEL_CLOSED); - SET_TX_CHANNEL_FLAG(channel, fDSR, 0); - SET_TX_CHANNEL_FLAG(channel, fCTS, 0); - SET_TX_CHANNEL_FLAG(channel, fCD, 0); - SET_TX_CHANNEL_FLAG(channel, fRI, 0); - SET_TX_CHANNEL_FLAG(channel, fHEAD, 0); - SET_TX_CHANNEL_FLAG(channel, fTAIL, 0); - SET_TX_CHANNEL_FLAG(channel, fSTATE, 1); - SET_TX_CHANNEL_FLAG(channel, fBLOCKREADINTR, 1); - SET_TX_CHANNEL_INFO(channel, head, 0); - SET_RX_CHANNEL_INFO(channel, tail, 0); - - qcom_smd_signal_channel(channel); - - channel->state = SMD_CHANNEL_CLOSED; - channel->pkt_size = 0; -} - -/* - * Set the callback for a channel, with appropriate locking - */ -static void qcom_smd_channel_set_callback(struct qcom_smd_channel *channel, - qcom_smd_cb_t cb) -{ - unsigned long flags; - - spin_lock_irqsave(&channel->recv_lock, flags); - channel->cb = cb; - spin_unlock_irqrestore(&channel->recv_lock, flags); -}; - -/* - * Calculate the amount of data available in the rx fifo - */ -static size_t qcom_smd_channel_get_rx_avail(struct qcom_smd_channel *channel) -{ - unsigned head; - unsigned tail; - - head = GET_RX_CHANNEL_INFO(channel, head); - tail = GET_RX_CHANNEL_INFO(channel, tail); - - return (head - tail) & (channel->fifo_size - 1); -} - -/* - * Set tx channel state and inform the remote processor - */ -static void qcom_smd_channel_set_state(struct qcom_smd_channel *channel, - int state) -{ - struct qcom_smd_edge *edge = channel->edge; - bool is_open = state == SMD_CHANNEL_OPENED; - - if (channel->state == state) - return; - - dev_dbg(&edge->dev, "set_state(%s, %d)\n", channel->name, state); - - SET_TX_CHANNEL_FLAG(channel, fDSR, is_open); - SET_TX_CHANNEL_FLAG(channel, fCTS, is_open); - SET_TX_CHANNEL_FLAG(channel, fCD, is_open); - - SET_TX_CHANNEL_INFO(channel, state, state); - SET_TX_CHANNEL_FLAG(channel, fSTATE, 1); - - channel->state = state; - qcom_smd_signal_channel(channel); -} - -/* - * Copy count bytes of data using 32bit accesses, if that's required. - */ -static void smd_copy_to_fifo(void __iomem *dst, - const void *src, - size_t count, - bool word_aligned) -{ - if (word_aligned) { - __iowrite32_copy(dst, src, count / sizeof(u32)); - } else { - memcpy_toio(dst, src, count); - } -} - -/* - * Copy count bytes of data using 32bit accesses, if that is required. - */ -static void smd_copy_from_fifo(void *dst, - const void __iomem *src, - size_t count, - bool word_aligned) -{ - if (word_aligned) { - __ioread32_copy(dst, src, count / sizeof(u32)); - } else { - memcpy_fromio(dst, src, count); - } -} - -/* - * Read count bytes of data from the rx fifo into buf, but don't advance the - * tail. - */ -static size_t qcom_smd_channel_peek(struct qcom_smd_channel *channel, - void *buf, size_t count) -{ - bool word_aligned; - unsigned tail; - size_t len; - - word_aligned = channel->info_word; - tail = GET_RX_CHANNEL_INFO(channel, tail); - - len = min_t(size_t, count, channel->fifo_size - tail); - if (len) { - smd_copy_from_fifo(buf, - channel->rx_fifo + tail, - len, - word_aligned); - } - - if (len != count) { - smd_copy_from_fifo(buf + len, - channel->rx_fifo, - count - len, - word_aligned); - } - - return count; -} - -/* - * Advance the rx tail by count bytes. 
- */ -static void qcom_smd_channel_advance(struct qcom_smd_channel *channel, - size_t count) -{ - unsigned tail; - - tail = GET_RX_CHANNEL_INFO(channel, tail); - tail += count; - tail &= (channel->fifo_size - 1); - SET_RX_CHANNEL_INFO(channel, tail, tail); -} - -/* - * Read out a single packet from the rx fifo and deliver it to the device - */ -static int qcom_smd_channel_recv_single(struct qcom_smd_channel *channel) -{ - unsigned tail; - size_t len; - void *ptr; - int ret; - - if (!channel->cb) - return 0; - - tail = GET_RX_CHANNEL_INFO(channel, tail); - - /* Use bounce buffer if the data wraps */ - if (tail + channel->pkt_size >= channel->fifo_size) { - ptr = channel->bounce_buffer; - len = qcom_smd_channel_peek(channel, ptr, channel->pkt_size); - } else { - ptr = channel->rx_fifo + tail; - len = channel->pkt_size; - } - - ret = channel->cb(channel, ptr, len); - if (ret < 0) - return ret; - - /* Only forward the tail if the client consumed the data */ - qcom_smd_channel_advance(channel, len); - - channel->pkt_size = 0; - - return 0; -} - -/* - * Per channel interrupt handling - */ -static bool qcom_smd_channel_intr(struct qcom_smd_channel *channel) -{ - bool need_state_scan = false; - int remote_state; - __le32 pktlen; - int avail; - int ret; - - /* Handle state changes */ - remote_state = GET_RX_CHANNEL_INFO(channel, state); - if (remote_state != channel->remote_state) { - channel->remote_state = remote_state; - need_state_scan = true; - } - /* Indicate that we have seen any state change */ - SET_RX_CHANNEL_FLAG(channel, fSTATE, 0); - - /* Signal waiting qcom_smd_send() about the interrupt */ - if (!GET_TX_CHANNEL_FLAG(channel, fBLOCKREADINTR)) - wake_up_interruptible(&channel->fblockread_event); - - /* Don't consume any data until we've opened the channel */ - if (channel->state != SMD_CHANNEL_OPENED) - goto out; - - /* Indicate that we've seen the new data */ - SET_RX_CHANNEL_FLAG(channel, fHEAD, 0); - - /* Consume data */ - for (;;) { - avail = qcom_smd_channel_get_rx_avail(channel); - - if (!channel->pkt_size && avail >= SMD_PACKET_HEADER_LEN) { - qcom_smd_channel_peek(channel, &pktlen, sizeof(pktlen)); - qcom_smd_channel_advance(channel, SMD_PACKET_HEADER_LEN); - channel->pkt_size = le32_to_cpu(pktlen); - } else if (channel->pkt_size && avail >= channel->pkt_size) { - ret = qcom_smd_channel_recv_single(channel); - if (ret) - break; - } else { - break; - } - } - - /* Indicate that we have seen and updated tail */ - SET_RX_CHANNEL_FLAG(channel, fTAIL, 1); - - /* Signal the remote that we've consumed the data (if requested) */ - if (!GET_RX_CHANNEL_FLAG(channel, fBLOCKREADINTR)) { - /* Ensure ordering of channel info updates */ - wmb(); - - qcom_smd_signal_channel(channel); - } - -out: - return need_state_scan; -} - -/* - * The edge interrupts are triggered by the remote processor on state changes, - * channel info updates or when new channels are created. 
- */ -static irqreturn_t qcom_smd_edge_intr(int irq, void *data) -{ - struct qcom_smd_edge *edge = data; - struct qcom_smd_channel *channel; - unsigned available; - bool kick_scanner = false; - bool kick_state = false; - - /* - * Handle state changes or data on each of the channels on this edge - */ - spin_lock(&edge->channels_lock); - list_for_each_entry(channel, &edge->channels, list) { - spin_lock(&channel->recv_lock); - kick_state |= qcom_smd_channel_intr(channel); - spin_unlock(&channel->recv_lock); - } - spin_unlock(&edge->channels_lock); - - /* - * Creating a new channel requires allocating an smem entry, so we only - * have to scan if the amount of available space in smem have changed - * since last scan. - */ - available = qcom_smem_get_free_space(edge->remote_pid); - if (available != edge->smem_available) { - edge->smem_available = available; - kick_scanner = true; - } - - if (kick_scanner) - schedule_work(&edge->scan_work); - if (kick_state) - schedule_work(&edge->state_work); - - return IRQ_HANDLED; -} - -/* - * Delivers any outstanding packets in the rx fifo, can be used after probe of - * the clients to deliver any packets that wasn't delivered before the client - * was setup. - */ -static void qcom_smd_channel_resume(struct qcom_smd_channel *channel) -{ - unsigned long flags; - - spin_lock_irqsave(&channel->recv_lock, flags); - qcom_smd_channel_intr(channel); - spin_unlock_irqrestore(&channel->recv_lock, flags); -} - -/* - * Calculate how much space is available in the tx fifo. - */ -static size_t qcom_smd_get_tx_avail(struct qcom_smd_channel *channel) -{ - unsigned head; - unsigned tail; - unsigned mask = channel->fifo_size - 1; - - head = GET_TX_CHANNEL_INFO(channel, head); - tail = GET_TX_CHANNEL_INFO(channel, tail); - - return mask - ((head - tail) & mask); -} - -/* - * Write count bytes of data into channel, possibly wrapping in the ring buffer - */ -static int qcom_smd_write_fifo(struct qcom_smd_channel *channel, - const void *data, - size_t count) -{ - bool word_aligned; - unsigned head; - size_t len; - - word_aligned = channel->info_word; - head = GET_TX_CHANNEL_INFO(channel, head); - - len = min_t(size_t, count, channel->fifo_size - head); - if (len) { - smd_copy_to_fifo(channel->tx_fifo + head, - data, - len, - word_aligned); - } - - if (len != count) { - smd_copy_to_fifo(channel->tx_fifo, - data + len, - count - len, - word_aligned); - } - - head += count; - head &= (channel->fifo_size - 1); - SET_TX_CHANNEL_INFO(channel, head, head); - - return count; -} - -/** - * qcom_smd_send - write data to smd channel - * @channel: channel handle - * @data: buffer of data to write - * @len: number of bytes to write - * - * This is a blocking write of len bytes into the channel's tx ring buffer and - * signal the remote end. It will sleep until there is enough space available - * in the tx buffer, utilizing the fBLOCKREADINTR signaling mechanism to avoid - * polling. 
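 *
 * A minimal call from a client might look like this (sketch only; the
 * request structure and its contents are invented for illustration):
 *
 *	struct { __le32 cmd; __le32 arg; } req = {
 *		cpu_to_le32(1), cpu_to_le32(0)
 *	};
 *	int ret;
 *
 *	ret = qcom_smd_send(channel, &req, sizeof(req));
 *	if (ret)
 *		return ret;
 *
 * A return value of 0 means the packet was queued and the remote end
 * signalled; -EPIPE indicates the channel is no longer open. Note that
 * word-aligned channels only accept lengths that are a multiple of 4.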
- */ -int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, int len) -{ - __le32 hdr[5] = { cpu_to_le32(len), }; - int tlen = sizeof(hdr) + len; - int ret; - - /* Word aligned channels only accept word size aligned data */ - if (channel->info_word && len % 4) - return -EINVAL; - - /* Reject packets that are too big */ - if (tlen >= channel->fifo_size) - return -EINVAL; - - ret = mutex_lock_interruptible(&channel->tx_lock); - if (ret) - return ret; - - while (qcom_smd_get_tx_avail(channel) < tlen) { - if (channel->state != SMD_CHANNEL_OPENED) { - ret = -EPIPE; - goto out; - } - - SET_TX_CHANNEL_FLAG(channel, fBLOCKREADINTR, 0); - - ret = wait_event_interruptible(channel->fblockread_event, - qcom_smd_get_tx_avail(channel) >= tlen || - channel->state != SMD_CHANNEL_OPENED); - if (ret) - goto out; - - SET_TX_CHANNEL_FLAG(channel, fBLOCKREADINTR, 1); - } - - SET_TX_CHANNEL_FLAG(channel, fTAIL, 0); - - qcom_smd_write_fifo(channel, hdr, sizeof(hdr)); - qcom_smd_write_fifo(channel, data, len); - - SET_TX_CHANNEL_FLAG(channel, fHEAD, 1); - - /* Ensure ordering of channel info updates */ - wmb(); - - qcom_smd_signal_channel(channel); - -out: - mutex_unlock(&channel->tx_lock); - - return ret; -} -EXPORT_SYMBOL(qcom_smd_send); - -static struct qcom_smd_device *to_smd_device(struct device *dev) -{ - return container_of(dev, struct qcom_smd_device, dev); -} - -static struct qcom_smd_driver *to_smd_driver(struct device *dev) -{ - struct qcom_smd_device *qsdev = to_smd_device(dev); - - return container_of(qsdev->dev.driver, struct qcom_smd_driver, driver); -} - -static int qcom_smd_dev_match(struct device *dev, struct device_driver *drv) -{ - struct qcom_smd_device *qsdev = to_smd_device(dev); - struct qcom_smd_driver *qsdrv = container_of(drv, struct qcom_smd_driver, driver); - const struct qcom_smd_id *match = qsdrv->smd_match_table; - const char *name = qsdev->channel->name; - - if (match) { - while (match->name[0]) { - if (!strcmp(match->name, name)) - return 1; - match++; - } - } - - return of_driver_match_device(dev, drv); -} - -/* - * Helper for opening a channel - */ -static int qcom_smd_channel_open(struct qcom_smd_channel *channel, - qcom_smd_cb_t cb) -{ - size_t bb_size; - - /* - * Packets are maximum 4k, but reduce if the fifo is smaller - */ - bb_size = min(channel->fifo_size, SZ_4K); - channel->bounce_buffer = kmalloc(bb_size, GFP_KERNEL); - if (!channel->bounce_buffer) - return -ENOMEM; - - qcom_smd_channel_set_callback(channel, cb); - qcom_smd_channel_set_state(channel, SMD_CHANNEL_OPENING); - qcom_smd_channel_set_state(channel, SMD_CHANNEL_OPENED); - - return 0; -} - -/* - * Helper for closing and resetting a channel - */ -static void qcom_smd_channel_close(struct qcom_smd_channel *channel) -{ - qcom_smd_channel_set_callback(channel, NULL); - - kfree(channel->bounce_buffer); - channel->bounce_buffer = NULL; - - qcom_smd_channel_set_state(channel, SMD_CHANNEL_CLOSED); - qcom_smd_channel_reset(channel); -} - -/* - * Probe the smd client. - * - * The remote side have indicated that it want the channel to be opened, so - * complete the state handshake and probe our client driver. 
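 *
 * For reference, a minimal client of this bus could be declared along
 * these lines (hypothetical driver, all names invented; the fields
 * match struct qcom_smd_driver as defined in linux/soc/qcom/smd.h):
 *
 *	static const struct qcom_smd_id foo_smd_match[] = {
 *		{ .name = "foo" },
 *		{}
 *	};
 *
 *	static struct qcom_smd_driver foo_smd_driver = {
 *		.probe = foo_probe,
 *		.remove = foo_remove,
 *		.callback = foo_callback,
 *		.smd_match_table = foo_smd_match,
 *		.driver = {
 *			.name = "foo",
 *		},
 *	};
 *	module_qcom_smd_driver(foo_smd_driver);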
- */ -static int qcom_smd_dev_probe(struct device *dev) -{ - struct qcom_smd_device *qsdev = to_smd_device(dev); - struct qcom_smd_driver *qsdrv = to_smd_driver(dev); - struct qcom_smd_channel *channel = qsdev->channel; - int ret; - - ret = qcom_smd_channel_open(channel, qsdrv->callback); - if (ret) - return ret; - - ret = qsdrv->probe(qsdev); - if (ret) - goto err; - - qcom_smd_channel_resume(channel); - - return 0; - -err: - dev_err(&qsdev->dev, "probe failed\n"); - - qcom_smd_channel_close(channel); - return ret; -} - -/* - * Remove the smd client. - * - * The channel is going away, for some reason, so remove the smd client and - * reset the channel state. - */ -static int qcom_smd_dev_remove(struct device *dev) -{ - struct qcom_smd_device *qsdev = to_smd_device(dev); - struct qcom_smd_driver *qsdrv = to_smd_driver(dev); - struct qcom_smd_channel *channel = qsdev->channel; - - qcom_smd_channel_set_state(channel, SMD_CHANNEL_CLOSING); - - /* - * Make sure we don't race with the code receiving data. - */ - qcom_smd_channel_set_callback(channel, NULL); - - /* Wake up any sleepers in qcom_smd_send() */ - wake_up_interruptible(&channel->fblockread_event); - - /* - * We expect that the client might block in remove() waiting for any - * outstanding calls to qcom_smd_send() to wake up and finish. - */ - if (qsdrv->remove) - qsdrv->remove(qsdev); - - /* The client is now gone, close the primary channel */ - qcom_smd_channel_close(channel); - channel->qsdev = NULL; - - return 0; -} - -static struct bus_type qcom_smd_bus = { - .name = "qcom_smd", - .match = qcom_smd_dev_match, - .probe = qcom_smd_dev_probe, - .remove = qcom_smd_dev_remove, -}; - -/* - * Release function for the qcom_smd_device object. - */ -static void qcom_smd_release_device(struct device *dev) -{ - struct qcom_smd_device *qsdev = to_smd_device(dev); - - kfree(qsdev); -} - -/* - * Finds the device_node for the smd child interested in this channel. - */ -static struct device_node *qcom_smd_match_channel(struct device_node *edge_node, - const char *channel) -{ - struct device_node *child; - const char *name; - const char *key; - int ret; - - for_each_available_child_of_node(edge_node, child) { - key = "qcom,smd-channels"; - ret = of_property_read_string(child, key, &name); - if (ret) - continue; - - if (strcmp(name, channel) == 0) - return child; - } - - return NULL; -} - -/* - * Create a smd client device for channel that is being opened. - */ -static int qcom_smd_create_device(struct qcom_smd_channel *channel) -{ - struct qcom_smd_device *qsdev; - struct qcom_smd_edge *edge = channel->edge; - struct device_node *node; - int ret; - - if (channel->qsdev) - return -EEXIST; - - dev_dbg(&edge->dev, "registering '%s'\n", channel->name); - - qsdev = kzalloc(sizeof(*qsdev), GFP_KERNEL); - if (!qsdev) - return -ENOMEM; - - node = qcom_smd_match_channel(edge->of_node, channel->name); - dev_set_name(&qsdev->dev, "%s.%s", - edge->of_node->name, - node ? node->name : channel->name); - - qsdev->dev.parent = &edge->dev; - qsdev->dev.bus = &qcom_smd_bus; - qsdev->dev.release = qcom_smd_release_device; - qsdev->dev.of_node = node; - - qsdev->channel = channel; - - channel->qsdev = qsdev; - - ret = device_register(&qsdev->dev); - if (ret) { - dev_err(&edge->dev, "device_register failed: %d\n", ret); - put_device(&qsdev->dev); - } - - return ret; -} - -/* - * Destroy a smd client device for a channel that's going away. 
- */ -static void qcom_smd_destroy_device(struct qcom_smd_channel *channel) -{ - struct device *dev; - - BUG_ON(!channel->qsdev); - - dev = &channel->qsdev->dev; - - device_unregister(dev); - of_node_put(dev->of_node); - put_device(dev); -} - -/** - * qcom_smd_driver_register - register a smd driver - * @qsdrv: qcom_smd_driver struct - */ -int qcom_smd_driver_register(struct qcom_smd_driver *qsdrv) -{ - qsdrv->driver.bus = &qcom_smd_bus; - return driver_register(&qsdrv->driver); -} -EXPORT_SYMBOL(qcom_smd_driver_register); - -void *qcom_smd_get_drvdata(struct qcom_smd_channel *channel) -{ - return channel->drvdata; -} -EXPORT_SYMBOL(qcom_smd_get_drvdata); - -void qcom_smd_set_drvdata(struct qcom_smd_channel *channel, void *data) -{ - channel->drvdata = data; -} -EXPORT_SYMBOL(qcom_smd_set_drvdata); - -/** - * qcom_smd_driver_unregister - unregister a smd driver - * @qsdrv: qcom_smd_driver struct - */ -void qcom_smd_driver_unregister(struct qcom_smd_driver *qsdrv) -{ - driver_unregister(&qsdrv->driver); -} -EXPORT_SYMBOL(qcom_smd_driver_unregister); - -static struct qcom_smd_channel * -qcom_smd_find_channel(struct qcom_smd_edge *edge, const char *name) -{ - struct qcom_smd_channel *channel; - struct qcom_smd_channel *ret = NULL; - unsigned long flags; - unsigned state; - - spin_lock_irqsave(&edge->channels_lock, flags); - list_for_each_entry(channel, &edge->channels, list) { - if (strcmp(channel->name, name)) - continue; - - state = GET_RX_CHANNEL_INFO(channel, state); - if (state != SMD_CHANNEL_OPENING && - state != SMD_CHANNEL_OPENED) - continue; - - ret = channel; - break; - } - spin_unlock_irqrestore(&edge->channels_lock, flags); - - return ret; -} - -/** - * qcom_smd_open_channel() - claim additional channels on the same edge - * @sdev: smd_device handle - * @name: channel name - * @cb: callback method to use for incoming data - * - * Returns a channel handle on success, or -EPROBE_DEFER if the channel isn't - * ready. - * - * Any channels returned must be closed with a call to qcom_smd_close_channel() - */ -struct qcom_smd_channel *qcom_smd_open_channel(struct qcom_smd_channel *parent, - const char *name, - qcom_smd_cb_t cb) -{ - struct qcom_smd_channel *channel; - struct qcom_smd_device *sdev = parent->qsdev; - struct qcom_smd_edge *edge = parent->edge; - int ret; - - /* Wait up to HZ for the channel to appear */ - ret = wait_event_interruptible_timeout(edge->new_channel_event, - (channel = qcom_smd_find_channel(edge, name)) != NULL, - HZ); - if (!ret) - return ERR_PTR(-ETIMEDOUT); - - if (channel->state != SMD_CHANNEL_CLOSED) { - dev_err(&sdev->dev, "channel %s is busy\n", channel->name); - return ERR_PTR(-EBUSY); - } - - channel->qsdev = sdev; - ret = qcom_smd_channel_open(channel, cb); - if (ret) { - channel->qsdev = NULL; - return ERR_PTR(ret); - } - - return channel; -} -EXPORT_SYMBOL(qcom_smd_open_channel); - -/** - * qcom_smd_close_channel() - close an additionally opened channel - * @channel: channel handle, returned by qcom_smd_open_channel() - */ -void qcom_smd_close_channel(struct qcom_smd_channel *channel) -{ - qcom_smd_channel_close(channel); - channel->qsdev = NULL; -} -EXPORT_SYMBOL(qcom_smd_close_channel); - -/* - * Allocate the qcom_smd_channel object for a newly found smd channel, - * retrieving and validating the smem items involved. 
- */ -static struct qcom_smd_channel *qcom_smd_create_channel(struct qcom_smd_edge *edge, - unsigned smem_info_item, - unsigned smem_fifo_item, - char *name) -{ - struct qcom_smd_channel *channel; - size_t fifo_size; - size_t info_size; - void *fifo_base; - void *info; - int ret; - - channel = devm_kzalloc(&edge->dev, sizeof(*channel), GFP_KERNEL); - if (!channel) - return ERR_PTR(-ENOMEM); - - channel->edge = edge; - channel->name = devm_kstrdup(&edge->dev, name, GFP_KERNEL); - if (!channel->name) - return ERR_PTR(-ENOMEM); - - mutex_init(&channel->tx_lock); - spin_lock_init(&channel->recv_lock); - init_waitqueue_head(&channel->fblockread_event); - - info = qcom_smem_get(edge->remote_pid, smem_info_item, &info_size); - if (IS_ERR(info)) { - ret = PTR_ERR(info); - goto free_name_and_channel; - } - - /* - * Use the size of the item to figure out which channel info struct to - * use. - */ - if (info_size == 2 * sizeof(struct smd_channel_info_word)) { - channel->info_word = info; - } else if (info_size == 2 * sizeof(struct smd_channel_info)) { - channel->info = info; - } else { - dev_err(&edge->dev, - "channel info of size %zu not supported\n", info_size); - ret = -EINVAL; - goto free_name_and_channel; - } - - fifo_base = qcom_smem_get(edge->remote_pid, smem_fifo_item, &fifo_size); - if (IS_ERR(fifo_base)) { - ret = PTR_ERR(fifo_base); - goto free_name_and_channel; - } - - /* The channel consist of a rx and tx fifo of equal size */ - fifo_size /= 2; - - dev_dbg(&edge->dev, "new channel '%s' info-size: %zu fifo-size: %zu\n", - name, info_size, fifo_size); - - channel->tx_fifo = fifo_base; - channel->rx_fifo = fifo_base + fifo_size; - channel->fifo_size = fifo_size; - - qcom_smd_channel_reset(channel); - - return channel; - -free_name_and_channel: - devm_kfree(&edge->dev, channel->name); - devm_kfree(&edge->dev, channel); - - return ERR_PTR(ret); -} - -/* - * Scans the allocation table for any newly allocated channels, calls - * qcom_smd_create_channel() to create representations of these and add - * them to the edge's list of channels. 
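 *
 * Only entries with a name, a non-zero ref_count, the packet flag set
 * and a matching edge id are considered. As an invented example, an
 * entry with flags == 0x203 decodes to edge id 3 (flags & 0xff) with
 * SMD_CHANNEL_FLAGS_PACKET (BIT(9)) set, so it is picked up by an edge
 * whose edge_id is 3 and skipped by every other edge.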
- */ -static void qcom_channel_scan_worker(struct work_struct *work) -{ - struct qcom_smd_edge *edge = container_of(work, struct qcom_smd_edge, scan_work); - struct qcom_smd_alloc_entry *alloc_tbl; - struct qcom_smd_alloc_entry *entry; - struct qcom_smd_channel *channel; - unsigned long flags; - unsigned fifo_id; - unsigned info_id; - int tbl; - int i; - u32 eflags, cid; - - for (tbl = 0; tbl < SMD_ALLOC_TBL_COUNT; tbl++) { - alloc_tbl = qcom_smem_get(edge->remote_pid, - smem_items[tbl].alloc_tbl_id, NULL); - if (IS_ERR(alloc_tbl)) - continue; - - for (i = 0; i < SMD_ALLOC_TBL_SIZE; i++) { - entry = &alloc_tbl[i]; - eflags = le32_to_cpu(entry->flags); - if (test_bit(i, edge->allocated[tbl])) - continue; - - if (entry->ref_count == 0) - continue; - - if (!entry->name[0]) - continue; - - if (!(eflags & SMD_CHANNEL_FLAGS_PACKET)) - continue; - - if ((eflags & SMD_CHANNEL_FLAGS_EDGE_MASK) != edge->edge_id) - continue; - - cid = le32_to_cpu(entry->cid); - info_id = smem_items[tbl].info_base_id + cid; - fifo_id = smem_items[tbl].fifo_base_id + cid; - - channel = qcom_smd_create_channel(edge, info_id, fifo_id, entry->name); - if (IS_ERR(channel)) - continue; - - spin_lock_irqsave(&edge->channels_lock, flags); - list_add(&channel->list, &edge->channels); - spin_unlock_irqrestore(&edge->channels_lock, flags); - - dev_dbg(&edge->dev, "new channel found: '%s'\n", channel->name); - set_bit(i, edge->allocated[tbl]); - - wake_up_interruptible(&edge->new_channel_event); - } - } - - schedule_work(&edge->state_work); -} - -/* - * This per edge worker scans smem for any new channels and register these. It - * then scans all registered channels for state changes that should be handled - * by creating or destroying smd client devices for the registered channels. - * - * LOCKING: edge->channels_lock only needs to cover the list operations, as the - * worker is killed before any channels are deallocated - */ -static void qcom_channel_state_worker(struct work_struct *work) -{ - struct qcom_smd_channel *channel; - struct qcom_smd_edge *edge = container_of(work, - struct qcom_smd_edge, - state_work); - unsigned remote_state; - unsigned long flags; - - /* - * Register a device for any closed channel where the remote processor - * is showing interest in opening the channel. - */ - spin_lock_irqsave(&edge->channels_lock, flags); - list_for_each_entry(channel, &edge->channels, list) { - if (channel->state != SMD_CHANNEL_CLOSED) - continue; - - remote_state = GET_RX_CHANNEL_INFO(channel, state); - if (remote_state != SMD_CHANNEL_OPENING && - remote_state != SMD_CHANNEL_OPENED) - continue; - - spin_unlock_irqrestore(&edge->channels_lock, flags); - qcom_smd_create_device(channel); - spin_lock_irqsave(&edge->channels_lock, flags); - } - - /* - * Unregister the device for any channel that is opened where the - * remote processor is closing the channel. - */ - list_for_each_entry(channel, &edge->channels, list) { - if (channel->state != SMD_CHANNEL_OPENING && - channel->state != SMD_CHANNEL_OPENED) - continue; - - remote_state = GET_RX_CHANNEL_INFO(channel, state); - if (remote_state == SMD_CHANNEL_OPENING || - remote_state == SMD_CHANNEL_OPENED) - continue; - - spin_unlock_irqrestore(&edge->channels_lock, flags); - qcom_smd_destroy_device(channel); - spin_lock_irqsave(&edge->channels_lock, flags); - } - spin_unlock_irqrestore(&edge->channels_lock, flags); -} - -/* - * Parses an of_node describing an edge. 
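 *
 * For a made-up board, the node parsed here might look roughly like
 * this (all values are examples only, not taken from a real board
 * file):
 *
 *	smd-edge {
 *		interrupts = <0 168 IRQ_TYPE_EDGE_RISING>;
 *		qcom,ipc = <&apcs 8 11>;
 *		qcom,smd-edge = <15>;
 *		qcom,remote-pid = <1>;
 *	};
 *
 * The second and third cells of "qcom,ipc" provide the register offset
 * and bit used to signal the remote; "qcom,remote-pid" is optional and
 * defaults to QCOM_SMEM_HOST_ANY.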
- */ -static int qcom_smd_parse_edge(struct device *dev, - struct device_node *node, - struct qcom_smd_edge *edge) -{ - struct device_node *syscon_np; - const char *key; - int irq; - int ret; - - INIT_LIST_HEAD(&edge->channels); - spin_lock_init(&edge->channels_lock); - - INIT_WORK(&edge->scan_work, qcom_channel_scan_worker); - INIT_WORK(&edge->state_work, qcom_channel_state_worker); - - edge->of_node = of_node_get(node); - - key = "qcom,smd-edge"; - ret = of_property_read_u32(node, key, &edge->edge_id); - if (ret) { - dev_err(dev, "edge missing %s property\n", key); - return -EINVAL; - } - - edge->remote_pid = QCOM_SMEM_HOST_ANY; - key = "qcom,remote-pid"; - of_property_read_u32(node, key, &edge->remote_pid); - - syscon_np = of_parse_phandle(node, "qcom,ipc", 0); - if (!syscon_np) { - dev_err(dev, "no qcom,ipc node\n"); - return -ENODEV; - } - - edge->ipc_regmap = syscon_node_to_regmap(syscon_np); - if (IS_ERR(edge->ipc_regmap)) - return PTR_ERR(edge->ipc_regmap); - - key = "qcom,ipc"; - ret = of_property_read_u32_index(node, key, 1, &edge->ipc_offset); - if (ret < 0) { - dev_err(dev, "no offset in %s\n", key); - return -EINVAL; - } - - ret = of_property_read_u32_index(node, key, 2, &edge->ipc_bit); - if (ret < 0) { - dev_err(dev, "no bit in %s\n", key); - return -EINVAL; - } - - irq = irq_of_parse_and_map(node, 0); - if (irq < 0) { - dev_err(dev, "required smd interrupt missing\n"); - return -EINVAL; - } - - ret = devm_request_irq(dev, irq, - qcom_smd_edge_intr, IRQF_TRIGGER_RISING, - node->name, edge); - if (ret) { - dev_err(dev, "failed to request smd irq\n"); - return ret; - } - - edge->irq = irq; - - return 0; -} - -/* - * Release function for an edge. - * Reset the state of each associated channel and free the edge context. - */ -static void qcom_smd_edge_release(struct device *dev) -{ - struct qcom_smd_channel *channel; - struct qcom_smd_edge *edge = to_smd_edge(dev); - - list_for_each_entry(channel, &edge->channels, list) { - SET_RX_CHANNEL_INFO(channel, state, SMD_CHANNEL_CLOSED); - SET_RX_CHANNEL_INFO(channel, head, 0); - SET_RX_CHANNEL_INFO(channel, tail, 0); - } - - kfree(edge); -} - -/** - * qcom_smd_register_edge() - register an edge based on an device_node - * @parent: parent device for the edge - * @node: device_node describing the edge - * - * Returns an edge reference, or negative ERR_PTR() on failure. 
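 *
 * A typical caller pairs it with qcom_smd_unregister_edge(); as a
 * sketch (error handling trimmed, "np" is an illustrative device_node
 * pointer):
 *
 *	edge = qcom_smd_register_edge(&pdev->dev, np);
 *	if (IS_ERR(edge))
 *		return PTR_ERR(edge);
 *	...
 *	qcom_smd_unregister_edge(edge);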
- */ -struct qcom_smd_edge *qcom_smd_register_edge(struct device *parent, - struct device_node *node) -{ - struct qcom_smd_edge *edge; - int ret; - - edge = kzalloc(sizeof(*edge), GFP_KERNEL); - if (!edge) - return ERR_PTR(-ENOMEM); - - init_waitqueue_head(&edge->new_channel_event); - - edge->dev.parent = parent; - edge->dev.release = qcom_smd_edge_release; - dev_set_name(&edge->dev, "%s:%s", dev_name(parent), node->name); - ret = device_register(&edge->dev); - if (ret) { - pr_err("failed to register smd edge\n"); - return ERR_PTR(ret); - } - - ret = qcom_smd_parse_edge(&edge->dev, node, edge); - if (ret) { - dev_err(&edge->dev, "failed to parse smd edge\n"); - goto unregister_dev; - } - - schedule_work(&edge->scan_work); - - return edge; - -unregister_dev: - put_device(&edge->dev); - return ERR_PTR(ret); -} -EXPORT_SYMBOL(qcom_smd_register_edge); - -static int qcom_smd_remove_device(struct device *dev, void *data) -{ - device_unregister(dev); - of_node_put(dev->of_node); - put_device(dev); - - return 0; -} - -/** - * qcom_smd_unregister_edge() - release an edge and its children - * @edge: edge reference acquired from qcom_smd_register_edge - */ -int qcom_smd_unregister_edge(struct qcom_smd_edge *edge) -{ - int ret; - - disable_irq(edge->irq); - cancel_work_sync(&edge->scan_work); - cancel_work_sync(&edge->state_work); - - ret = device_for_each_child(&edge->dev, NULL, qcom_smd_remove_device); - if (ret) - dev_warn(&edge->dev, "can't remove smd device: %d\n", ret); - - device_unregister(&edge->dev); - - return 0; -} -EXPORT_SYMBOL(qcom_smd_unregister_edge); - -static int qcom_smd_probe(struct platform_device *pdev) -{ - struct device_node *node; - void *p; - - /* Wait for smem */ - p = qcom_smem_get(QCOM_SMEM_HOST_ANY, smem_items[0].alloc_tbl_id, NULL); - if (PTR_ERR(p) == -EPROBE_DEFER) - return PTR_ERR(p); - - for_each_available_child_of_node(pdev->dev.of_node, node) - qcom_smd_register_edge(&pdev->dev, node); - - return 0; -} - -static int qcom_smd_remove_edge(struct device *dev, void *data) -{ - struct qcom_smd_edge *edge = to_smd_edge(dev); - - return qcom_smd_unregister_edge(edge); -} - -/* - * Shut down all smd clients by making sure that each edge stops processing - * events and scanning for new channels, then call destroy on the devices. 
- */ -static int qcom_smd_remove(struct platform_device *pdev) -{ - int ret; - - ret = device_for_each_child(&pdev->dev, NULL, qcom_smd_remove_edge); - if (ret) - dev_warn(&pdev->dev, "can't remove smd device: %d\n", ret); - - return ret; -} - -static const struct of_device_id qcom_smd_of_match[] = { - { .compatible = "qcom,smd" }, - {} -}; -MODULE_DEVICE_TABLE(of, qcom_smd_of_match); - -static struct platform_driver qcom_smd_driver = { - .probe = qcom_smd_probe, - .remove = qcom_smd_remove, - .driver = { - .name = "qcom-smd", - .of_match_table = qcom_smd_of_match, - }, -}; - -static int __init qcom_smd_init(void) -{ - int ret; - - ret = bus_register(&qcom_smd_bus); - if (ret) { - pr_err("failed to register smd bus: %d\n", ret); - return ret; - } - - return platform_driver_register(&qcom_smd_driver); -} -postcore_initcall(qcom_smd_init); - -static void __exit qcom_smd_exit(void) -{ - platform_driver_unregister(&qcom_smd_driver); - bus_unregister(&qcom_smd_bus); -} -module_exit(qcom_smd_exit); - -MODULE_AUTHOR("Bjorn Andersson "); -MODULE_DESCRIPTION("Qualcomm Shared Memory Driver"); -MODULE_LICENSE("GPL v2"); diff --git a/include/linux/rpmsg/qcom_smd.h b/include/linux/rpmsg/qcom_smd.h index 8ec8b6439b25..f27917e0a101 100644 --- a/include/linux/rpmsg/qcom_smd.h +++ b/include/linux/rpmsg/qcom_smd.h @@ -6,7 +6,7 @@ struct qcom_smd_edge; -#if IS_ENABLED(CONFIG_RPMSG_QCOM_SMD) || IS_ENABLED(CONFIG_QCOM_SMD) +#if IS_ENABLED(CONFIG_RPMSG_QCOM_SMD) struct qcom_smd_edge *qcom_smd_register_edge(struct device *parent, struct device_node *node); diff --git a/include/linux/soc/qcom/smd.h b/include/linux/soc/qcom/smd.h deleted file mode 100644 index f148e0ffbec7..000000000000 --- a/include/linux/soc/qcom/smd.h +++ /dev/null @@ -1,139 +0,0 @@ -#ifndef __QCOM_SMD_H__ -#define __QCOM_SMD_H__ - -#include -#include - -struct qcom_smd; -struct qcom_smd_channel; -struct qcom_smd_lookup; - -/** - * struct qcom_smd_id - struct used for matching a smd device - * @name: name of the channel - */ -struct qcom_smd_id { - char name[20]; -}; - -/** - * struct qcom_smd_device - smd device struct - * @dev: the device struct - * @channel: handle to the smd channel for this device - */ -struct qcom_smd_device { - struct device dev; - struct qcom_smd_channel *channel; -}; - -typedef int (*qcom_smd_cb_t)(struct qcom_smd_channel *, const void *, size_t); - -/** - * struct qcom_smd_driver - smd driver struct - * @driver: underlying device driver - * @smd_match_table: static channel match table - * @probe: invoked when the smd channel is found - * @remove: invoked when the smd channel is closed - * @callback: invoked when an inbound message is received on the channel, - * should return 0 on success or -EBUSY if the data cannot be - * consumed at this time - */ -struct qcom_smd_driver { - struct device_driver driver; - const struct qcom_smd_id *smd_match_table; - - int (*probe)(struct qcom_smd_device *dev); - void (*remove)(struct qcom_smd_device *dev); - qcom_smd_cb_t callback; -}; - -#if IS_ENABLED(CONFIG_QCOM_SMD) - -int qcom_smd_driver_register(struct qcom_smd_driver *drv); -void qcom_smd_driver_unregister(struct qcom_smd_driver *drv); - -struct qcom_smd_channel *qcom_smd_open_channel(struct qcom_smd_channel *channel, - const char *name, - qcom_smd_cb_t cb); -void qcom_smd_close_channel(struct qcom_smd_channel *channel); -void *qcom_smd_get_drvdata(struct qcom_smd_channel *channel); -void qcom_smd_set_drvdata(struct qcom_smd_channel *channel, void *data); -int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, 
int len); - - -struct qcom_smd_edge *qcom_smd_register_edge(struct device *parent, - struct device_node *node); -int qcom_smd_unregister_edge(struct qcom_smd_edge *edge); - -#else - -static inline int qcom_smd_driver_register(struct qcom_smd_driver *drv) -{ - return -ENXIO; -} - -static inline void qcom_smd_driver_unregister(struct qcom_smd_driver *drv) -{ - /* This shouldn't be possible */ - WARN_ON(1); -} - -static inline struct qcom_smd_channel * -qcom_smd_open_channel(struct qcom_smd_channel *channel, - const char *name, - qcom_smd_cb_t cb) -{ - /* This shouldn't be possible */ - WARN_ON(1); - return NULL; -} - -static inline void qcom_smd_close_channel(struct qcom_smd_channel *channel) -{ - /* This shouldn't be possible */ - WARN_ON(1); -} - -static inline void *qcom_smd_get_drvdata(struct qcom_smd_channel *channel) -{ - /* This shouldn't be possible */ - WARN_ON(1); - return NULL; -} - -static inline void qcom_smd_set_drvdata(struct qcom_smd_channel *channel, void *data) -{ - /* This shouldn't be possible */ - WARN_ON(1); -} - -static inline int qcom_smd_send(struct qcom_smd_channel *channel, - const void *data, int len) -{ - /* This shouldn't be possible */ - WARN_ON(1); - return -ENXIO; -} - -static inline struct qcom_smd_edge * -qcom_smd_register_edge(struct device *parent, - struct device_node *node) -{ - return ERR_PTR(-ENXIO); -} - -static inline int qcom_smd_unregister_edge(struct qcom_smd_edge *edge) -{ - /* This shouldn't be possible */ - WARN_ON(1); - return -ENXIO; -} - -#endif - -#define module_qcom_smd_driver(__smd_driver) \ - module_driver(__smd_driver, qcom_smd_driver_register, \ - qcom_smd_driver_unregister) - - -#endif -- cgit v1.2.3 From 3b0228656dcb07a1c9fc81e8516475c2d7c4300e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 28 Mar 2017 14:28:02 -0700 Subject: net: devinet: Refactor inet_netconf_notify_devconf to take event Refactor inet_netconf_notify_devconf to take the event as an input arg. Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- include/linux/inetdevice.h | 4 ++-- net/ipv4/devinet.c | 32 ++++++++++++++++++++------------ net/ipv4/ipmr.c | 12 +++++++----- 3 files changed, 29 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index ee971f335a8b..a2e9d6ea1349 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -153,8 +153,8 @@ struct in_ifaddr { int register_inetaddr_notifier(struct notifier_block *nb); int unregister_inetaddr_notifier(struct notifier_block *nb); -void inet_netconf_notify_devconf(struct net *net, int type, int ifindex, - struct ipv4_devconf *devconf); +void inet_netconf_notify_devconf(struct net *net, int event, int type, + int ifindex, struct ipv4_devconf *devconf); struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref); static inline struct net_device *ip_dev_find(struct net *net, __be32 addr) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 927f1d4b8c80..fd3218cd1870 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1831,8 +1831,8 @@ nla_put_failure: return -EMSGSIZE; } -void inet_netconf_notify_devconf(struct net *net, int type, int ifindex, - struct ipv4_devconf *devconf) +void inet_netconf_notify_devconf(struct net *net, int event, int type, + int ifindex, struct ipv4_devconf *devconf) { struct sk_buff *skb; int err = -ENOBUFS; @@ -1842,7 +1842,7 @@ void inet_netconf_notify_devconf(struct net *net, int type, int ifindex, goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0, - RTM_NEWNETCONF, 0, type); + event, 0, type); if (err < 0) { /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); @@ -2021,10 +2021,12 @@ static void inet_forward_change(struct net *net) IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; IPV4_DEVCONF_DFLT(net, FORWARDING) = on; - inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); - inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt); @@ -2037,7 +2039,8 @@ static void inet_forward_change(struct net *net) in_dev = __in_dev_get_rtnl(dev); if (in_dev) { IN_DEV_CONF_SET(in_dev, FORWARDING, on); - inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORWARDING, dev->ifindex, &in_dev->cnf); } } @@ -2082,19 +2085,22 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, if (i == IPV4_DEVCONF_RP_FILTER - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); - inet_netconf_notify_devconf(net, NETCONFA_RP_FILTER, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_RP_FILTER, ifindex, cnf); } if (i == IPV4_DEVCONF_PROXY_ARP - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); - inet_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_PROXY_NEIGH, ifindex, cnf); } if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); - inet_netconf_notify_devconf(net, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, ifindex, cnf); } } @@ -2129,7 +2135,7 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write, 
container_of(cnf, struct in_device, cnf); if (*valp) dev_disable_lro(idev->dev); - inet_netconf_notify_devconf(net, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, idev->dev->ifindex, cnf); @@ -2137,7 +2143,8 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write, rtnl_unlock(); rt_cache_flush(net); } else - inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt); } @@ -2259,7 +2266,8 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, p->sysctl = t; - inet_netconf_notify_devconf(net, NETCONFA_ALL, ifindex, p); + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL, + ifindex, p); return 0; free: diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c0317c940bcd..5bca64fc71b7 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -631,7 +631,7 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, in_dev = __in_dev_get_rtnl(dev); if (in_dev) { IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; - inet_netconf_notify_devconf(dev_net(dev), + inet_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in_dev->cnf); ip_rt_multicast_event(in_dev); @@ -820,8 +820,8 @@ static int vif_add(struct net *net, struct mr_table *mrt, return -EADDRNOTAVAIL; } IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; - inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, dev->ifindex, - &in_dev->cnf); + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, + dev->ifindex, &in_dev->cnf); ip_rt_multicast_event(in_dev); /* Fill in the VIF structures */ @@ -1282,7 +1282,8 @@ static void mrtsock_destruct(struct sock *sk) ipmr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; - inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); RCU_INIT_POINTER(mrt->mroute_sk, NULL); @@ -1344,7 +1345,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, if (ret == 0) { rcu_assign_pointer(mrt->mroute_sk, sk); IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; - inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); } -- cgit v1.2.3 From c6e970a04bdceb7ef1fdbac6be3bd4cd0a0a02bd Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 28 Mar 2017 23:45:06 +0200 Subject: net: break include loop netdevice.h, dsa.h, devlink.h There is an include loop between netdevice.h, dsa.h, devlink.h because of NETDEV_ALIGN, making it impossible to use devlink structures in dsa.h. Break this loop by taking dsa.h out of netdevice.h, add a forward declaration of dsa_switch_tree and netdev_set_default_ethtool_ops() function, which is what netdevice.h requires. No longer having dsa.h in netdevice.h means the includes in dsa.h no longer get included. This breaks a few other files which depend on these includes. Add these directly in the affected file. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/dsa/bcm_sf2_cfp.c | 3 ++- drivers/net/dsa/mv88e6xxx/mv88e6xxx.h | 1 + drivers/net/ethernet/broadcom/bcmsysport.c | 1 + drivers/net/ethernet/freescale/fman/fman_memac.h | 1 + drivers/net/ethernet/hisilicon/hns/hnae.c | 2 +- drivers/net/ieee802154/mrf24j40.c | 1 + drivers/net/phy/smsc.c | 1 + drivers/net/usb/lan78xx.c | 1 + fs/cifs/cifsfs.c | 1 + fs/cifs/connect.c | 1 + fs/cifs/smb2pdu.c | 1 + include/linux/netdevice.h | 12 ++---------- include/net/dsa.h | 9 +++++++++ net/bridge/br_if.c | 1 + net/core/netprio_cgroup.c | 1 + net/dsa/dsa.c | 3 ++- net/dsa/dsa2.c | 3 ++- net/dsa/slave.c | 1 + net/dsa/tag_brcm.c | 1 + net/dsa/tag_dsa.c | 1 + net/dsa/tag_edsa.c | 1 + net/dsa/tag_qca.c | 1 + net/dsa/tag_trailer.c | 1 + net/ipv4/ipconfig.c | 1 + 24 files changed, 36 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/dsa/bcm_sf2_cfp.c b/drivers/net/dsa/bcm_sf2_cfp.c index 346dd9a1232d..2fb32d67065f 100644 --- a/drivers/net/dsa/bcm_sf2_cfp.c +++ b/drivers/net/dsa/bcm_sf2_cfp.c @@ -10,10 +10,11 @@ */ #include -#include #include #include #include +#include +#include #include #include "bcm_sf2.h" diff --git a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h index 75be2c339a49..55367d05374e 100644 --- a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h +++ b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h @@ -16,6 +16,7 @@ #include #include #include +#include #ifndef UINT64_MAX #define UINT64_MAX (u64)(~((u64)0)) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 61e26c6b26ab..099b374c1b17 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/ethernet/freescale/fman/fman_memac.h b/drivers/net/ethernet/freescale/fman/fman_memac.h index 173d8e0fd716..c4a66469a907 100644 --- a/drivers/net/ethernet/freescale/fman/fman_memac.h +++ b/drivers/net/ethernet/freescale/fman/fman_memac.h @@ -36,6 +36,7 @@ #include "fman_mac.h" #include +#include struct fman_mac *memac_config(struct fman_mac_params *params); int memac_set_promiscuous(struct fman_mac *memac, bool new_val); diff --git a/drivers/net/ethernet/hisilicon/hns/hnae.c b/drivers/net/ethernet/hisilicon/hns/hnae.c index b6ed818f78ff..120427a40883 100644 --- a/drivers/net/ethernet/hisilicon/hns/hnae.c +++ b/drivers/net/ethernet/hisilicon/hns/hnae.c @@ -9,9 +9,9 @@ #include #include +#include #include #include - #include "hnae.h" #define cls_to_ae_dev(dev) container_of(dev, struct hnae_ae_dev, cls_dev) diff --git a/drivers/net/ieee802154/mrf24j40.c b/drivers/net/ieee802154/mrf24j40.c index 7b131f8e4093..bd63289c55e8 100644 --- a/drivers/net/ieee802154/mrf24j40.c +++ b/drivers/net/ieee802154/mrf24j40.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/phy/smsc.c b/drivers/net/phy/smsc.c index fb32eaf2255d..cef6967b0396 100644 --- a/drivers/net/phy/smsc.c +++ b/drivers/net/phy/smsc.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index d885e0325422..a17e32bf5f92 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 15e1db8738ae..8c91f37ac0eb 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c 
@@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include "cifsfs.h" diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9ae695ae3ed7..858698dcde3c 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 7446496850a3..fb75fe908225 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include "smb2pdu.h" diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b7365b587818..cc07c3be2705 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -41,7 +41,6 @@ #include #include -#include #ifdef CONFIG_DCB #include #endif @@ -57,6 +56,8 @@ struct netpoll_info; struct device; struct phy_device; +struct dsa_switch_tree; + /* 802.11 specific */ struct wireless_dev; /* 802.15.4 specific */ @@ -2004,15 +2005,6 @@ void dev_net_set(struct net_device *dev, struct net *net) write_pnet(&dev->nd_net, net); } -static inline bool netdev_uses_dsa(struct net_device *dev) -{ -#if IS_ENABLED(CONFIG_NET_DSA) - if (dev->dsa_ptr != NULL) - return dsa_uses_tagged_protocol(dev->dsa_ptr); -#endif - return false; -} - /** * netdev_priv - access network device private data * @dev: network device diff --git a/include/net/dsa.h b/include/net/dsa.h index e42897fd7a96..f80e81912b83 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -468,6 +468,15 @@ static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst) return dst->rcv != NULL; } +static inline bool netdev_uses_dsa(struct net_device *dev) +{ +#if IS_ENABLED(CONFIG_NET_DSA) + if (dev->dsa_ptr != NULL) + return dsa_uses_tagged_protocol(dev->dsa_ptr); +#endif + return false; +} + struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n); void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds, struct device *dev); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 8ac1770aa222..6eb52d422dd9 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 0f9275ee5595..1c4810919a0a 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index b6d4f6a23f06..95d1a756202c 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -14,15 +14,16 @@ #include #include #include -#include #include #include #include #include #include +#include #include #include #include +#include #include "dsa_priv.h" static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb, diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 737be6470c7f..d039c8d7adfd 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -13,11 +13,12 @@ #include #include #include +#include #include #include -#include #include #include +#include #include "dsa_priv.h" static LIST_HEAD(dsa_switch_trees); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 78128acfbf63..7693182df81e 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 5d925b6b2bb1..e2ed6cf68261 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -12,6 +12,7 @@ #include #include 
#include +#include #include "dsa_priv.h" /* This tag length is 4 bytes, older ones were 6 bytes, we do not diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 72579ceea381..e42ba906100c 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "dsa_priv.h" #define DSA_HLEN 4 diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 648c051817a1..6a9b7a9e4e15 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "dsa_priv.h" #define DSA_HLEN 4 diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 30240f343aea..4e0dad759d04 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -12,6 +12,7 @@ */ #include +#include #include "dsa_priv.h" #define QCA_HDR_LEN 2 diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 26f977176978..74c948512550 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "dsa_priv.h" static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index fd9f34bbd740..9def8ed31c76 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3
From ed77d6bcafd75d247cf3c6ad685aa221cda1b8ba Mon Sep 17 00:00:00 2001 From: Emiliano Ingrassia Date: Tue, 28 Mar 2017 09:49:29 +0200 Subject: spi: dynamically allocated message initialization Invoke the proper function while initializing a dynamically allocated spi_message to avoid a NULL pointer dereference during resource deallocation. Signed-off-by: Emiliano Ingrassia Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 75c6bd0ac605..3b0070695375 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -891,7 +891,7 @@ static inline struct spi_message *spi_message_alloc(unsigned ntrans, gfp_t flags unsigned i; struct spi_transfer *t = (struct spi_transfer *)(m + 1); - INIT_LIST_HEAD(&m->transfers); + spi_message_init_no_memset(m); for (i = 0; i < ntrans; i++, t++) spi_message_add_tail(t, m); } -- cgit v1.2.3
From fd086045559d90cd7854818b4c60a7119eda6231 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Mon, 27 Mar 2017 16:54:12 -0700 Subject: regulator: core: Limit propagation of parent voltage count and list Commit 26988efe11b1 ("regulator: core: Allow to get voltage count and list from parent") introduced the propagation of the parent voltage count and list for regulators that don't provide this information themselves. The goal is to support simple switch regulators; however, as a side effect, normal continuous regulators can leak details of their supplies and provide consumers with inconsistent information. Limit the propagation of the voltage count and list to switch regulators.
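In practice only a regulator that provides neither a voltage operation nor a fixed voltage is flagged as a switch. A hypothetical driver fragment for such a regulator (all names invented) could look like:

	static const struct regulator_ops foo_switch_ops = {
		/* on/off control only, no voltage ops */
		.enable = regulator_enable_regmap,
		.disable = regulator_disable_regmap,
		.is_enabled = regulator_is_enabled_regmap,
	};

	static const struct regulator_desc foo_switch_desc = {
		.name = "foo-switch",
		.ops = &foo_switch_ops,
		/* no fixed_uV either, so the core sets rdev->is_switch */
	};

With such a descriptor, regulator_count_voltages() and regulator_list_voltage() fall through to the supply; a regulator with its own get_voltage() or list_voltage() no longer does.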
Fixes: 26988efe11b1 ("regulator: core: Allow to get voltage count and list from parent") Signed-off-by: Matthias Kaehlcke Reviewed-by: Javier Martinez Canillas Tested-by: Javier Martinez Canillas Signed-off-by: Mark Brown --- drivers/regulator/core.c | 9 +++++++-- include/linux/regulator/driver.h | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index c20b28a63d15..aff302dfab5d 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -2484,7 +2484,7 @@ static int _regulator_list_voltage(struct regulator *regulator, ret = ops->list_voltage(rdev, selector); if (lock) mutex_unlock(&rdev->mutex); - } else if (rdev->supply) { + } else if (rdev->is_switch && rdev->supply) { ret = _regulator_list_voltage(rdev->supply, selector, lock); } else { return -EINVAL; @@ -2542,7 +2542,7 @@ int regulator_count_voltages(struct regulator *regulator) if (rdev->desc->n_voltages) return rdev->desc->n_voltages; - if (!rdev->supply) + if (!rdev->is_switch || !rdev->supply) return -EINVAL; return regulator_count_voltages(rdev->supply); @@ -4097,6 +4097,11 @@ regulator_register(const struct regulator_desc *regulator_desc, mutex_unlock(®ulator_list_mutex); } + if (!rdev->desc->ops->get_voltage && + !rdev->desc->ops->list_voltage && + !rdev->desc->fixed_uV) + rdev->is_switch = true; + ret = device_register(&rdev->dev); if (ret != 0) { put_device(&rdev->dev); diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index dac8e7b16bc6..4cb1c9be6073 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -429,6 +429,8 @@ struct regulator_dev { struct regulator_enable_gpio *ena_pin; unsigned int ena_gpio_state:1; + unsigned int is_switch:1; + /* time when this regulator was disabled last time */ unsigned long last_off_jiffy; }; -- cgit v1.2.3 From 1671d522cdd9933dee7dddfcf9f62c561283824a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 27 Mar 2017 20:06:57 +0800 Subject: block: rename blk_mq_freeze_queue_start() As the .q_usage_counter is used by both legacy and mq path, we need to block new I/O if queue becomes dead in blk_queue_enter(). So rename it and we can use this function in both paths. 
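After the rename, a driver that needs the split freeze sequence (as mtip32xx and nvme do below) uses the functions roughly as follows (sketch only):

	blk_freeze_queue_start(q);	/* drop the initial q_usage_counter ref */
	/* ... quiesce hardware, cancel outstanding requests ... */
	blk_mq_freeze_queue_wait(q);	/* wait for in-flight I/O to drain */
	/* ... reconfigure ... */
	blk_mq_unfreeze_queue(q);	/* resume normal operation */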
Reviewed-by: Bart Van Assche Reviewed-by: Hannes Reinecke Signed-off-by: Ming Lei Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-mq.c | 10 +++++----- drivers/block/mtip32xx/mtip32xx.c | 2 +- drivers/nvme/host/core.c | 2 +- include/linux/blk-mq.h | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 6373febc7716..7b66f76f9cff 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -670,7 +670,7 @@ int blk_queue_enter(struct request_queue *q, bool nowait) return -EBUSY; /* - * read pair of barrier in blk_mq_freeze_queue_start(), + * read pair of barrier in blk_freeze_queue_start(), * we need to order reading __PERCPU_REF_DEAD flag of * .q_usage_counter and reading .mq_freeze_depth, * otherwise the following wait may never return if the diff --git a/block/blk-mq.c b/block/blk-mq.c index baebd6c8210e..0ed00eca4d5a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -68,7 +68,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); } -void blk_mq_freeze_queue_start(struct request_queue *q) +void blk_freeze_queue_start(struct request_queue *q) { int freeze_depth; @@ -78,7 +78,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q) blk_mq_run_hw_queues(q, false); } } -EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); +EXPORT_SYMBOL_GPL(blk_freeze_queue_start); void blk_mq_freeze_queue_wait(struct request_queue *q) { @@ -108,7 +108,7 @@ void blk_freeze_queue(struct request_queue *q) * no blk_unfreeze_queue(), and blk_freeze_queue() is not * exported to drivers as the only user for unfreeze is blk_mq. */ - blk_mq_freeze_queue_start(q); + blk_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } @@ -746,7 +746,7 @@ static void blk_mq_timeout_work(struct work_struct *work) * percpu_ref_tryget directly, because we need to be able to * obtain a reference even in the short window between the queue * starting to freeze, by dropping the first reference in - * blk_mq_freeze_queue_start, and the moment the last request is + * blk_freeze_queue_start, and the moment the last request is * consumed, marked by the instant q_usage_counter reaches * zero. */ @@ -2376,7 +2376,7 @@ static void blk_mq_queue_reinit_work(void) * take place in parallel. 
*/ list_for_each_entry(q, &all_q_list, all_q_node) - blk_mq_freeze_queue_start(q); + blk_freeze_queue_start(q); list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_freeze_queue_wait(q); diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index f96ab717534c..c96c35ab39df 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -4162,7 +4162,7 @@ static int mtip_block_remove(struct driver_data *dd) dev_info(&dd->pdev->dev, "device %s surprise removal\n", dd->disk->disk_name); - blk_mq_freeze_queue_start(dd->queue); + blk_freeze_queue_start(dd->queue); blk_mq_stop_hw_queues(dd->queue); blk_mq_tagset_busy_iter(&dd->tags, mtip_no_dev_cleanup, dd); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9b3b57fef446..4a6d7f408769 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2386,7 +2386,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl) mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry(ns, &ctrl->namespaces, list) - blk_mq_freeze_queue_start(ns->queue); + blk_freeze_queue_start(ns->queue); mutex_unlock(&ctrl->namespaces_mutex); } EXPORT_SYMBOL_GPL(nvme_start_freeze); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 5b3e201c8d4f..ea2e9dcd3aef 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -243,7 +243,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); -void blk_mq_freeze_queue_start(struct request_queue *q); +void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); -- cgit v1.2.3 From 334335d2f7a077a5ff561d86b0ad43bedd83ca05 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 28 Mar 2017 16:12:15 -0700 Subject: block: warn if sharing request queue across gendisks Now that the remaining drivers have been converted to one request queue per gendisk, let's warn if a request queue gets registered more than once. This will catch future drivers which might do it inadvertently or any old drivers that I may have missed. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 7 +++++++ include/linux/blkdev.h | 1 + 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 7f090dd15ca6..833fb7f9ce9d 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -871,6 +871,11 @@ int blk_register_queue(struct gendisk *disk) if (WARN_ON(!q)) return -ENXIO; + WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags), + "%s is registering an already registered queue\n", + kobject_name(&dev->kobj)); + queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q); + /* * SCSI probing may synchronously create and destroy a lot of * request_queues for non-existent devices. 
Shutting down a fully @@ -931,6 +936,8 @@ void blk_unregister_queue(struct gendisk *disk) if (WARN_ON(!q)) return; + queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q); + if (q->mq_ops) blk_mq_unregister_dev(disk_to_dev(disk), q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1a7dc42a8918..a2dc6b390d48 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -617,6 +617,7 @@ struct request_queue { #define QUEUE_FLAG_STATS 27 /* track rq completion times */ #define QUEUE_FLAG_RESTART 28 /* queue needs restart at completion */ #define QUEUE_FLAG_POLL_STATS 29 /* collecting stats for hybrid polling */ +#define QUEUE_FLAG_REGISTERED 30 /* queue has been registered to a disk */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- cgit v1.2.3
From d708f0d5026f48081debdd1c5b0a5636455a9589 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 29 Mar 2017 11:25:48 -0600 Subject: Revert "blkcg: allocate struct blkcg_gq outside request queue spinlock" I inadvertently applied an earlier version of this patch, whereas the agreed-upon version was v5. Revert this one so we can apply the right one. This reverts commit 7fc6b87a9ff537e7df32b1278118ce9c5bcd6788. --- block/blk-cgroup.c | 138 +++++++++++++++++---------------------- include/linux/blk-cgroup.h | 6 +- 2 files changed, 53 insertions(+), 91 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index bdf87f0c1b1b..bbe7ee00bd3d 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -165,18 +165,16 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); /* - * If gfp mask allows blocking, this function temporarily drops rcu and queue - * locks to allocate memory. + * If @new_blkg is %NULL, this function tries to allocate a new one as + * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. */ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, - struct request_queue *q, gfp_t gfp, - const struct blkcg_policy *pol) + struct request_queue *q, + struct blkcg_gq *new_blkg) { - struct blkcg_gq *blkg = NULL; + struct blkcg_gq *blkg; struct bdi_writeback_congested *wb_congested; int i, ret; - const bool drop_locks = gfpflags_allow_blocking(gfp); - bool preloaded = false; WARN_ON_ONCE(!rcu_read_lock_held()); lockdep_assert_held(q->queue_lock); @@ -187,53 +185,31 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, goto err_free_blkg; } - if (drop_locks) { - spin_unlock_irq(q->queue_lock); - rcu_read_unlock(); - } - wb_congested = wb_congested_get_create(q->backing_dev_info, - blkcg->css.id, gfp); - blkg = blkg_alloc(blkcg, q, gfp); - - if (drop_locks) { - preloaded = !radix_tree_preload(gfp); - rcu_read_lock(); - spin_lock_irq(q->queue_lock); - } - - if (unlikely(!wb_congested || !blkg)) { + blkcg->css.id, + GFP_NOWAIT | __GFP_NOWARN); + if (!wb_congested) { ret = -ENOMEM; - goto err_put; + goto err_put_css; } - blkg->wb_congested = wb_congested; - - if (pol) { - WARN_ON(!drop_locks); - - if (!blkcg_policy_enabled(q, pol)) { - ret = -EOPNOTSUPP; - goto err_put; - } - - /* - * This could be the first entry point of blkcg implementation - * and we shouldn't allow anything to go through for a bypassing - * queue. - */ - if (unlikely(blk_queue_bypass(q))) { - ret = blk_queue_dying(q) ?
-ENODEV : -EBUSY; - goto err_put; + /* allocate */ + if (!new_blkg) { + new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN); + if (unlikely(!new_blkg)) { + ret = -ENOMEM; + goto err_put_congested; } } + blkg = new_blkg; + blkg->wb_congested = wb_congested; /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); if (WARN_ON_ONCE(!blkg->parent)) { ret = -ENODEV; - goto err_put; + goto err_put_congested; } blkg_get(blkg->parent); } @@ -260,9 +236,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, pol->pd_online_fn(blkg->pd[i]); } } - - if (preloaded) - radix_tree_preload_end(); blkg->online = true; spin_unlock(&blkcg->lock); @@ -273,45 +246,44 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_put(blkg); return ERR_PTR(ret); -err_put: - if (preloaded) - radix_tree_preload_end(); - if (wb_congested) - wb_congested_put(wb_congested); +err_put_congested: + wb_congested_put(wb_congested); +err_put_css: css_put(&blkcg->css); err_free_blkg: - blkg_free(blkg); + blkg_free(new_blkg); return ERR_PTR(ret); } /** - * __blkg_lookup_create - lookup blkg, try to create one if not there + * blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest * @q: request_queue of interest - * @gfp: gfp mask - * @pol: blkcg policy (optional) * * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to * create one. blkg creation is performed recursively from blkcg_root such * that all non-root blkg's have access to the parent blkg. This function * should be called under RCU read lock and @q->queue_lock. * - * When gfp mask allows blocking, rcu and queue locks may be dropped for - * allocating memory. In this case, the locks will be reacquired on return. - * * Returns pointer to the looked up or created blkg on success, ERR_PTR() * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not * dead and bypassing, returns ERR_PTR(-EBUSY). */ -struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q, gfp_t gfp, - const struct blkcg_policy *pol) +struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q) { struct blkcg_gq *blkg; WARN_ON_ONCE(!rcu_read_lock_held()); lockdep_assert_held(q->queue_lock); + /* + * This could be the first entry point of blkcg implementation and + * we shouldn't allow anything to go through for a bypassing queue. + */ + if (unlikely(blk_queue_bypass(q))) + return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); + blkg = __blkg_lookup(blkcg, q, true); if (blkg) return blkg; @@ -329,35 +301,12 @@ struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, parent = blkcg_parent(parent); } - blkg = blkg_create(pos, q, gfp, pol); + blkg = blkg_create(pos, q, NULL); if (pos == blkcg || IS_ERR(blkg)) return blkg; } } -/** - * blkg_lookup_create - lookup blkg, try to create one if not there - * - * Performs an initial queue bypass check and then passes control to - * __blkg_lookup_create(). - */ -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q, gfp_t gfp, - const struct blkcg_policy *pol) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - lockdep_assert_held(q->queue_lock); - - /* - * This could be the first entry point of blkcg implementation and - * we shouldn't allow anything to go through for a bypassing queue. - */ - if (unlikely(blk_queue_bypass(q))) - return ERR_PTR(blk_queue_dying(q) ? 
-ENODEV : -EBUSY); - - return __blkg_lookup_create(blkcg, q, gfp, pol); -} - static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; @@ -868,7 +817,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, spin_lock_irq(disk->queue->queue_lock); if (blkcg_policy_enabled(disk->queue, pol)) - blkg = blkg_lookup_create(blkcg, disk->queue, GFP_KERNEL, pol); + blkg = blkg_lookup_create(blkcg, disk->queue); else blkg = ERR_PTR(-EOPNOTSUPP); @@ -1107,15 +1056,30 @@ free_blkcg: */ int blkcg_init_queue(struct request_queue *q) { - struct blkcg_gq *blkg; + struct blkcg_gq *new_blkg, *blkg; + bool preloaded; int ret; + new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); + if (!new_blkg) + return -ENOMEM; + + preloaded = !radix_tree_preload(GFP_KERNEL); + + /* + * Make sure the root blkg exists and count the existing blkgs. As + * @q is bypassing at this point, blkg_lookup_create() can't be + * used. Open code insertion. + */ rcu_read_lock(); spin_lock_irq(q->queue_lock); - blkg = __blkg_lookup_create(&blkcg_root, q, GFP_KERNEL, NULL); + blkg = blkg_create(&blkcg_root, q, new_blkg); spin_unlock_irq(q->queue_lock); rcu_read_unlock(); + if (preloaded) + radix_tree_preload_end(); + if (IS_ERR(blkg)) return PTR_ERR(blkg); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 955903a8f6cb..01b62e7bac74 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -172,8 +172,7 @@ extern struct cgroup_subsys_state * const blkcg_root_css; struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, struct request_queue *q, bool update_hint); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q, gfp_t gfp, - const struct blkcg_policy *pol); + struct request_queue *q); int blkcg_init_queue(struct request_queue *q); void blkcg_drain_queue(struct request_queue *q); void blkcg_exit_queue(struct request_queue *q); @@ -695,8 +694,7 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, blkg = blkg_lookup(blkcg, q); if (unlikely(!blkg)) { spin_lock_irq(q->queue_lock); - blkg = blkg_lookup_create(blkcg, q, GFP_NOWAIT | __GFP_NOWARN, - NULL); + blkg = blkg_lookup_create(blkcg, q); if (IS_ERR(blkg)) blkg = NULL; spin_unlock_irq(q->queue_lock); -- cgit v1.2.3 From b2e33536c010513e07e92ca914fcc11108d5eef5 Mon Sep 17 00:00:00 2001 From: sayli karnik Date: Thu, 30 Mar 2017 02:01:16 +0530 Subject: Documentation: Add flexible-arrays.rst to the documentation tree Add flexible-arrays.rst to Documentation/core-api. Add kernel-doc comments to allow referencing. Signed-off-by: sayli karnik Signed-off-by: Jonathan Corbet --- Documentation/core-api/flexible-arrays.rst | 130 +++++++++++++++++++++++++++++ Documentation/core-api/index.rst | 1 + include/linux/flex_array.h | 67 +++++++++++++++ 3 files changed, 198 insertions(+) create mode 100644 Documentation/core-api/flexible-arrays.rst (limited to 'include/linux') diff --git a/Documentation/core-api/flexible-arrays.rst b/Documentation/core-api/flexible-arrays.rst new file mode 100644 index 000000000000..b6b85a1b518e --- /dev/null +++ b/Documentation/core-api/flexible-arrays.rst @@ -0,0 +1,130 @@ + +=================================== +Using flexible arrays in the kernel +=================================== + +Large contiguous memory allocations can be unreliable in the Linux kernel. +Kernel programmers will sometimes respond to this problem by allocating +pages with :c:func:`vmalloc()`. This solution is not ideal, though. 
On 32-bit +systems, memory from vmalloc() must be mapped into a relatively small address +space; it's easy to run out. On SMP systems, the page table changes required +by vmalloc() allocations can require expensive cross-processor interrupts on +all CPUs. And, on all systems, use of space in the vmalloc() range increases +pressure on the translation lookaside buffer (TLB), reducing the performance +of the system. + +In many cases, the need for memory from vmalloc() can be eliminated by piecing +together an array from smaller parts; the flexible array library exists to make +this task easier. + +A flexible array holds an arbitrary (within limits) number of fixed-sized +objects, accessed via an integer index. Sparse arrays are handled +reasonably well. Only single-page allocations are made, so memory +allocation failures should be relatively rare. The down sides are that the +arrays cannot be indexed directly, individual object size cannot exceed the +system page size, and putting data into a flexible array requires a copy +operation. It's also worth noting that flexible arrays do no internal +locking at all; if concurrent access to an array is possible, then the +caller must arrange for appropriate mutual exclusion. + +The creation of a flexible array is done with :c:func:`flex_array_alloc()`:: + + #include <linux/flex_array.h> + + struct flex_array *flex_array_alloc(int element_size, + unsigned int total, + gfp_t flags); + +The individual object size is provided by ``element_size``, while ``total`` is the +maximum number of objects which can be stored in the array. The flags +argument is passed directly to the internal memory allocation calls. With +the current code, using flags to ask for high memory is likely to lead to +notably unpleasant side effects. + +It is also possible to define flexible arrays at compile time with:: + + DEFINE_FLEX_ARRAY(name, element_size, total); + +This macro will result in a definition of an array with the given name; the +element size and total will be checked for validity at compile time. + +Storing data into a flexible array is accomplished with a call to +:c:func:`flex_array_put()`:: + + int flex_array_put(struct flex_array *array, unsigned int element_nr, + void *src, gfp_t flags); + +This call will copy the data from src into the array, in the position +indicated by ``element_nr`` (which must be less than the maximum specified when +the array was created). If any memory allocations must be performed, flags +will be used. The return value is zero on success, a negative error code +otherwise. + +There might possibly be a need to store data into a flexible array while +running in some sort of atomic context; in this situation, sleeping in the +memory allocator would be a bad thing. That can be avoided by using +``GFP_ATOMIC`` for the flags value, but, often, there is a better way. The +trick is to ensure that any needed memory allocations are done before +entering atomic context, using :c:func:`flex_array_prealloc()`:: + + int flex_array_prealloc(struct flex_array *array, unsigned int start, + unsigned int nr_elements, gfp_t flags); + +This function will ensure that memory for the elements indexed in the range +defined by ``start`` and ``nr_elements`` has been allocated. Thereafter, a +``flex_array_put()`` call on an element in that range is guaranteed not to +block. 
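As a minimal sketch, assuming an arbitrary ``struct foo`` element type and
abbreviated error handling, the calls described so far combine like this::

    #include <linux/flex_array.h>

    struct foo {
        int a, b;
    };

    struct flex_array *fa;
    struct foo v = { .a = 1, .b = 2 };
    int err;

    /* Room for up to 128 objects of sizeof(struct foo) bytes each. */
    fa = flex_array_alloc(sizeof(struct foo), 128, GFP_KERNEL);
    if (!fa)
        return -ENOMEM;

    /* Allocate backing pages for elements 0..15 while sleeping is allowed. */
    err = flex_array_prealloc(fa, 0, 16, GFP_KERNEL);
    if (err)
        return err;

    /* This put cannot block because element 3 was preallocated above. */
    err = flex_array_put(fa, 3, &v, GFP_ATOMIC);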
+ +Getting data back out of the array is done with :c:func:`flex_array_get()`:: + + void *flex_array_get(struct flex_array *fa, unsigned int element_nr); + +The return value is a pointer to the data element, or NULL if that +particular element has never been allocated. + +Note that it is possible to get back a valid pointer for an element which +has never been stored in the array. Memory for array elements is allocated +one page at a time; a single allocation could provide memory for several +adjacent elements. Flexible array elements are normally initialized to the +value ``FLEX_ARRAY_FREE`` (defined as 0x6c in ``<linux/poison.h>``), so errors +involving that number probably result from use of unstored array entries. +Note that, if array elements are allocated with ``__GFP_ZERO``, they will be +initialized to zero and this poisoning will not happen. + +Individual elements in the array can be cleared with +:c:func:`flex_array_clear()`:: + + int flex_array_clear(struct flex_array *array, unsigned int element_nr); + +This function will set the given element to ``FLEX_ARRAY_FREE`` and return +zero. If storage for the indicated element is not allocated for the array, +``flex_array_clear()`` will return ``-EINVAL`` instead. Note that clearing an +element does not release the storage associated with it; to reduce the +allocated size of an array, call :c:func:`flex_array_shrink()`:: + + int flex_array_shrink(struct flex_array *array); + +The return value will be the number of pages of memory actually freed. +This function works by scanning the array for pages containing nothing but +``FLEX_ARRAY_FREE`` bytes, so (1) it can be expensive, and (2) it will not work +if the array's pages are allocated with ``__GFP_ZERO``. + +It is possible to remove all elements of an array with a call to +:c:func:`flex_array_free_parts()`:: + + void flex_array_free_parts(struct flex_array *array); + +This call frees all elements, but leaves the array itself in place. +Freeing the entire array is done with :c:func:`flex_array_free()`:: + + void flex_array_free(struct flex_array *array); + +As of this writing, there are no users of flexible arrays in the mainline +kernel. The functions described here are also not exported to modules; +that will probably be fixed when somebody comes up with a need for it. + + +Flexible array functions +------------------------ + +.. kernel-doc:: include/linux/flex_array.h diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index 0d93d8089136..2b908ac41021 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -16,6 +16,7 @@ Core utilities cpu_hotplug local_ops workqueue + flexible-arrays Interfaces for kernel debugging =============================== diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h index b6efb0c64408..11366b3ff0b4 100644 --- a/include/linux/flex_array.h +++ b/include/linux/flex_array.h @@ -61,16 +61,83 @@ struct flex_array { FLEX_ARRAY_ELEMENTS_PER_PART(__element_size)); \ } +/** + * flex_array_alloc() - Creates a flexible array. + * @element_size: individual object size. + * @total: maximum number of objects which can be stored. + * @flags: GFP flags + * + * Return: Returns a pointer to the new flex_array structure, or NULL on failure. + */ struct flex_array *flex_array_alloc(int element_size, unsigned int total, gfp_t flags); + +/** + * flex_array_prealloc() - Ensures that memory for the elements indexed in the + * range defined by start and nr_elements has been allocated. + * @fa: array to allocate memory to. 
+ * @start: index of the first element to be preallocated. + * @nr_elements: number of elements to be allocated. + * @flags: GFP flags + * + */ int flex_array_prealloc(struct flex_array *fa, unsigned int start, unsigned int nr_elements, gfp_t flags); + +/** + * flex_array_free() - Removes all elements of a flexible array. + * @fa: array to be freed. + */ void flex_array_free(struct flex_array *fa); + +/** + * flex_array_free_parts() - Removes all elements of a flexible array, but + * leaves the array itself in place. + * @fa: array to be emptied. + */ void flex_array_free_parts(struct flex_array *fa); + +/** + * flex_array_put() - Stores data into a flexible array. + * @fa: array where element is to be stored. + * @element_nr: position to copy, must be less than the maximum specified when + * the array was created. + * @src: data source to be copied into the array. + * @flags: GFP flags + * + * Return: Returns zero on success, a negative error code otherwise. + */ int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src, gfp_t flags); + +/** + * flex_array_clear() - Clears an individual element in the array, sets the + * given element to FLEX_ARRAY_FREE. + * @element_nr: element position to clear. + * @fa: array to which element to be cleared belongs. + * + * Return: Returns zero on success, -EINVAL otherwise. + */ int flex_array_clear(struct flex_array *fa, unsigned int element_nr); + +/** + * flex_array_get() - Retrieves data from a flexible array. + * + * @element_nr: Element position to retrieve data from. + * @fa: array from which data is to be retrieved. + * + * Return: Returns a pointer to the data element, or NULL if that + * particular element has never been allocated. + */ void *flex_array_get(struct flex_array *fa, unsigned int element_nr); + +/** + * flex_array_shrink() - Reduces the allocated size of an array. + * @fa: array to shrink. + * + * Return: Returns number of pages of memory actually freed. + * + */ int flex_array_shrink(struct flex_array *fa); #define flex_array_put_ptr(fa, nr, src, gfp) \ -- cgit v1.2.3 From d3881e5015421a578bc328136471fcf1d02ac389 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Feb 2017 14:32:33 -0500 Subject: PCI: Export PCI device config accessors Replace the inline PCI device config read and write accessors with exported functions. This prepares these functions to make use of private data. 
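As a minimal sketch of these accessors in use, assuming a hypothetical helper name (PCI_COMMAND and PCI_COMMAND_MASTER are the standard config-space definitions from pci_regs.h):

    /*
     * Read-modify-write of the Command register through the exported
     * accessors; each call returns 0 or a PCIBIOS_* error code.
     */
    static void example_enable_busmaster(struct pci_dev *pdev)
    {
        u16 cmd;

        if (pci_read_config_word(pdev, PCI_COMMAND, &cmd))
            return;
        cmd |= PCI_COMMAND_MASTER;
        pci_write_config_word(pdev, PCI_COMMAND, cmd);
    }

In practice a driver would use pci_set_master() for this particular bit; the sketch merely exercises the read and write paths.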
Tested-by: Krishna Dhulipala Signed-off-by: Keith Busch Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig Reviewed-by: Wei Zhang --- drivers/pci/access.c | 38 ++++++++++++++++++++++++++++++++++++++ include/linux/pci.h | 32 ++++++-------------------------- 2 files changed, 44 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/access.c b/drivers/pci/access.c index 8b7382705bf2..c70e3113df7a 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -890,3 +890,41 @@ int pcie_capability_clear_and_set_dword(struct pci_dev *dev, int pos, return ret; } EXPORT_SYMBOL(pcie_capability_clear_and_set_dword); + +int pci_read_config_byte(const struct pci_dev *dev, int where, u8 *val) +{ + return pci_bus_read_config_byte(dev->bus, dev->devfn, where, val); +} +EXPORT_SYMBOL(pci_read_config_byte); + +int pci_read_config_word(const struct pci_dev *dev, int where, u16 *val) +{ + return pci_bus_read_config_word(dev->bus, dev->devfn, where, val); +} +EXPORT_SYMBOL(pci_read_config_word); + +int pci_read_config_dword(const struct pci_dev *dev, int where, + u32 *val) +{ + return pci_bus_read_config_dword(dev->bus, dev->devfn, where, val); +} +EXPORT_SYMBOL(pci_read_config_dword); + +int pci_write_config_byte(const struct pci_dev *dev, int where, u8 val) +{ + return pci_bus_write_config_byte(dev->bus, dev->devfn, where, val); +} +EXPORT_SYMBOL(pci_write_config_byte); + +int pci_write_config_word(const struct pci_dev *dev, int where, u16 val) +{ + return pci_bus_write_config_word(dev->bus, dev->devfn, where, val); +} +EXPORT_SYMBOL(pci_write_config_word); + +int pci_write_config_dword(const struct pci_dev *dev, int where, + u32 val) +{ + return pci_bus_write_config_dword(dev->bus, dev->devfn, where, val); +} +EXPORT_SYMBOL(pci_write_config_dword); diff --git a/include/linux/pci.h b/include/linux/pci.h index eb3da1a04e6c..d705f3088ff9 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -940,32 +940,12 @@ int pci_generic_config_write32(struct pci_bus *bus, unsigned int devfn, struct pci_ops *pci_bus_set_ops(struct pci_bus *bus, struct pci_ops *ops); -static inline int pci_read_config_byte(const struct pci_dev *dev, int where, u8 *val) -{ - return pci_bus_read_config_byte(dev->bus, dev->devfn, where, val); -} -static inline int pci_read_config_word(const struct pci_dev *dev, int where, u16 *val) -{ - return pci_bus_read_config_word(dev->bus, dev->devfn, where, val); -} -static inline int pci_read_config_dword(const struct pci_dev *dev, int where, - u32 *val) -{ - return pci_bus_read_config_dword(dev->bus, dev->devfn, where, val); -} -static inline int pci_write_config_byte(const struct pci_dev *dev, int where, u8 val) -{ - return pci_bus_write_config_byte(dev->bus, dev->devfn, where, val); -} -static inline int pci_write_config_word(const struct pci_dev *dev, int where, u16 val) -{ - return pci_bus_write_config_word(dev->bus, dev->devfn, where, val); -} -static inline int pci_write_config_dword(const struct pci_dev *dev, int where, - u32 val) -{ - return pci_bus_write_config_dword(dev->bus, dev->devfn, where, val); -} +int pci_read_config_byte(const struct pci_dev *dev, int where, u8 *val); +int pci_read_config_word(const struct pci_dev *dev, int where, u16 *val); +int pci_read_config_dword(const struct pci_dev *dev, int where, u32 *val); +int pci_write_config_byte(const struct pci_dev *dev, int where, u8 val); +int pci_write_config_word(const struct pci_dev *dev, int where, u16 val); +int pci_write_config_dword(const struct pci_dev *dev, int where, u32 val); 
int pcie_capability_read_word(struct pci_dev *dev, int pos, u16 *val); int pcie_capability_read_dword(struct pci_dev *dev, int pos, u32 *val); -- cgit v1.2.3 From 89ee9f7680031d7df91a1a27abac69e034c2e892 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 29 Mar 2017 22:48:59 -0500 Subject: PCI: Add device disconnected state Add a new state to pci_dev to be set when it is unexpectedly disconnected. The PCI driver tear down functions can observe this new device state so they may skip operations that will fail. The pciehp and pcie-dpc drivers are aware when the link is down, so these set the flag when their handlers detect the device is disconnected. Tested-by: Krishna Dhulipala Signed-off-by: Keith Busch Signed-off-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig Reviewed-by: Wei Zhang --- drivers/pci/hotplug/pciehp_pci.c | 6 ++++++ drivers/pci/pci.h | 14 ++++++++++++++ drivers/pci/pcie/pcie-dpc.c | 5 +++++ include/linux/pci.h | 2 ++ 4 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/hotplug/pciehp_pci.c b/drivers/pci/hotplug/pciehp_pci.c index 9e69403be632..19f30a9f461d 100644 --- a/drivers/pci/hotplug/pciehp_pci.c +++ b/drivers/pci/hotplug/pciehp_pci.c @@ -109,6 +109,12 @@ int pciehp_unconfigure_device(struct slot *p_slot) break; } } + if (!presence) { + pci_dev_set_disconnected(dev, NULL); + if (pci_has_subordinate(dev)) + pci_walk_bus(dev->subordinate, + pci_dev_set_disconnected, NULL); + } pci_stop_and_remove_bus_device(dev); /* * Ensure that no new Requests will be generated from diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 8dd38e69d6f2..245719c3e409 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -274,6 +274,20 @@ struct pci_sriov { resource_size_t barsz[PCI_SRIOV_NUM_BARS]; /* VF BAR size */ }; +/* pci_dev priv_flags */ +#define PCI_DEV_DISCONNECTED 0 + +static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused) +{ + set_bit(PCI_DEV_DISCONNECTED, &dev->priv_flags); + return 0; +} + +static inline bool pci_dev_is_disconnected(const struct pci_dev *dev) +{ + return test_bit(PCI_DEV_DISCONNECTED, &dev->priv_flags); +} + #ifdef CONFIG_PCI_ATS void pci_restore_ats_state(struct pci_dev *dev); #else diff --git a/drivers/pci/pcie/pcie-dpc.c b/drivers/pci/pcie/pcie-dpc.c index d4d70ef4a2d7..77d2ca99d2ec 100644 --- a/drivers/pci/pcie/pcie-dpc.c +++ b/drivers/pci/pcie/pcie-dpc.c @@ -14,6 +14,7 @@ #include #include #include +#include "../pci.h" struct dpc_dev { struct pcie_device *dev; @@ -66,6 +67,10 @@ static void interrupt_event_handler(struct work_struct *work) list_for_each_entry_safe_reverse(dev, temp, &parent->devices, bus_list) { pci_dev_get(dev); + pci_dev_set_disconnected(dev, NULL); + if (pci_has_subordinate(dev)) + pci_walk_bus(dev->subordinate, + pci_dev_set_disconnected, NULL); pci_stop_and_remove_bus_device(dev); pci_dev_put(dev); } diff --git a/include/linux/pci.h b/include/linux/pci.h index d705f3088ff9..2887933329a2 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -396,6 +396,8 @@ struct pci_dev { phys_addr_t rom; /* Physical address of ROM if it's not from the BAR */ size_t romlen; /* Length of ROM if it's not from the BAR */ char *driver_override; /* Driver name to force a match */ + + unsigned long priv_flags; /* Private flags for the pci driver */ }; static inline struct pci_dev *pci_physfn(struct pci_dev *dev) -- cgit v1.2.3 From 44fe84459faf1a7781595b7c64cd36daf2f2827d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Mar 2017 13:54:38 +0200 Subject: locking/atomic: Fix 
atomic_try_cmpxchg() semantics Dmitry noted that the new atomic_try_cmpxchg() primitive is broken when the old pointer doesn't point to the local stack. He writes: "Consider a classical lock-free stack push: node->next = atomic_read(&head); do { } while (!atomic_try_cmpxchg(&head, &node->next, node)); This code is broken with the current implementation, the problem is with unconditional update of *__po. In case of success it writes the same value back into *__po, but in case of cmpxchg success we might have lose ownership of some memory locations and potentially over what __po has pointed to. The same holds for the re-read of *__po. " He also points out that this makes it surprisingly different from the similar C/C++ atomic operation. After investigating the code-gen differences caused by this patch; and a number of alternatives (Linus dislikes this interface lots), we arrived at these results (size x86_64-defconfig/vmlinux): GCC-6.3.0: 10735757 cmpxchg 10726413 try_cmpxchg 10730509 try_cmpxchg + patch 10730445 try_cmpxchg-linus GCC-7 (20170327): 10709514 cmpxchg 10704266 try_cmpxchg 10704266 try_cmpxchg + patch 10704394 try_cmpxchg-linus From this we see that the patch has the advantage of better code-gen on GCC-7 and keeps the interface roughly consistent with the C language variant. Reported-by: Dmitry Vyukov Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Fixes: a9ebf306f52c ("locking/atomic: Introduce atomic_try_cmpxchg()") Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cmpxchg.h | 5 +++-- include/linux/atomic.h | 16 ++++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index fb961db51a2a..d90296d061e8 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -212,8 +212,9 @@ extern void __add_wrong_size(void) default: \ __cmpxchg_wrong_size(); \ } \ - *_old = __old; \ - success; \ + if (unlikely(!success)) \ + *_old = __old; \ + likely(success); \ }) #define __try_cmpxchg(ptr, pold, new, size) \ diff --git a/include/linux/atomic.h b/include/linux/atomic.h index aae5953817d6..c56be7410130 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -428,9 +428,11 @@ #define __atomic_try_cmpxchg(type, _p, _po, _n) \ ({ \ typeof(_po) __po = (_po); \ - typeof(*(_po)) __o = *__po; \ - *__po = atomic_cmpxchg##type((_p), __o, (_n)); \ - (*__po == __o); \ + typeof(*(_po)) __r, __o = *__po; \ + __r = atomic_cmpxchg##type((_p), __o, (_n)); \ + if (unlikely(__r != __o)) \ + *__po = __r; \ + likely(__r == __o); \ }) #define atomic_try_cmpxchg(_p, _po, _n) __atomic_try_cmpxchg(, _p, _po, _n) @@ -1022,9 +1024,11 @@ static inline int atomic_dec_if_positive(atomic_t *v) #define __atomic64_try_cmpxchg(type, _p, _po, _n) \ ({ \ typeof(_po) __po = (_po); \ - typeof(*(_po)) __o = *__po; \ - *__po = atomic64_cmpxchg##type((_p), __o, (_n)); \ - (*__po == __o); \ + typeof(*(_po)) __r, __o = *__po; \ + __r = atomic64_cmpxchg##type((_p), __o, (_n)); \ + if (unlikely(__r != __o)) \ + *__po = __r; \ + likely(__r == __o); \ }) #define atomic64_try_cmpxchg(_p, _po, _n) __atomic64_try_cmpxchg(, _p, _po, _n) -- cgit v1.2.3 From 19d436268dde95389c616bb3819da73f0a8b28a8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 25 Feb 2017 08:56:53 +0100 Subject: debug: Add _ONCE() logic to report_bug() Josh suggested moving the _ONCE logic 
inside the trap handler, using a bit in the bug_entry::flags field, avoiding the need for the extra variable. Sadly this only works for WARN_ON_ONCE(), since the others have printk() statements prior to triggering the trap. Still, this saves a fair amount of text and some data: text data filename 10682460 4530992 defconfig-build/vmlinux.orig 10665111 4530096 defconfig-build/vmlinux.patched Suggested-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/arm64/include/asm/bug.h | 2 +- arch/parisc/include/asm/bug.h | 8 ++++---- arch/powerpc/include/asm/bug.h | 4 ++-- arch/s390/include/asm/bug.h | 4 ++-- arch/sh/include/asm/bug.h | 4 ++-- arch/x86/include/asm/bug.h | 2 +- include/asm-generic/bug.h | 18 +++++++++++++++++- include/asm-generic/vmlinux.lds.h | 5 ++--- include/linux/bug.h | 2 +- lib/bug.c | 28 ++++++++++++++++++++-------- 10 files changed, 52 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h index 561190d15881..a9be1072933c 100644 --- a/arch/arm64/include/asm/bug.h +++ b/arch/arm64/include/asm/bug.h @@ -55,7 +55,7 @@ _BUGVERBOSE_LOCATION(__FILE__, __LINE__) \ unreachable(); \ } while (0) -#define __WARN_TAINT(taint) _BUG_FLAGS(BUGFLAG_TAINT(taint)) +#define __WARN_FLAGS(flags) _BUG_FLAGS(BUGFLAG_WARNING|(flags)) #endif /* ! CONFIG_GENERIC_BUG */ diff --git a/arch/parisc/include/asm/bug.h b/arch/parisc/include/asm/bug.h index 62a33338549c..d2742273a685 100644 --- a/arch/parisc/include/asm/bug.h +++ b/arch/parisc/include/asm/bug.h @@ -46,7 +46,7 @@ #endif #ifdef CONFIG_DEBUG_BUGVERBOSE -#define __WARN_TAINT(taint) \ +#define __WARN_FLAGS(flags) \ do { \ asm volatile("\n" \ "1:\t" PARISC_BUG_BREAK_ASM "\n" \ @@ -56,11 +56,11 @@ "\t.org 2b+%c3\n" \ "\t.popsection" \ : : "i" (__FILE__), "i" (__LINE__), \ - "i" (BUGFLAG_TAINT(taint)), \ + "i" (BUGFLAG_WARNING|(flags)), \ "i" (sizeof(struct bug_entry)) ); \ } while(0) #else -#define __WARN_TAINT(taint) \ +#define __WARN_FLAGS(flags) \ do { \ asm volatile("\n" \ "1:\t" PARISC_BUG_BREAK_ASM "\n" \ @@ -69,7 +69,7 @@ "\t.short %c0\n" \ "\t.org 2b+%c1\n" \ "\t.popsection" \ - : : "i" (BUGFLAG_TAINT(taint)), \ + : : "i" (BUGFLAG_WARNING|(flags)), \ "i" (sizeof(struct bug_entry)) ); \ } while(0) #endif diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index 3a39283333c3..f2c562a0a427 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -85,12 +85,12 @@ } \ } while (0) -#define __WARN_TAINT(taint) do { \ +#define __WARN_FLAGS(flags) do { \ __asm__ __volatile__( \ "1: twi 31,0,0\n" \ _EMIT_BUG_ENTRY \ : : "i" (__FILE__), "i" (__LINE__), \ - "i" (BUGFLAG_TAINT(taint)), \ + "i" (BUGFLAG_WARNING|(flags)), \ "i" (sizeof(struct bug_entry))); \ } while (0) diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h index bf90d1fd97a5..1bbd9dbfe4e0 100644 --- a/arch/s390/include/asm/bug.h +++ b/arch/s390/include/asm/bug.h @@ -46,8 +46,8 @@ unreachable(); \ } while (0) -#define __WARN_TAINT(taint) do { \ - __EMIT_BUG(BUGFLAG_TAINT(taint)); \ +#define __WARN_FLAGS(flags) do { \ + __EMIT_BUG(BUGFLAG_WARNING|(flags)); \ } while (0) #define WARN_ON(x) ({ \ diff --git a/arch/sh/include/asm/bug.h b/arch/sh/include/asm/bug.h index dcf278075429..1b77f068be2b 100644 --- a/arch/sh/include/asm/bug.h +++ 
b/arch/sh/include/asm/bug.h @@ -50,7 +50,7 @@ do { \ "i" (sizeof(struct bug_entry))); \ } while (0) -#define __WARN_TAINT(taint) \ +#define __WARN_FLAGS(flags) \ do { \ __asm__ __volatile__ ( \ "1:\t.short %O0\n" \ @@ -59,7 +59,7 @@ do { \ : "n" (TRAPA_BUG_OPCODE), \ "i" (__FILE__), \ "i" (__LINE__), \ - "i" (BUGFLAG_TAINT(taint)), \ + "i" (BUGFLAG_WARNING|(flags)), \ "i" (sizeof(struct bug_entry))); \ } while (0) diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index cecf559d0012..39e702d90cdb 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -76,7 +76,7 @@ do { \ unreachable(); \ } while (0) -#define __WARN_TAINT(taint) _BUG_FLAGS(ASM_UD0, BUGFLAG_TAINT(taint)) +#define __WARN_FLAGS(flags) _BUG_FLAGS(ASM_UD0, BUGFLAG_WARNING|(flags)) #include diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 6f96247226a4..5fe9f39cec44 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -5,6 +5,8 @@ #ifdef CONFIG_GENERIC_BUG #define BUGFLAG_WARNING (1 << 0) +#define BUGFLAG_ONCE (1 << 1) +#define BUGFLAG_DONE (1 << 2) #define BUGFLAG_TAINT(taint) (BUGFLAG_WARNING | ((taint) << 8)) #define BUG_GET_TAINT(bug) ((bug)->flags >> 8) #endif @@ -55,6 +57,18 @@ struct bug_entry { #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0) #endif +#ifdef __WARN_FLAGS +#define __WARN_TAINT(taint) __WARN_FLAGS(BUGFLAG_TAINT(taint)) +#define __WARN_ONCE_TAINT(taint) __WARN_FLAGS(BUGFLAG_ONCE|BUGFLAG_TAINT(taint)) + +#define WARN_ON_ONCE(condition) ({ \ + int __ret_warn_on = !!(condition); \ + if (unlikely(__ret_warn_on)) \ + __WARN_ONCE_TAINT(TAINT_WARN); \ + unlikely(__ret_warn_on); \ +}) +#endif + /* * WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report * significant issues that need prompt attention if they should ever @@ -97,7 +111,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, #endif #ifndef WARN -#define WARN(condition, format...) ({ \ +#define WARN(condition, format...) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ __WARN_printf(format); \ @@ -112,6 +126,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, unlikely(__ret_warn_on); \ }) +#ifndef WARN_ON_ONCE #define WARN_ON_ONCE(condition) ({ \ static bool __section(.data.unlikely) __warned; \ int __ret_warn_once = !!(condition); \ @@ -122,6 +137,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, } \ unlikely(__ret_warn_once); \ }) +#endif #define WARN_ONCE(condition, format...) ({ \ static bool __section(.data.unlikely) __warned; \ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0968d13b3885..51c8dbe8e8d9 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -286,8 +286,6 @@ *(.rodata1) \ } \ \ - BUG_TABLE \ - \ /* PCI quirks */ \ .pci_fixup : AT(ADDR(.pci_fixup) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_pci_fixups_early) = .; \ @@ -855,7 +853,8 @@ READ_MOSTLY_DATA(cacheline) \ DATA_DATA \ CONSTRUCTORS \ - } + } \ + BUG_TABLE #define INIT_TEXT_SECTION(inittext_align) \ . 
= ALIGN(inittext_align); \ diff --git a/include/linux/bug.h b/include/linux/bug.h index 5828489309bb..687b557fc5eb 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -105,7 +105,7 @@ static inline int is_warning_bug(const struct bug_entry *bug) return bug->flags & BUGFLAG_WARNING; } -const struct bug_entry *find_bug(unsigned long bugaddr); +struct bug_entry *find_bug(unsigned long bugaddr); enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs); diff --git a/lib/bug.c b/lib/bug.c index 06edbbef0623..a6a1137d06db 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -47,7 +47,7 @@ #include #include -extern const struct bug_entry __start___bug_table[], __stop___bug_table[]; +extern struct bug_entry __start___bug_table[], __stop___bug_table[]; static inline unsigned long bug_addr(const struct bug_entry *bug) { @@ -62,10 +62,10 @@ static inline unsigned long bug_addr(const struct bug_entry *bug) /* Updates are protected by module mutex */ static LIST_HEAD(module_bug_list); -static const struct bug_entry *module_find_bug(unsigned long bugaddr) +static struct bug_entry *module_find_bug(unsigned long bugaddr) { struct module *mod; - const struct bug_entry *bug = NULL; + struct bug_entry *bug = NULL; rcu_read_lock_sched(); list_for_each_entry_rcu(mod, &module_bug_list, bug_list) { @@ -122,15 +122,15 @@ void module_bug_cleanup(struct module *mod) #else -static inline const struct bug_entry *module_find_bug(unsigned long bugaddr) +static inline struct bug_entry *module_find_bug(unsigned long bugaddr) { return NULL; } #endif -const struct bug_entry *find_bug(unsigned long bugaddr) +struct bug_entry *find_bug(unsigned long bugaddr) { - const struct bug_entry *bug; + struct bug_entry *bug; for (bug = __start___bug_table; bug < __stop___bug_table; ++bug) if (bugaddr == bug_addr(bug)) @@ -141,9 +141,9 @@ const struct bug_entry *find_bug(unsigned long bugaddr) enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) { - const struct bug_entry *bug; + struct bug_entry *bug; const char *file; - unsigned line, warning; + unsigned line, warning, once, done; if (!is_valid_bugaddr(bugaddr)) return BUG_TRAP_TYPE_NONE; @@ -164,6 +164,18 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) line = bug->line; #endif warning = (bug->flags & BUGFLAG_WARNING) != 0; + once = (bug->flags & BUGFLAG_ONCE) != 0; + done = (bug->flags & BUGFLAG_DONE) != 0; + + if (warning && once) { + if (done) + return BUG_TRAP_TYPE_WARN; + + /* + * Since this is the only store, concurrency is not an issue. + */ + bug->flags |= BUGFLAG_DONE; + } } if (warning) { -- cgit v1.2.3 From cf25f904ef75aa7c25097eb4981bbc634bf5ff9e Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Fri, 24 Feb 2017 02:48:21 -0600 Subject: x86/events/amd/iommu: Add IOMMU-specific hw_perf_event struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current AMD IOMMU perf PMU inappropriately uses the hardware struct inside the union in struct hw_perf_event, extra_reg in particular. Instead, introduce an AMD IOMMU-specific struct with required parameters to be programmed into the IOMMU performance counter control register. Update the pasid field from 16 to 20 bits while at it. Signed-off-by: Suravee Suthikulpanit [ Fixup macros, shorten get_next_avail_iommu_bnk_cntr() local vars, massage commit message. 
] Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Jörg Rödel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: iommu@lists.linux-foundation.org Link: http://lkml.kernel.org/r/1487926102-13073-10-git-send-email-Suravee.Suthikulpanit@amd.com Signed-off-by: Ingo Molnar --- arch/x86/events/amd/iommu.c | 113 ++++++++++++++++++++------------------------ include/linux/perf_event.h | 7 +++ 2 files changed, 57 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index 7ac8138023cc..f0d94c8b382a 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -23,17 +23,16 @@ #define COUNTER_SHIFT 16 -#define _GET_BANK(ev) ((u8)(ev->hw.extra_reg.reg >> 8)) -#define _GET_CNTR(ev) ((u8)(ev->hw.extra_reg.reg)) - -/* iommu pmu config masks */ -#define _GET_CSOURCE(ev) ((ev->hw.config & 0xFFULL)) -#define _GET_DEVID(ev) ((ev->hw.config >> 8) & 0xFFFFULL) -#define _GET_PASID(ev) ((ev->hw.config >> 24) & 0xFFFFULL) -#define _GET_DOMID(ev) ((ev->hw.config >> 40) & 0xFFFFULL) -#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config) & 0xFFFFULL) -#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL) -#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL) +/* iommu pmu conf masks */ +#define GET_CSOURCE(x) ((x)->conf & 0xFFULL) +#define GET_DEVID(x) (((x)->conf >> 8) & 0xFFFFULL) +#define GET_DOMID(x) (((x)->conf >> 24) & 0xFFFFULL) +#define GET_PASID(x) (((x)->conf >> 40) & 0xFFFFFULL) + +/* iommu pmu conf1 masks */ +#define GET_DEVID_MASK(x) ((x)->conf1 & 0xFFFFULL) +#define GET_DOMID_MASK(x) (((x)->conf1 >> 16) & 0xFFFFULL) +#define GET_PASID_MASK(x) (((x)->conf1 >> 32) & 0xFFFFFULL) static struct perf_amd_iommu __perf_iommu; @@ -50,11 +49,11 @@ struct perf_amd_iommu { *---------------------------------------------*/ PMU_FORMAT_ATTR(csource, "config:0-7"); PMU_FORMAT_ATTR(devid, "config:8-23"); -PMU_FORMAT_ATTR(pasid, "config:24-39"); -PMU_FORMAT_ATTR(domid, "config:40-55"); +PMU_FORMAT_ATTR(domid, "config:24-39"); +PMU_FORMAT_ATTR(pasid, "config:40-59"); PMU_FORMAT_ATTR(devid_mask, "config1:0-15"); -PMU_FORMAT_ATTR(pasid_mask, "config1:16-31"); -PMU_FORMAT_ATTR(domid_mask, "config1:32-47"); +PMU_FORMAT_ATTR(domid_mask, "config1:16-31"); +PMU_FORMAT_ATTR(pasid_mask, "config1:32-51"); static struct attribute *iommu_format_attrs[] = { &format_attr_csource.attr, @@ -150,30 +149,34 @@ static struct attribute_group amd_iommu_cpumask_group = { /*---------------------------------------------*/ -static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu) +static int get_next_avail_iommu_bnk_cntr(struct perf_event *event) { + struct perf_amd_iommu *piommu = container_of(event->pmu, struct perf_amd_iommu, pmu); + int max_cntrs = piommu->max_counters; + int max_banks = piommu->max_banks; + u32 shift, bank, cntr; unsigned long flags; - int shift, bank, cntr, retval; - int max_banks = perf_iommu->max_banks; - int max_cntrs = perf_iommu->max_counters; + int retval; - raw_spin_lock_irqsave(&perf_iommu->lock, flags); + raw_spin_lock_irqsave(&piommu->lock, flags); for (bank = 0, shift = 0; bank < max_banks; bank++) { for (cntr = 0; cntr < max_cntrs; cntr++) { shift = bank + (bank*3) + cntr; - if (perf_iommu->cntr_assign_mask & BIT_ULL(shift)) { + if (piommu->cntr_assign_mask & BIT_ULL(shift)) { continue; } else { - perf_iommu->cntr_assign_mask 
|= BIT_ULL(shift); - retval = ((bank & 0xFF) << 8) | (cntr & 0xFF); + piommu->cntr_assign_mask |= BIT_ULL(shift); + event->hw.iommu_bank = bank; + event->hw.iommu_cntr = cntr; + retval = 0; goto out; } } } retval = -ENOSPC; out: - raw_spin_unlock_irqrestore(&perf_iommu->lock, flags); + raw_spin_unlock_irqrestore(&piommu->lock, flags); return retval; } @@ -202,8 +205,6 @@ static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu, static int perf_iommu_event_init(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - struct perf_amd_iommu *perf_iommu; - u64 config, config1; /* test the event attr type check for PMU enumeration */ if (event->attr.type != event->pmu->type) @@ -225,21 +226,9 @@ static int perf_iommu_event_init(struct perf_event *event) if (event->cpu < 0) return -EINVAL; - perf_iommu = &__perf_iommu; - - if (event->pmu != &perf_iommu->pmu) - return -ENOENT; - - if (perf_iommu) { - config = event->attr.config; - config1 = event->attr.config1; - } else { - return -EINVAL; - } - /* update the hw_perf_event struct with the iommu config data */ - hwc->config = config; - hwc->extra_reg.config = config1; + hwc->conf = event->attr.config; + hwc->conf1 = event->attr.config1; return 0; } @@ -247,26 +236,28 @@ static int perf_iommu_event_init(struct perf_event *event) static void perf_iommu_enable_event(struct perf_event *ev) { struct amd_iommu *iommu = get_amd_iommu(0); - u8 csource = _GET_CSOURCE(ev); - u16 devid = _GET_DEVID(ev); - u8 bank = _GET_BANK(ev); - u8 cntr = _GET_CNTR(ev); + struct hw_perf_event *hwc = &ev->hw; + u8 bank = hwc->iommu_bank; + u8 cntr = hwc->iommu_cntr; u64 reg = 0ULL; - reg = csource; + reg = GET_CSOURCE(hwc); amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_COUNTER_SRC_REG, ®); - reg = devid | (_GET_DEVID_MASK(ev) << 32); + reg = GET_DEVID_MASK(hwc); + reg = GET_DEVID(hwc) | (reg << 32); if (reg) reg |= BIT(31); amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DEVID_MATCH_REG, ®); - reg = _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32); + reg = GET_PASID_MASK(hwc); + reg = GET_PASID(hwc) | (reg << 32); if (reg) reg |= BIT(31); amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_PASID_MATCH_REG, ®); - reg = _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32); + reg = GET_DOMID_MASK(hwc); + reg = GET_DOMID(hwc) | (reg << 32); if (reg) reg |= BIT(31); amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DOMID_MATCH_REG, ®); @@ -275,16 +266,16 @@ static void perf_iommu_enable_event(struct perf_event *ev) static void perf_iommu_disable_event(struct perf_event *event) { struct amd_iommu *iommu = get_amd_iommu(0); + struct hw_perf_event *hwc = &event->hw; u64 reg = 0ULL; - amd_iommu_pc_set_reg(iommu, _GET_BANK(event), _GET_CNTR(event), + amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr, IOMMU_PC_COUNTER_SRC_REG, ®); } static void perf_iommu_start(struct perf_event *event, int flags) { struct hw_perf_event *hwc = &event->hw; - struct amd_iommu *iommu = get_amd_iommu(0); if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) return; @@ -293,8 +284,10 @@ static void perf_iommu_start(struct perf_event *event, int flags) hwc->state = 0; if (flags & PERF_EF_RELOAD) { - u64 prev_raw_count = local64_read(&hwc->prev_count); - amd_iommu_pc_set_reg(iommu, _GET_BANK(event), _GET_CNTR(event), + u64 prev_raw_count = local64_read(&hwc->prev_count); + struct amd_iommu *iommu = get_amd_iommu(0); + + amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr, IOMMU_PC_COUNTER_REG, &prev_raw_count); } @@ -309,7 +302,7 @@ static void perf_iommu_read(struct 
perf_event *event) struct hw_perf_event *hwc = &event->hw; struct amd_iommu *iommu = get_amd_iommu(0); - if (amd_iommu_pc_get_reg(iommu, _GET_BANK(event), _GET_CNTR(event), + if (amd_iommu_pc_get_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr, IOMMU_PC_COUNTER_REG, &count)) return; @@ -329,7 +322,6 @@ static void perf_iommu_read(struct perf_event *event) static void perf_iommu_stop(struct perf_event *event, int flags) { struct hw_perf_event *hwc = &event->hw; - u64 config; if (hwc->state & PERF_HES_UPTODATE) return; @@ -341,7 +333,6 @@ static void perf_iommu_stop(struct perf_event *event, int flags) if (hwc->state & PERF_HES_UPTODATE) return; - config = hwc->config; perf_iommu_read(event); hwc->state |= PERF_HES_UPTODATE; } @@ -349,16 +340,12 @@ static void perf_iommu_stop(struct perf_event *event, int flags) static int perf_iommu_add(struct perf_event *event, int flags) { int retval; - struct perf_amd_iommu *perf_iommu = - container_of(event->pmu, struct perf_amd_iommu, pmu); event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; /* request an iommu bank/counter */ - retval = get_next_avail_iommu_bnk_cntr(perf_iommu); - if (retval != -ENOSPC) - event->hw.extra_reg.reg = (u16)retval; - else + retval = get_next_avail_iommu_bnk_cntr(event); + if (retval) return retval; if (flags & PERF_EF_START) @@ -369,6 +356,7 @@ static int perf_iommu_add(struct perf_event *event, int flags) static void perf_iommu_del(struct perf_event *event, int flags) { + struct hw_perf_event *hwc = &event->hw; struct perf_amd_iommu *perf_iommu = container_of(event->pmu, struct perf_amd_iommu, pmu); @@ -376,8 +364,7 @@ static void perf_iommu_del(struct perf_event *event, int flags) /* clear the assigned iommu bank/counter */ clear_avail_iommu_bnk_cntr(perf_iommu, - _GET_BANK(event), - _GET_CNTR(event)); + hwc->iommu_bank, hwc->iommu_cntr); perf_event_update_userpage(event); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b6e75c9d4791..24a635887f28 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -165,6 +165,13 @@ struct hw_perf_event { struct list_head bp_list; }; #endif + struct { /* amd_iommu */ + u8 iommu_bank; + u8 iommu_cntr; + u16 padding; + u64 conf; + u64 conf1; + }; }; /* * If the event is a per task event, this will point to the task in -- cgit v1.2.3 From 959d973e9890150342df76160d966ab1270208df Mon Sep 17 00:00:00 2001 From: Xiaolei Yu Date: Sat, 25 Mar 2017 14:04:58 +0800 Subject: HID: add two missing usages for digitizer They are part of HUTRR34 for multi-touch digitizers: 0x0E Device configuration CA 16.7 0x23 Device settings CL 16.7 Signed-off-by: Xiaolei Yu Signed-off-by: Jiri Kosina --- drivers/hid/hid-debug.c | 2 ++ include/linux/hid.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index acfb522a432a..5a0061c0ee87 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -140,9 +140,11 @@ static const struct hid_usage_entry hid_usage_table[] = { {0, 0x03, "LightPen"}, {0, 0x04, "TouchScreen"}, {0, 0x05, "TouchPad"}, + {0, 0x0e, "DeviceConfiguration"}, {0, 0x20, "Stylus"}, {0, 0x21, "Puck"}, {0, 0x22, "Finger"}, + {0, 0x23, "DeviceSettings"}, {0, 0x30, "TipPressure"}, {0, 0x31, "BarrelPressure"}, {0, 0x32, "InRange"}, diff --git a/include/linux/hid.h b/include/linux/hid.h index 28f38e2b8f30..536f11fd21d5 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -268,6 +268,8 @@ struct hid_item { #define HID_CP_APPLICATIONLAUNCHBUTTONS 0x000c0180 
#define HID_CP_GENERICGUIAPPLICATIONCONTROLS 0x000c0200 +#define HID_DG_DEVICECONFIG 0x000d000e +#define HID_DG_DEVICESETTINGS 0x000d0023 #define HID_DG_CONFIDENCE 0x000d0047 #define HID_DG_WIDTH 0x000d0048 #define HID_DG_HEIGHT 0x000d0049 -- cgit v1.2.3 From ae7c18380495ac5c14a614fdb6c452c3bf9148ac Mon Sep 17 00:00:00 2001 From: Hanjun Guo Date: Tue, 7 Mar 2017 20:40:05 +0800 Subject: ACPI: platform-msi: retrieve devid from IORT For devices connecting to an ITS, the devices need to identify themselves through a devid; this devid is represented in the IORT table in the named component node [1] for platform devices, so this patch adds code that scans the IORT table to retrieve a device's devid. Add an IORT interface that collects the ITS devid for a device so that platform device MSI mappings can be carried out with IORT tables. [1]: https://static.docs.arm.com/den0049/b/DEN0049B_IO_Remapping_Table.pdf Signed-off-by: Hanjun Guo [lorenzo.pieralisi@arm.com: rewrote commit log/dropped ITS changes] Signed-off-by: Lorenzo Pieralisi Tested-by: Ming Lei Tested-by: Wei Xu Tested-by: Sinan Kaya Cc: Marc Zyngier Cc: Lorenzo Pieralisi Cc: Tomasz Nowicki Cc: Thomas Gleixner --- drivers/acpi/arm64/iort.c | 24 ++++++++++++++++++++++++ include/linux/acpi_iort.h | 1 + 2 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 83cd59dcb8b3..fb95ceb3840d 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -467,6 +467,30 @@ u32 iort_msi_map_rid(struct device *dev, u32 req_id) return dev_id; } +/** + * iort_pmsi_get_dev_id() - Get the device id for a device + * @dev: The device for which the mapping is to be done. + * @dev_id: The device ID found. + * + * Returns: 0 if a dev id was successfully found, -ENODEV on error + */ +int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id) +{ + int i; + struct acpi_iort_node *node; + + node = iort_find_dev_node(dev); + if (!node) + return -ENODEV; + + for (i = 0; i < node->mapping_count; i++) { + if (iort_node_map_platform_id(node, dev_id, IORT_MSI_TYPE, i)) + return 0; + } + + return -ENODEV; +} + /** * iort_dev_find_its_id() - Find the ITS identifier for a device * @dev: The device. diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index 77e08099e554..fd8b9698e1d1 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -34,6 +34,7 @@ void acpi_iort_init(void); bool iort_node_match(u8 type); u32 iort_msi_map_rid(struct device *dev, u32 req_id); struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id); +int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id); /* IOMMU interface */ void iort_set_dma_mask(struct device *dev); const struct iommu_ops *iort_iommu_configure(struct device *dev); -- cgit v1.2.3 From d4f54a186667ffd19eac8e3f48c51d940a9b9784 Mon Sep 17 00:00:00 2001 From: Hanjun Guo Date: Tue, 7 Mar 2017 20:40:06 +0800 Subject: ACPI: platform: setup MSI domain for ACPI based platform device By allowing the platform MSI domain to be created on ACPI platforms, a platform device MSI domain can be set up when it is probed. In order to do that, the MSI domain the platform device connects to should be retrieved, so iort_get_platform_device_domain() is introduced to retrieve the domain from the IORT kernel layer. With the domain retrieved, we need a proper way to set the domain for the platform device. 
Given that some platform devices (irqchips) require the MSI irqdomain to be their interrupt parent domain, the MSI irqdomain should be determined before platform device is probed but after the platform device is allocated which means that the code setting up the MSI irqdomain, ie acpi_configure_pmsi_domain() should be called in acpi_platform_notify() (that is triggered after adding a device but before the respective driver is probed) for the platform MSI domain code set-up path to work properly. Acked-by: Rafael J. Wysocki [for glue.c] Signed-off-by: Hanjun Guo [lorenzo.pieralisi@arm.com: rewrote commit log] Signed-off-by: Lorenzo Pieralisi Tested-by: Ming Lei Tested-by: Wei Xu Tested-by: Sinan Kaya Cc: Marc Zyngier Cc: Lorenzo Pieralisi Cc: Tomasz Nowicki --- drivers/acpi/arm64/iort.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/acpi/glue.c | 6 ++++++ include/linux/acpi_iort.h | 2 ++ 3 files changed, 58 insertions(+) (limited to 'include/linux') diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index fb95ceb3840d..22e08d272db7 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -548,6 +548,56 @@ struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id) return irq_find_matching_fwnode(handle, DOMAIN_BUS_PCI_MSI); } +/** + * iort_get_platform_device_domain() - Find MSI domain related to a + * platform device + * @dev: the dev pointer associated with the platform device + * + * Returns: the MSI domain for this device, NULL otherwise + */ +static struct irq_domain *iort_get_platform_device_domain(struct device *dev) +{ + struct acpi_iort_node *node, *msi_parent; + struct fwnode_handle *iort_fwnode; + struct acpi_iort_its_group *its; + int i; + + /* find its associated iort node */ + node = iort_scan_node(ACPI_IORT_NODE_NAMED_COMPONENT, + iort_match_node_callback, dev); + if (!node) + return NULL; + + /* then find its msi parent node */ + for (i = 0; i < node->mapping_count; i++) { + msi_parent = iort_node_map_platform_id(node, NULL, + IORT_MSI_TYPE, i); + if (msi_parent) + break; + } + + if (!msi_parent) + return NULL; + + /* Move to ITS specific data */ + its = (struct acpi_iort_its_group *)msi_parent->node_data; + + iort_fwnode = iort_find_domain_token(its->identifiers[0]); + if (!iort_fwnode) + return NULL; + + return irq_find_matching_fwnode(iort_fwnode, DOMAIN_BUS_PLATFORM_MSI); +} + +void acpi_configure_pmsi_domain(struct device *dev) +{ + struct irq_domain *msi_domain; + + msi_domain = iort_get_platform_device_domain(dev); + if (msi_domain) + dev_set_msi_domain(dev, msi_domain); +} + static int __get_pci_rid(struct pci_dev *pdev, u16 alias, void *data) { u32 *rid = data; diff --git a/drivers/acpi/glue.c b/drivers/acpi/glue.c index fb19e1cdb641..ec31b439b4c8 100644 --- a/drivers/acpi/glue.c +++ b/drivers/acpi/glue.c @@ -6,6 +6,8 @@ * * This file is released under the GPLv2. 
*/ + +#include #include #include #include @@ -14,6 +16,7 @@ #include #include #include +#include #include "internal.h" @@ -322,6 +325,9 @@ static int acpi_platform_notify(struct device *dev) if (!adev) goto out; + if (dev->bus == &platform_bus_type) + acpi_configure_pmsi_domain(dev); + if (type && type->setup) type->setup(dev); else if (adev->handler && adev->handler->bind) diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index fd8b9698e1d1..26e25d85eb3e 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -34,6 +34,7 @@ void acpi_iort_init(void); bool iort_node_match(u8 type); u32 iort_msi_map_rid(struct device *dev, u32 req_id); struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id); +void acpi_configure_pmsi_domain(struct device *dev); int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id); /* IOMMU interface */ void iort_set_dma_mask(struct device *dev); @@ -46,6 +47,7 @@ static inline u32 iort_msi_map_rid(struct device *dev, u32 req_id) static inline struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id) { return NULL; } +static inline void acpi_configure_pmsi_domain(struct device *dev) { } /* IOMMU interface */ static inline void iort_set_dma_mask(struct device *dev) { } static inline -- cgit v1.2.3 From 8531e283bee66050734fb0e89d53e85fd5ce24a4 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Fri, 10 Mar 2017 21:23:45 +0100 Subject: PCI: Recognize Thunderbolt devices Detect on probe whether a PCI device is part of a Thunderbolt controller. Intel uses a Vendor-Specific Extended Capability (VSEC) with ID 0x1234 on such devices. Detect presence of this VSEC and cache it in a newly added is_thunderbolt bit in struct pci_dev. Also, add a helper to check whether a given PCI device is situated on a Thunderbolt daisy chain (i.e., below a PCI device with is_thunderbolt set). The necessity arises from the following: * If an external Thunderbolt GPU is connected to a dual GPU laptop, that GPU is currently registered with vga_switcheroo even though it can neither drive the laptop's panel nor be powered off by the platform. To vga_switcheroo it will appear as if two discrete GPUs are present. As a result, when the external GPU is runtime suspended, vga_switcheroo will cut power to the internal discrete GPU which may not be runtime suspended at all at this moment. The solution is to not register external GPUs with vga_switcheroo, which necessitates a way to recognize if they're on a Thunderbolt daisy chain. * Dual GPU MacBook Pros introduced 2011+ can no longer switch external DisplayPort ports between GPUs. (They're no longer just used for DP but have become combined DP/Thunderbolt ports.) The driver to switch the ports, drivers/platform/x86/apple-gmux.c, needs to detect presence of a Thunderbolt controller and, if found, keep external ports permanently switched to the discrete GPU. v2: Make kerneldoc for pci_is_thunderbolt_attached() more precise, drop portion of commit message pertaining to separate series. 
(Bjorn Helgaas) Cc: Andreas Noever Cc: Michael Jamet Cc: Tomas Winkler Cc: Amir Levy Acked-by: Bjorn Helgaas Signed-off-by: Lukas Wunner Link: http://patchwork.freedesktop.org/patch/msgid/0ab165a4a35c0b60f29d4c306c653ead14fcd8f9.1489145162.git.lukas@wunner.de --- drivers/pci/pci.h | 2 ++ drivers/pci/probe.c | 21 +++++++++++++++++++++ include/linux/pci.h | 23 +++++++++++++++++++++++ 3 files changed, 46 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 8dd38e69d6f2..4dbf9f96ae5b 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -3,6 +3,8 @@ #define PCI_FIND_CAP_TTL 48 +#define PCI_VSEC_ID_INTEL_TBT 0x1234 /* Thunderbolt */ + extern const unsigned char pcie_link_speed[]; bool pcie_cap_has_lnkctl(const struct pci_dev *dev); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index dfc9a2794141..90592d424e9b 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1208,6 +1208,24 @@ void set_pcie_hotplug_bridge(struct pci_dev *pdev) pdev->is_hotplug_bridge = 1; } +static void set_pcie_thunderbolt(struct pci_dev *dev) +{ + int vsec = 0; + u32 header; + + while ((vsec = pci_find_next_ext_capability(dev, vsec, + PCI_EXT_CAP_ID_VNDR))) { + pci_read_config_dword(dev, vsec + PCI_VNDR_HEADER, &header); + + /* Is the device part of a Thunderbolt controller? */ + if (dev->vendor == PCI_VENDOR_ID_INTEL && + PCI_VNDR_HEADER_ID(header) == PCI_VSEC_ID_INTEL_TBT) { + dev->is_thunderbolt = 1; + return; + } + } +} + /** * pci_ext_cfg_is_aliased - is ext config space just an alias of std config? * @dev: PCI device @@ -1360,6 +1378,9 @@ int pci_setup_device(struct pci_dev *dev) /* need to have dev->class ready */ dev->cfg_size = pci_cfg_space_size(dev); + /* need to have dev->cfg_size ready */ + set_pcie_thunderbolt(dev); + /* "Unknown power state" */ dev->current_state = PCI_UNKNOWN; diff --git a/include/linux/pci.h b/include/linux/pci.h index eb3da1a04e6c..5948cfdc984e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -358,6 +358,7 @@ struct pci_dev { unsigned int is_virtfn:1; unsigned int reset_fn:1; unsigned int is_hotplug_bridge:1; + unsigned int is_thunderbolt:1; /* Thunderbolt controller */ unsigned int __aer_firmware_first_valid:1; unsigned int __aer_firmware_first:1; unsigned int broken_intx_masking:1; @@ -2160,6 +2161,28 @@ static inline bool pci_ari_enabled(struct pci_bus *bus) return bus->self && bus->self->ari_enabled; } +/** + * pci_is_thunderbolt_attached - whether device is on a Thunderbolt daisy chain + * @pdev: PCI device to check + * + * Walk upwards from @pdev and check for each encountered bridge if it's part + * of a Thunderbolt controller. Reaching the host bridge means @pdev is not + * Thunderbolt-attached. (But rather soldered to the mainboard usually.) + */ +static inline bool pci_is_thunderbolt_attached(struct pci_dev *pdev) +{ + struct pci_dev *parent = pdev; + + if (pdev->is_thunderbolt) + return true; + + while ((parent = pci_upstream_bridge(parent))) + if (parent->is_thunderbolt) + return true; + + return false; +} + /* provide the legacy pci_dma_* API */ #include -- cgit v1.2.3 From a3caf7440dedd2399f90f27ff11ac390bf03e6c4 Mon Sep 17 00:00:00 2001 From: Vidyullatha Kanchanapally Date: Fri, 31 Mar 2017 00:22:34 +0300 Subject: cfg80211: Add support for FILS shared key authentication offload Enhance nl80211 and cfg80211 connect request and response APIs to support FILS shared key authentication offload. 
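Drivers that implement the offload in their own SME advertise it through the feature flag added below; a minimal sketch (illustration only, assuming a wiphy being set up in a driver's probe path, before wiphy_register()):

	/* illustrative sketch, not part of this patch */
	wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_FILS_SK_OFFLOAD);
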
The new nl80211 attributes can be used to provide additional information to the driver to establish a FILS connection. Also enhance the set/del PMKSA to allow support for adding and deleting PMKSA based on FILS cache identifier. Add a new feature flag that drivers can use to advertize support for FILS shared key authentication and association in station mode when using their own SME. Signed-off-by: Vidyullatha Kanchanapally Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 13 +++++++ include/net/cfg80211.h | 57 +++++++++++++++++++++++++++- include/uapi/linux/nl80211.h | 86 ++++++++++++++++++++++++++++++++++++++++-- net/wireless/nl80211.c | 90 +++++++++++++++++++++++++++++++++++++++++--- net/wireless/sme.c | 25 +++++++++++- 5 files changed, 259 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 22bf0676d928..294fa6273a62 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1723,6 +1723,9 @@ enum ieee80211_statuscode { WLAN_STATUS_REJECT_DSE_BAND = 96, WLAN_STATUS_DENIED_WITH_SUGGESTED_BAND_AND_CHANNEL = 99, WLAN_STATUS_DENIED_DUE_TO_SPECTRUM_MANAGEMENT = 103, + /* 802.11ai */ + WLAN_STATUS_FILS_AUTHENTICATION_FAILURE = 108, + WLAN_STATUS_UNKNOWN_AUTHENTICATION_SERVER = 109, }; @@ -2104,6 +2107,12 @@ enum ieee80211_key_len { #define FILS_NONCE_LEN 16 #define FILS_MAX_KEK_LEN 64 +#define FILS_ERP_MAX_USERNAME_LEN 16 +#define FILS_ERP_MAX_REALM_LEN 253 +#define FILS_ERP_MAX_RRK_LEN 64 + +#define PMK_MAX_LEN 48 + /* Public action codes */ enum ieee80211_pub_actioncode { WLAN_PUB_ACTION_EXT_CHANSW_ANN = 4, @@ -2355,6 +2364,10 @@ enum ieee80211_sa_query_action { #define WLAN_AKM_SUITE_TDLS SUITE(0x000FAC, 7) #define WLAN_AKM_SUITE_SAE SUITE(0x000FAC, 8) #define WLAN_AKM_SUITE_FT_OVER_SAE SUITE(0x000FAC, 9) +#define WLAN_AKM_SUITE_FILS_SHA256 SUITE(0x000FAC, 14) +#define WLAN_AKM_SUITE_FILS_SHA384 SUITE(0x000FAC, 15) +#define WLAN_AKM_SUITE_FT_FILS_SHA256 SUITE(0x000FAC, 16) +#define WLAN_AKM_SUITE_FT_FILS_SHA384 SUITE(0x000FAC, 17) #define WLAN_MAX_KEY_LEN 32 diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index da12d5b86e1b..042137d7d226 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2073,6 +2073,19 @@ struct cfg80211_bss_selection { * the BSSID of the current association, i.e., to the value that is * included in the Current AP address field of the Reassociation Request * frame. + * @fils_erp_username: EAP re-authentication protocol (ERP) username part of the + * NAI or %NULL if not specified. This is used to construct FILS wrapped + * data IE. + * @fils_erp_username_len: Length of @fils_erp_username in octets. + * @fils_erp_realm: EAP re-authentication protocol (ERP) realm part of NAI or + * %NULL if not specified. This specifies the domain name of ER server and + * is used to construct FILS wrapped data IE. + * @fils_erp_realm_len: Length of @fils_erp_realm in octets. + * @fils_erp_next_seq_num: The next sequence number to use in the FILS ERP + * messages. This is also used to construct FILS wrapped data IE. + * @fils_erp_rrk: ERP re-authentication Root Key (rRK) used to derive additional + * keys in FILS or %NULL if not specified. + * @fils_erp_rrk_len: Length of @fils_erp_rrk in octets. 
*/ struct cfg80211_connect_params { struct ieee80211_channel *channel; @@ -2098,6 +2111,13 @@ struct cfg80211_connect_params { bool pbss; struct cfg80211_bss_selection bss_select; const u8 *prev_bssid; + const u8 *fils_erp_username; + size_t fils_erp_username_len; + const u8 *fils_erp_realm; + size_t fils_erp_realm_len; + u16 fils_erp_next_seq_num; + const u8 *fils_erp_rrk; + size_t fils_erp_rrk_len; }; /** @@ -2136,12 +2156,27 @@ enum wiphy_params_flags { * This structure is passed to the set/del_pmksa() method for PMKSA * caching. * - * @bssid: The AP's BSSID. - * @pmkid: The PMK material itself. + * @bssid: The AP's BSSID (may be %NULL). + * @pmkid: The identifier to refer a PMKSA. + * @pmk: The PMK for the PMKSA identified by @pmkid. This is used for key + * derivation by a FILS STA. Otherwise, %NULL. + * @pmk_len: Length of the @pmk. The length of @pmk can differ depending on + * the hash algorithm used to generate this. + * @ssid: SSID to specify the ESS within which a PMKSA is valid when using FILS + * cache identifier (may be %NULL). + * @ssid_len: Length of the @ssid in octets. + * @cache_id: 2-octet cache identifier advertized by a FILS AP identifying the + * scope of PMKSA. This is valid only if @ssid_len is non-zero (may be + * %NULL). */ struct cfg80211_pmksa { const u8 *bssid; const u8 *pmkid; + const u8 *pmk; + size_t pmk_len; + const u8 *ssid; + size_t ssid_len; + const u8 *cache_id; }; /** @@ -5153,6 +5188,17 @@ static inline void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp) * @req_ie_len: Association request IEs length * @resp_ie: Association response IEs (may be %NULL) * @resp_ie_len: Association response IEs length + * @fils_kek: KEK derived from a successful FILS connection (may be %NULL) + * @fils_kek_len: Length of @fils_kek in octets + * @update_erp_next_seq_num: Boolean value to specify whether the value in + * @fils_erp_next_seq_num is valid. + * @fils_erp_next_seq_num: The next sequence number to use in ERP message in + * FILS Authentication. This value should be specified irrespective of the + * status for a FILS connection. + * @pmk: A new PMK if derived from a successful FILS connection (may be %NULL). + * @pmk_len: Length of @pmk in octets + * @pmkid: A new PMKID if derived from a successful FILS connection or the PMKID + * used for this FILS connection (may be %NULL). * @timeout_reason: Reason for connection timeout. This is used when the * connection fails due to a timeout instead of an explicit rejection from * the AP. %NL80211_TIMEOUT_UNSPECIFIED is used when the timeout reason is @@ -5168,6 +5214,13 @@ struct cfg80211_connect_resp_params { size_t req_ie_len; const u8 *resp_ie; size_t resp_ie_len; + const u8 *fils_kek; + size_t fils_kek_len; + bool update_erp_next_seq_num; + u16 fils_erp_next_seq_num; + const u8 *pmk; + size_t pmk_len; + const u8 *pmkid; enum nl80211_timeout_reason timeout_reason; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index cd4dfef58fab..6095a6c4c412 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -172,6 +172,42 @@ * Multiple such rules can be created. */ +/** + * DOC: FILS shared key authentication offload + * + * FILS shared key authentication offload can be advertized by drivers by + * setting @NL80211_EXT_FEATURE_FILS_SK_OFFLOAD flag. 
The drivers that support + * FILS shared key authentication offload should be able to construct the + * authentication and association frames for FILS shared key authentication and + * eventually do a key derivation as per IEEE 802.11ai. The below additional + * parameters should be given to driver in %NL80211_CMD_CONNECT. + * %NL80211_ATTR_FILS_ERP_USERNAME - used to construct keyname_nai + * %NL80211_ATTR_FILS_ERP_REALM - used to construct keyname_nai + * %NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM - used to construct erp message + * %NL80211_ATTR_FILS_ERP_RRK - used to generate the rIK and rMSK + * rIK should be used to generate an authentication tag on the ERP message and + * rMSK should be used to derive a PMKSA. + * rIK, rMSK should be generated and keyname_nai, sequence number should be used + * as specified in IETF RFC 6696. + * + * When FILS shared key authentication is completed, driver needs to provide the + * below additional parameters to userspace. + * %NL80211_ATTR_FILS_KEK - used for key renewal + * %NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM - used in further EAP-RP exchanges + * %NL80211_ATTR_PMKID - used to identify the PMKSA used/generated + * %Nl80211_ATTR_PMK - used to update PMKSA cache in userspace + * The PMKSA can be maintained in userspace persistently so that it can be used + * later after reboots or wifi turn off/on also. + * + * %NL80211_ATTR_FILS_CACHE_ID is the cache identifier advertized by a FILS + * capable AP supporting PMK caching. It specifies the scope within which the + * PMKSAs are cached in an ESS. %NL80211_CMD_SET_PMKSA and + * %NL80211_CMD_DEL_PMKSA are enhanced to allow support for PMKSA caching based + * on FILS cache identifier. Additionally %NL80211_ATTR_PMK is used with + * %NL80211_SET_PMKSA to specify the PMK corresponding to a PMKSA for driver to + * use in a FILS shared key connection with PMKSA caching. + */ + /** * enum nl80211_commands - supported nl80211 commands * @@ -370,10 +406,18 @@ * @NL80211_CMD_NEW_SURVEY_RESULTS: survey data notification (as a reply to * NL80211_CMD_GET_SURVEY and on the "scan" multicast group) * - * @NL80211_CMD_SET_PMKSA: Add a PMKSA cache entry, using %NL80211_ATTR_MAC - * (for the BSSID) and %NL80211_ATTR_PMKID. + * @NL80211_CMD_SET_PMKSA: Add a PMKSA cache entry using %NL80211_ATTR_MAC + * (for the BSSID), %NL80211_ATTR_PMKID, and optionally %NL80211_ATTR_PMK + * (PMK is used for PTKSA derivation in case of FILS shared key offload) or + * using %NL80211_ATTR_SSID, %NL80211_ATTR_FILS_CACHE_ID, + * %NL80211_ATTR_PMKID, and %NL80211_ATTR_PMK in case of FILS + * authentication where %NL80211_ATTR_FILS_CACHE_ID is the identifier + * advertized by a FILS capable AP identifying the scope of PMKSA in an + * ESS. * @NL80211_CMD_DEL_PMKSA: Delete a PMKSA cache entry, using %NL80211_ATTR_MAC - * (for the BSSID) and %NL80211_ATTR_PMKID. + * (for the BSSID) and %NL80211_ATTR_PMKID or using %NL80211_ATTR_SSID, + * %NL80211_ATTR_FILS_CACHE_ID, and %NL80211_ATTR_PMKID in case of FILS + * authentication. * @NL80211_CMD_FLUSH_PMKSA: Flush all PMKSA cache entries. * * @NL80211_CMD_REG_CHANGE: indicates to userspace the regulatory domain @@ -2012,6 +2056,31 @@ enum nl80211_commands { * u32 attribute with an &enum nl80211_timeout_reason value. This is used, * e.g., with %NL80211_CMD_CONNECT event. * + * @NL80211_ATTR_FILS_ERP_USERNAME: EAP Re-authentication Protocol (ERP) + * username part of NAI used to refer keys rRK and rIK. This is used with + * %NL80211_CMD_CONNECT. 
+ * + * @NL80211_ATTR_FILS_ERP_REALM: EAP Re-authentication Protocol (ERP) realm part + * of NAI specifying the domain name of the ER server. This is used with + * %NL80211_CMD_CONNECT. + * + * @NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM: Unsigned 16-bit ERP next sequence number + * to use in ERP messages. This is used in generating the FILS wrapped data + * for FILS authentication and is used with %NL80211_CMD_CONNECT. + * + * @NL80211_ATTR_FILS_ERP_RRK: ERP re-authentication Root Key (rRK) for the + * NAI specified by %NL80211_ATTR_FILS_ERP_USERNAME and + * %NL80211_ATTR_FILS_ERP_REALM. This is used for generating rIK and rMSK + * from successful FILS authentication and is used with + * %NL80211_CMD_CONNECT. + * + * @NL80211_ATTR_FILS_CACHE_ID: A 2-octet identifier advertized by a FILS AP + * identifying the scope of PMKSAs. This is used with + * @NL80211_CMD_SET_PMKSA and @NL80211_CMD_DEL_PMKSA. + * + * @NL80211_ATTR_PMK: PMK for the PMKSA identified by %NL80211_ATTR_PMKID. + * This is used with @NL80211_CMD_SET_PMKSA. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2423,6 +2492,14 @@ enum nl80211_attrs { NL80211_ATTR_TIMEOUT_REASON, + NL80211_ATTR_FILS_ERP_USERNAME, + NL80211_ATTR_FILS_ERP_REALM, + NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM, + NL80211_ATTR_FILS_ERP_RRK, + NL80211_ATTR_FILS_CACHE_ID, + + NL80211_ATTR_PMK, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -4759,6 +4836,8 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_CQM_RSSI_LIST: With this driver the * %NL80211_ATTR_CQM_RSSI_THOLD attribute accepts a list of zero or more * RSSI threshold values to monitor rather than exactly one threshold. + * @NL80211_EXT_FEATURE_FILS_SK_OFFLOAD: Driver SME supports FILS shared key + * authentication with %NL80211_CMD_CONNECT. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. 
@@ -4778,6 +4857,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED, NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI, NL80211_EXT_FEATURE_CQM_RSSI_LIST, + NL80211_EXT_FEATURE_FILS_SK_OFFLOAD, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3d635c865281..9910aae08f1a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -410,6 +410,15 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { .len = sizeof(struct nl80211_bss_select_rssi_adjust) }, [NL80211_ATTR_TIMEOUT_REASON] = { .type = NLA_U32 }, + [NL80211_ATTR_FILS_ERP_USERNAME] = { .type = NLA_BINARY, + .len = FILS_ERP_MAX_USERNAME_LEN }, + [NL80211_ATTR_FILS_ERP_REALM] = { .type = NLA_BINARY, + .len = FILS_ERP_MAX_REALM_LEN }, + [NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] = { .type = NLA_U16 }, + [NL80211_ATTR_FILS_ERP_RRK] = { .type = NLA_BINARY, + .len = FILS_ERP_MAX_RRK_LEN }, + [NL80211_ATTR_FILS_CACHE_ID] = { .len = 2 }, + [NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN }, }; /* policy for the key attributes */ @@ -3832,6 +3841,19 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, return false; return true; case NL80211_CMD_CONNECT: + /* SAE not supported yet */ + if (auth_type == NL80211_AUTHTYPE_SAE) + return false; + /* FILS with SK PFS or PK not supported yet */ + if (auth_type == NL80211_AUTHTYPE_FILS_SK_PFS || + auth_type == NL80211_AUTHTYPE_FILS_PK) + return false; + if (!wiphy_ext_feature_isset( + &rdev->wiphy, + NL80211_EXT_FEATURE_FILS_SK_OFFLOAD) && + auth_type == NL80211_AUTHTYPE_FILS_SK) + return false; + return true; case NL80211_CMD_START_AP: /* SAE not supported yet */ if (auth_type == NL80211_AUTHTYPE_SAE) @@ -8906,6 +8928,35 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) } } + if (wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_FILS_SK_OFFLOAD) && + info->attrs[NL80211_ATTR_FILS_ERP_USERNAME] && + info->attrs[NL80211_ATTR_FILS_ERP_REALM] && + info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] && + info->attrs[NL80211_ATTR_FILS_ERP_RRK]) { + connect.fils_erp_username = + nla_data(info->attrs[NL80211_ATTR_FILS_ERP_USERNAME]); + connect.fils_erp_username_len = + nla_len(info->attrs[NL80211_ATTR_FILS_ERP_USERNAME]); + connect.fils_erp_realm = + nla_data(info->attrs[NL80211_ATTR_FILS_ERP_REALM]); + connect.fils_erp_realm_len = + nla_len(info->attrs[NL80211_ATTR_FILS_ERP_REALM]); + connect.fils_erp_next_seq_num = + nla_get_u16( + info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM]); + connect.fils_erp_rrk = + nla_data(info->attrs[NL80211_ATTR_FILS_ERP_RRK]); + connect.fils_erp_rrk_len = + nla_len(info->attrs[NL80211_ATTR_FILS_ERP_RRK]); + } else if (info->attrs[NL80211_ATTR_FILS_ERP_USERNAME] || + info->attrs[NL80211_ATTR_FILS_ERP_REALM] || + info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] || + info->attrs[NL80211_ATTR_FILS_ERP_RRK]) { + kzfree(connkeys); + return -EINVAL; + } + wdev_lock(dev->ieee80211_ptr); err = cfg80211_connect(rdev, dev, &connect, connkeys, @@ -9025,14 +9076,28 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info) memset(&pmksa, 0, sizeof(struct cfg80211_pmksa)); - if (!info->attrs[NL80211_ATTR_MAC]) - return -EINVAL; - if (!info->attrs[NL80211_ATTR_PMKID]) return -EINVAL; pmksa.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]); - pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (info->attrs[NL80211_ATTR_MAC]) { + pmksa.bssid = 
nla_data(info->attrs[NL80211_ATTR_MAC]); + } else if (info->attrs[NL80211_ATTR_SSID] && + info->attrs[NL80211_ATTR_FILS_CACHE_ID] && + (info->genlhdr->cmd == NL80211_CMD_DEL_PMKSA || + info->attrs[NL80211_ATTR_PMK])) { + pmksa.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); + pmksa.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); + pmksa.cache_id = + nla_data(info->attrs[NL80211_ATTR_FILS_CACHE_ID]); + } else { + return -EINVAL; + } + if (info->attrs[NL80211_ATTR_PMK]) { + pmksa.pmk = nla_data(info->attrs[NL80211_ATTR_PMK]); + pmksa.pmk_len = nla_len(info->attrs[NL80211_ATTR_PMK]); + } if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) @@ -13471,7 +13536,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, struct sk_buff *msg; void *hdr; - msg = nlmsg_new(100 + cr->req_ie_len + cr->resp_ie_len, gfp); + msg = nlmsg_new(100 + cr->req_ie_len + cr->resp_ie_len + + cr->fils_kek_len + cr->pmk_len + + (cr->pmkid ? WLAN_PMKID_LEN : 0), gfp); if (!msg) return; @@ -13496,7 +13563,18 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, nla_put(msg, NL80211_ATTR_REQ_IE, cr->req_ie_len, cr->req_ie)) || (cr->resp_ie && nla_put(msg, NL80211_ATTR_RESP_IE, cr->resp_ie_len, - cr->resp_ie))) + cr->resp_ie)) || + (cr->update_erp_next_seq_num && + nla_put_u16(msg, NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM, + cr->fils_erp_next_seq_num)) || + (cr->status == WLAN_STATUS_SUCCESS && + ((cr->fils_kek && + nla_put(msg, NL80211_ATTR_FILS_KEK, cr->fils_kek_len, + cr->fils_kek)) || + (cr->pmk && + nla_put(msg, NL80211_ATTR_PMK, cr->pmk_len, cr->pmk)) || + (cr->pmkid && + nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, cr->pmkid))))) goto nla_put_failure; genlmsg_end(msg, hdr); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index ebd7adc27246..6459bb7c21f7 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -805,7 +805,9 @@ void cfg80211_connect_done(struct net_device *dev, } ev = kzalloc(sizeof(*ev) + (params->bssid ? ETH_ALEN : 0) + - params->req_ie_len + params->resp_ie_len, gfp); + params->req_ie_len + params->resp_ie_len + + params->fils_kek_len + params->pmk_len + + (params->pmkid ? 
WLAN_PMKID_LEN : 0), gfp); if (!ev) { cfg80211_put_bss(wdev->wiphy, params->bss); return; } @@ -832,6 +834,27 @@ void cfg80211_connect_done(struct net_device *dev, params->resp_ie_len); next += params->resp_ie_len; } + if (params->fils_kek_len) { + ev->cr.fils_kek = next; + ev->cr.fils_kek_len = params->fils_kek_len; + memcpy((void *)ev->cr.fils_kek, params->fils_kek, + params->fils_kek_len); + next += params->fils_kek_len; + } + if (params->pmk_len) { + ev->cr.pmk = next; + ev->cr.pmk_len = params->pmk_len; + memcpy((void *)ev->cr.pmk, params->pmk, params->pmk_len); + next += params->pmk_len; + } + if (params->pmkid) { + ev->cr.pmkid = next; + memcpy((void *)ev->cr.pmkid, params->pmkid, WLAN_PMKID_LEN); + next += WLAN_PMKID_LEN; + } + ev->cr.update_erp_next_seq_num = params->update_erp_next_seq_num; + if (params->update_erp_next_seq_num) + ev->cr.fils_erp_next_seq_num = params->fils_erp_next_seq_num; if (params->bss) cfg80211_hold_bss(bss_from_pub(params->bss)); ev->cr.bss = params->bss; -- cgit v1.2.3 From f7048b15900f36fe21398fba94777b8aab3b376d Mon Sep 17 00:00:00 2001 From: Vignesh R Date: Fri, 24 Mar 2017 10:57:59 +0530 Subject: tty: serial_core: Add name field to uart_port struct Introduce a field to store the name of a uart_port, which can be used to easily identify UART port instances on a system that has more than one UART instance. The name is of the form ttyXN (e.g. ttyS0, ttyAMA0, ...), where N is the number of that particular UART instance. This field will be useful when printing debug info for a particular port or when registering IRQs with a unique IRQ name. The port name is populated during uart_add_one_port(). Signed-off-by: Vignesh R Reviewed-by: Andy Shevchenko Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_core.c | 7 +++++++ include/linux/serial_core.h | 1 + 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 0fb3f7cce62a..f5572e28d16a 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -2744,6 +2744,12 @@ int uart_add_one_port(struct uart_driver *drv, struct uart_port *uport) state->pm_state = UART_PM_STATE_UNDEFINED; uport->cons = drv->cons; uport->minor = drv->tty_driver->minor_start + uport->line; + uport->name = kasprintf(GFP_KERNEL, "%s%d", drv->dev_name, + drv->tty_driver->name_base + uport->line); + if (!uport->name) { + ret = -ENOMEM; + goto out; + } /* * If this port is a console, then the spinlock is already @@ -2861,6 +2867,7 @@ int uart_remove_one_port(struct uart_driver *drv, struct uart_port *uport) if (uport->type != PORT_UNKNOWN && uport->ops->release_port) uport->ops->release_port(uport); kfree(uport->tty_groups); + kfree(uport->name); /* * Indicate that there isn't a port here anymore.
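 */

/*
 * Illustrative sketch, not part of this patch: with uport->name
 * populated, a driver can register a uniquely named interrupt per
 * port, one of the uses noted above. my_handler is a hypothetical
 * irq_handler_t:
 *
 *	ret = devm_request_irq(uport->dev, uport->irq, my_handler,
 *			       0, uport->name, uport);
 */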
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 58484fb35cc8..60530678c633 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -247,6 +247,7 @@ struct uart_port { unsigned char suspended; unsigned char irq_wake; unsigned char unused[2]; + char *name; /* port name */ struct attribute_group *attr_group; /* port specific attributes */ const struct attribute_group **tty_groups; /* all attributes (serial core use only) */ struct serial_rs485 rs485; -- cgit v1.2.3 From 7ed98e0168bd23d8ea3294e95254cc5b4000c948 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 10 Mar 2017 10:46:14 +0000 Subject: drivers/perf: arm_pmu: manage interrupts per-cpu When requesting or freeing interrupts, we use platform_get_irq() to find relevant irqs, backing this up with additional information in an optional irq_affinity table. This means that our irq request and free paths are tied to a platform_device, and our request path must jump through a number of hoops in order to determine the required affinity of each interrupt. Given that the affinity must be static, we can compute the affinity once up-front at probe time, simplifying the irq request and free paths. By recording interrupts in a per-cpu data structure, we simplify a few paths, and permit a subsequent rework of the request and free paths. Signed-off-by: Mark Rutland [will: rename local nr_irqs variable to avoid conflict with global] Signed-off-by: Will Deacon --- drivers/perf/arm_pmu.c | 314 ++++++++++++++++++++++--------------------- include/linux/perf/arm_pmu.h | 3 +- 2 files changed, 166 insertions(+), 151 deletions(-) (limited to 'include/linux') diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index ad60e966f174..e984653b93aa 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -617,94 +617,76 @@ static void cpu_pmu_disable_percpu_irq(void *data) static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu) { - int i, irq, irqs; - struct platform_device *pmu_device = cpu_pmu->plat_device; + int cpu; struct pmu_hw_events __percpu *hw_events = cpu_pmu->hw_events; - irqs = min(pmu_device->num_resources, num_possible_cpus()); - - irq = platform_get_irq(pmu_device, 0); - if (irq > 0 && irq_is_percpu(irq)) { - on_each_cpu_mask(&cpu_pmu->supported_cpus, - cpu_pmu_disable_percpu_irq, &irq, 1); - free_percpu_irq(irq, &hw_events->percpu_pmu); - } else { - for (i = 0; i < irqs; ++i) { - int cpu = i; - - if (cpu_pmu->irq_affinity) - cpu = cpu_pmu->irq_affinity[i]; - - if (!cpumask_test_and_clear_cpu(cpu, &cpu_pmu->active_irqs)) - continue; - irq = platform_get_irq(pmu_device, i); - if (irq > 0) - free_irq(irq, per_cpu_ptr(&hw_events->percpu_pmu, cpu)); + for_each_cpu(cpu, &cpu_pmu->supported_cpus) { + int irq = per_cpu(hw_events->irq, cpu); + if (!irq) + continue; + + if (irq_is_percpu(irq)) { + on_each_cpu_mask(&cpu_pmu->supported_cpus, + cpu_pmu_disable_percpu_irq, &irq, 1); + free_percpu_irq(irq, &hw_events->percpu_pmu); + + break; } + + if (!cpumask_test_and_clear_cpu(cpu, &cpu_pmu->active_irqs)) + continue; + + free_irq(irq, per_cpu_ptr(&hw_events->percpu_pmu, cpu)); } } static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler) { - int i, err, irq, irqs; - struct platform_device *pmu_device = cpu_pmu->plat_device; + int cpu, err; struct pmu_hw_events __percpu *hw_events = cpu_pmu->hw_events; - if (!pmu_device) - return -ENODEV; - - irqs = min(pmu_device->num_resources, num_possible_cpus()); - if (irqs < 1) { - pr_warn_once("perf/ARM: No irqs for PMU defined, 
sampling events not supported\n"); - return 0; - } - - irq = platform_get_irq(pmu_device, 0); - if (irq > 0 && irq_is_percpu(irq)) { - err = request_percpu_irq(irq, handler, "arm-pmu", - &hw_events->percpu_pmu); - if (err) { - pr_err("unable to request IRQ%d for ARM PMU counters\n", - irq); - return err; - } - - on_each_cpu_mask(&cpu_pmu->supported_cpus, - cpu_pmu_enable_percpu_irq, &irq, 1); - } else { - for (i = 0; i < irqs; ++i) { - int cpu = i; - - err = 0; - irq = platform_get_irq(pmu_device, i); - if (irq < 0) - continue; - - if (cpu_pmu->irq_affinity) - cpu = cpu_pmu->irq_affinity[i]; - - /* - * If we have a single PMU interrupt that we can't shift, - * assume that we're running on a uniprocessor machine and - * continue. Otherwise, continue without this interrupt. - */ - if (irq_set_affinity(irq, cpumask_of(cpu)) && irqs > 1) { - pr_warn("unable to set irq affinity (irq=%d, cpu=%u)\n", - irq, cpu); - continue; - } + for_each_cpu(cpu, &cpu_pmu->supported_cpus) { + int irq = per_cpu(hw_events->irq, cpu); + if (!irq) + continue; - err = request_irq(irq, handler, - IRQF_NOBALANCING | IRQF_NO_THREAD, "arm-pmu", - per_cpu_ptr(&hw_events->percpu_pmu, cpu)); + if (irq_is_percpu(irq)) { + err = request_percpu_irq(irq, handler, "arm-pmu", + &hw_events->percpu_pmu); if (err) { pr_err("unable to request IRQ%d for ARM PMU counters\n", irq); return err; } - cpumask_set_cpu(cpu, &cpu_pmu->active_irqs); + on_each_cpu_mask(&cpu_pmu->supported_cpus, + cpu_pmu_enable_percpu_irq, &irq, 1); + + break; } + + /* + * If we have a single PMU interrupt that we can't shift, + * assume that we're running on a uniprocessor machine and + * continue. Otherwise, continue without this interrupt. + */ + if (irq_set_affinity(irq, cpumask_of(cpu)) && + num_possible_cpus() > 1) { + pr_warn("unable to set irq affinity (irq=%d, cpu=%u)\n", + irq, cpu); + continue; + } + + err = request_irq(irq, handler, + IRQF_NOBALANCING | IRQF_NO_THREAD, "arm-pmu", + per_cpu_ptr(&hw_events->percpu_pmu, cpu)); + if (err) { + pr_err("unable to request IRQ%d for ARM PMU counters\n", + irq); + return err; + } + + cpumask_set_cpu(cpu, &cpu_pmu->active_irqs); } return 0; @@ -846,10 +828,6 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu) on_each_cpu_mask(&cpu_pmu->supported_cpus, cpu_pmu->reset, cpu_pmu, 1); - /* If no interrupts available, set the corresponding capability flag */ - if (!platform_get_irq(cpu_pmu->plat_device, 0)) - cpu_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; - /* * This is a CPU PMU potentially in a heterogeneous configuration (e.g. * big.LITTLE). 
This is not an uncore PMU, and we have taken ctx @@ -897,98 +875,133 @@ static int probe_current_pmu(struct arm_pmu *pmu, return ret; } -static int of_pmu_irq_cfg(struct arm_pmu *pmu) +static int pmu_parse_percpu_irq(struct arm_pmu *pmu, int irq) { - int *irqs, i = 0; - bool using_spi = false; - struct platform_device *pdev = pmu->plat_device; + int cpu, ret; + struct pmu_hw_events __percpu *hw_events = pmu->hw_events; - irqs = kcalloc(pdev->num_resources, sizeof(*irqs), GFP_KERNEL); - if (!irqs) - return -ENOMEM; + ret = irq_get_percpu_devid_partition(irq, &pmu->supported_cpus); + if (ret) + return ret; - do { - struct device_node *dn; - int cpu, irq; + for_each_cpu(cpu, &pmu->supported_cpus) + per_cpu(hw_events->irq, cpu) = irq; - /* See if we have an affinity entry */ - dn = of_parse_phandle(pdev->dev.of_node, "interrupt-affinity", i); - if (!dn) - break; + return 0; +} - /* Check the IRQ type and prohibit a mix of PPIs and SPIs */ - irq = platform_get_irq(pdev, i); - if (irq > 0) { - bool spi = !irq_is_percpu(irq); - - if (i > 0 && spi != using_spi) { - pr_err("PPI/SPI IRQ type mismatch for %s!\n", - dn->name); - of_node_put(dn); - kfree(irqs); - return -EINVAL; - } +static bool pmu_has_irq_affinity(struct device_node *node) +{ + return !!of_find_property(node, "interrupt-affinity", NULL); +} - using_spi = spi; - } +static int pmu_parse_irq_affinity(struct device_node *node, int i) +{ + struct device_node *dn; + int cpu; - /* Now look up the logical CPU number */ - for_each_possible_cpu(cpu) { - struct device_node *cpu_dn; + /* + * If we don't have an interrupt-affinity property, we guess irq + * affinity matches our logical CPU order, as we used to assume. + * This is fragile, so we'll warn in pmu_parse_irqs(). + */ + if (!pmu_has_irq_affinity(node)) + return i; - cpu_dn = of_cpu_device_node_get(cpu); - of_node_put(cpu_dn); + dn = of_parse_phandle(node, "interrupt-affinity", i); + if (!dn) { + pr_warn("failed to parse interrupt-affinity[%d] for %s\n", + i, node->name); + return -EINVAL; + } - if (dn == cpu_dn) - break; - } + /* Now look up the logical CPU number */ + for_each_possible_cpu(cpu) { + struct device_node *cpu_dn; + + cpu_dn = of_cpu_device_node_get(cpu); + of_node_put(cpu_dn); - if (cpu >= nr_cpu_ids) { - pr_warn("Failed to find logical CPU for %s\n", - dn->name); - of_node_put(dn); - cpumask_setall(&pmu->supported_cpus); + if (dn == cpu_dn) break; - } - of_node_put(dn); + } - /* For SPIs, we need to track the affinity per IRQ */ - if (using_spi) { - if (i >= pdev->num_resources) - break; + if (cpu >= nr_cpu_ids) { + pr_warn("failed to find logical CPU for %s\n", dn->name); + } - irqs[i] = cpu; - } + of_node_put(dn); - /* Keep track of the CPUs containing this PMU type */ - cpumask_set_cpu(cpu, &pmu->supported_cpus); - i++; - } while (1); + return cpu; +} + +static int pmu_parse_irqs(struct arm_pmu *pmu) +{ + int i = 0, irqs; + struct platform_device *pdev = pmu->plat_device; + struct pmu_hw_events __percpu *hw_events = pmu->hw_events; + + irqs = platform_irq_count(pdev); + if (irqs < 0) { + pr_err("unable to count PMU IRQs\n"); + return irqs; + } + + /* + * In this case we have no idea which CPUs are covered by the PMU. + * To match our prior behaviour, we assume all CPUs in this case. 
+ */ + if (irqs == 0) { + pr_warn("no irqs for PMU, sampling events not supported\n"); + pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; + cpumask_setall(&pmu->supported_cpus); + return 0; + } - /* If we didn't manage to parse anything, try the interrupt affinity */ - if (cpumask_weight(&pmu->supported_cpus) == 0) { + if (irqs == 1) { int irq = platform_get_irq(pdev, 0); + if (irq && irq_is_percpu(irq)) + return pmu_parse_percpu_irq(pmu, irq); + } - if (irq > 0 && irq_is_percpu(irq)) { - /* If using PPIs, check the affinity of the partition */ - int ret; + if (!pmu_has_irq_affinity(pdev->dev.of_node)) { + pr_warn("no interrupt-affinity property for %s, guessing.\n", + of_node_full_name(pdev->dev.of_node)); + } - ret = irq_get_percpu_devid_partition(irq, &pmu->supported_cpus); - if (ret) { - kfree(irqs); - return ret; - } - } else { - /* Otherwise default to all CPUs */ - cpumask_setall(&pmu->supported_cpus); + /* + * Some platforms have all PMU IRQs OR'd into a single IRQ, with a + * special platdata function that attempts to demux them. + */ + if (dev_get_platdata(&pdev->dev)) + cpumask_setall(&pmu->supported_cpus); + + for (i = 0; i < irqs; i++) { + int cpu, irq; + + irq = platform_get_irq(pdev, i); + if (WARN_ON(irq <= 0)) + continue; + + if (irq_is_percpu(irq)) { + pr_warn("multiple PPIs or mismatched SPI/PPI detected\n"); + return -EINVAL; } - } - /* If we matched up the IRQ affinities, use them to route the SPIs */ - if (using_spi && i == pdev->num_resources) - pmu->irq_affinity = irqs; - else - kfree(irqs); + cpu = pmu_parse_irq_affinity(pdev->dev.of_node, i); + if (cpu < 0) + return cpu; + if (cpu >= nr_cpu_ids) + continue; + + if (per_cpu(hw_events->irq, cpu)) { + pr_warn("multiple PMU IRQs for the same CPU detected\n"); + return -EINVAL; + } + + per_cpu(hw_events->irq, cpu) = irq; + cpumask_set_cpu(cpu, &pmu->supported_cpus); + } return 0; } @@ -1050,6 +1063,10 @@ int arm_pmu_device_probe(struct platform_device *pdev, pmu->plat_device = pdev; + ret = pmu_parse_irqs(pmu); + if (ret) + goto out_free; + if (node && (of_id = of_match_node(of_table, pdev->dev.of_node))) { init_fn = of_id->data; @@ -1062,9 +1079,7 @@ int arm_pmu_device_probe(struct platform_device *pdev, pmu->secure_access = false; } - ret = of_pmu_irq_cfg(pmu); - if (!ret) - ret = init_fn(pmu); + ret = init_fn(pmu); } else if (probe_table) { cpumask_setall(&pmu->supported_cpus); ret = probe_current_pmu(pmu, probe_table); @@ -1097,7 +1112,6 @@ out_destroy: out_free: pr_info("%s: failed to register PMU devices!\n", of_node_full_name(node)); - kfree(pmu->irq_affinity); armpmu_free(pmu); return ret; } diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 8462da266089..05a3eb447fc8 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -75,6 +75,8 @@ struct pmu_hw_events { * already have to allocate this struct per cpu. */ struct arm_pmu *percpu_pmu; + + int irq; }; enum armpmu_attr_groups { @@ -88,7 +90,6 @@ struct arm_pmu { struct pmu pmu; cpumask_t active_irqs; cpumask_t supported_cpus; - int *irq_affinity; char *name; irqreturn_t (*handle_irq)(int irq_num, void *dev); void (*enable)(struct perf_event *event); -- cgit v1.2.3 From c09adab01e4aeecfa3dfae0946409844400c5901 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 10 Mar 2017 10:46:15 +0000 Subject: drivers/perf: arm_pmu: split irq request from enable For historical reasons, we lazily request and free interrupts in the arm pmu driver. 
This requires us to refcount use of the pmu (by way of counting the active events) in order to request/free interrupts at the correct times, which complicates the driver somewhat. The existing logic is flawed, as it only considers currently online CPUs when requesting, freeing, or managing the affinity of interrupts. Intervening hotplug events can result in erroneous IRQ affinity, online CPUs for which interrupts have not been requested, or offline CPUs whose interrupts are still requested. To fix this, this patch splits the requesting of interrupts from any per-cpu management (i.e. per-cpu enable/disable, and configuration of cpu affinity). We now request all interrupts up-front at probe time (and never free them, since we never unregister PMUs). The management of affinity, and per-cpu enable/disable now happens in our cpu hotplug callback, ensuring it occurs consistently. This means that we must now invoke the CPU hotplug callback at boot time in order to configure IRQs, and since the callback also resets the PMU hardware, we can remove the duplicate reset in the probe path. This rework renders our event refcounting unnecessary, so this is removed. Signed-off-by: Mark Rutland [will: make armpmu_get_cpu_irq static] Signed-off-by: Will Deacon --- drivers/perf/arm_pmu.c | 153 ++++++++++++++----------------------------- include/linux/perf/arm_pmu.h | 4 -- 2 files changed, 50 insertions(+), 107 deletions(-) (limited to 'include/linux') diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index e984653b93aa..a1dfe895cb1d 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -352,37 +352,6 @@ static irqreturn_t armpmu_dispatch_irq(int irq, void *dev) return ret; } -static void -armpmu_release_hardware(struct arm_pmu *armpmu) -{ - armpmu->free_irq(armpmu); -} - -static int -armpmu_reserve_hardware(struct arm_pmu *armpmu) -{ - int err = armpmu->request_irq(armpmu, armpmu_dispatch_irq); - if (err) { - armpmu_release_hardware(armpmu); - return err; - } - - return 0; -} - -static void -hw_perf_event_destroy(struct perf_event *event) -{ - struct arm_pmu *armpmu = to_arm_pmu(event->pmu); - atomic_t *active_events = &armpmu->active_events; - struct mutex *pmu_reserve_mutex = &armpmu->reserve_mutex; - - if (atomic_dec_and_mutex_lock(active_events, pmu_reserve_mutex)) { - armpmu_release_hardware(armpmu); - mutex_unlock(pmu_reserve_mutex); - } -} - static int event_requires_mode_exclusion(struct perf_event_attr *attr) { @@ -455,8 +424,6 @@ __hw_perf_event_init(struct perf_event *event) static int armpmu_event_init(struct perf_event *event) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); - int err = 0; - atomic_t *active_events = &armpmu->active_events; /* * Reject CPU-affine events for CPUs that are of a different class to @@ -476,26 +443,7 @@ static int armpmu_event_init(struct perf_event *event) if (armpmu->map_event(event) == -ENOENT) return -ENOENT; - event->destroy = hw_perf_event_destroy; - - if (!atomic_inc_not_zero(active_events)) { - mutex_lock(&armpmu->reserve_mutex); - if (atomic_read(active_events) == 0) - err = armpmu_reserve_hardware(armpmu); - - if (!err) - atomic_inc(active_events); - mutex_unlock(&armpmu->reserve_mutex); - } - - if (err) - return err; - - err = __hw_perf_event_init(event); - if (err) - hw_perf_event_destroy(event); - - return err; + return __hw_perf_event_init(event); } static void armpmu_enable(struct pmu *pmu) @@ -555,9 +503,6 @@ static struct attribute_group armpmu_common_attr_group = { static void armpmu_init(struct arm_pmu *armpmu) { - 
atomic_set(&armpmu->active_events, 0); - mutex_init(&armpmu->reserve_mutex); - armpmu->pmu = (struct pmu) { .pmu_enable = armpmu_enable, .pmu_disable = armpmu_disable, @@ -601,21 +546,7 @@ int perf_num_counters(void) } EXPORT_SYMBOL_GPL(perf_num_counters); -static void cpu_pmu_enable_percpu_irq(void *data) -{ - int irq = *(int *)data; - - enable_percpu_irq(irq, IRQ_TYPE_NONE); -} - -static void cpu_pmu_disable_percpu_irq(void *data) -{ - int irq = *(int *)data; - - disable_percpu_irq(irq); -} - -static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu) +static void cpu_pmu_free_irqs(struct arm_pmu *cpu_pmu) { int cpu; struct pmu_hw_events __percpu *hw_events = cpu_pmu->hw_events; @@ -626,10 +557,7 @@ static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu) continue; if (irq_is_percpu(irq)) { - on_each_cpu_mask(&cpu_pmu->supported_cpus, - cpu_pmu_disable_percpu_irq, &irq, 1); free_percpu_irq(irq, &hw_events->percpu_pmu); - break; } @@ -640,7 +568,7 @@ static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu) } } -static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler) +static int cpu_pmu_request_irqs(struct arm_pmu *cpu_pmu, irq_handler_t handler) { int cpu, err; struct pmu_hw_events __percpu *hw_events = cpu_pmu->hw_events; @@ -656,25 +584,9 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler) if (err) { pr_err("unable to request IRQ%d for ARM PMU counters\n", irq); - return err; } - on_each_cpu_mask(&cpu_pmu->supported_cpus, - cpu_pmu_enable_percpu_irq, &irq, 1); - - break; - } - - /* - * If we have a single PMU interrupt that we can't shift, - * assume that we're running on a uniprocessor machine and - * continue. Otherwise, continue without this interrupt. - */ - if (irq_set_affinity(irq, cpumask_of(cpu)) && - num_possible_cpus() > 1) { - pr_warn("unable to set irq affinity (irq=%d, cpu=%u)\n", - irq, cpu); - continue; + return err; } err = request_irq(irq, handler, @@ -692,6 +604,12 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler) return 0; } +static int armpmu_get_cpu_irq(struct arm_pmu *pmu, int cpu) +{ + struct pmu_hw_events __percpu *hw_events = pmu->hw_events; + return per_cpu(hw_events->irq, cpu); +} + /* * PMU hardware loses all context when a CPU goes offline. 
* When a CPU is hotplugged back in, since some hardware registers are @@ -701,11 +619,42 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler) static int arm_perf_starting_cpu(unsigned int cpu, struct hlist_node *node) { struct arm_pmu *pmu = hlist_entry_safe(node, struct arm_pmu, node); + int irq; if (!cpumask_test_cpu(cpu, &pmu->supported_cpus)) return 0; if (pmu->reset) pmu->reset(pmu); + + irq = armpmu_get_cpu_irq(pmu, cpu); + if (irq) { + if (irq_is_percpu(irq)) { + enable_percpu_irq(irq, IRQ_TYPE_NONE); + return 0; + } + + if (irq_force_affinity(irq, cpumask_of(cpu)) && + num_possible_cpus() > 1) { + pr_warn("unable to set irq affinity (irq=%d, cpu=%u)\n", + irq, cpu); + } + } + + return 0; +} + +static int arm_perf_teardown_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct arm_pmu *pmu = hlist_entry_safe(node, struct arm_pmu, node); + int irq; + + if (!cpumask_test_cpu(cpu, &pmu->supported_cpus)) + return 0; + + irq = armpmu_get_cpu_irq(pmu, cpu); + if (irq && irq_is_percpu(irq)) + disable_percpu_irq(irq); + return 0; } @@ -811,8 +760,12 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu) { int err; - err = cpuhp_state_add_instance_nocalls(CPUHP_AP_PERF_ARM_STARTING, - &cpu_pmu->node); + err = cpu_pmu_request_irqs(cpu_pmu, armpmu_dispatch_irq); + if (err) + goto out; + + err = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_STARTING, + &cpu_pmu->node); if (err) goto out; @@ -820,14 +773,6 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu) if (err) goto out_unregister; - cpu_pmu->request_irq = cpu_pmu_request_irq; - cpu_pmu->free_irq = cpu_pmu_free_irq; - - /* Ensure the PMU has sane values out of reset. */ - if (cpu_pmu->reset) - on_each_cpu_mask(&cpu_pmu->supported_cpus, cpu_pmu->reset, - cpu_pmu, 1); - /* * This is a CPU PMU potentially in a heterogeneous configuration (e.g. * big.LITTLE). This is not an uncore PMU, and we have taken ctx @@ -842,6 +787,7 @@ out_unregister: cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_STARTING, &cpu_pmu->node); out: + cpu_pmu_free_irqs(cpu_pmu); return err; } @@ -1122,7 +1068,8 @@ static int arm_pmu_hp_init(void) ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_STARTING, "perf/arm/pmu:starting", - arm_perf_starting_cpu, NULL); + arm_perf_starting_cpu, + arm_perf_teardown_cpu); if (ret) pr_err("CPU hotplug notifier for ARM PMU could not be registered: %d\n", ret); diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 05a3eb447fc8..44f43fcf2524 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -105,12 +105,8 @@ struct arm_pmu { void (*start)(struct arm_pmu *); void (*stop)(struct arm_pmu *); void (*reset)(void *); - int (*request_irq)(struct arm_pmu *, irq_handler_t handler); - void (*free_irq)(struct arm_pmu *); int (*map_event)(struct perf_event *event); int num_events; - atomic_t active_events; - struct mutex reserve_mutex; u64 max_period; bool secure_access; /* 32-bit ARM only */ #define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 -- cgit v1.2.3 From 1cf1cae963c2e6032aebe1637e995bc2f5d330f4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Mar 2017 21:45:38 -0700 Subject: bpf: introduce BPF_PROG_TEST_RUN command development and testing of networking bpf programs is quite cumbersome. Despite availability of user space bpf interpreters the kernel is the ultimate authority and execution environment. Current test frameworks for TC include creation of netns, veth, qdiscs and use of various packet generators just to test functionality of a bpf program. 
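With the BPF_PROG_TEST_RUN command introduced below, such a functional check shrinks to a single bpf(2) call; a rough userspace sketch (prog_fd and pkt[] are placeholders for an already-loaded program and a test Ethernet frame; needs <linux/bpf.h>, <sys/syscall.h>, <unistd.h> and <stdio.h>):

	/* illustrative sketch, not part of this patch */
	union bpf_attr attr = {};
	__u8 out[1500];

	attr.test.prog_fd = prog_fd;	/* placeholder: loaded program */
	attr.test.data_in = (__u64)(unsigned long)pkt;
	attr.test.data_size_in = sizeof(pkt);
	attr.test.data_out = (__u64)(unsigned long)out;
	attr.test.repeat = 1000000;	/* duration reports avg ns per run */

	if (syscall(__NR_bpf, BPF_PROG_TEST_RUN, &attr, sizeof(attr)) == 0)
		printf("retval=%u avg=%uns out_len=%u\n",
		       attr.test.retval, attr.test.duration,
		       attr.test.data_size_out);
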
XDP testing is even more complicated, since qemu needs to be started with gro/gso disabled and a precise queue configuration, the xdp program transferred from the host into the guest, attached to virtio/eth0, and traffic generated from the host while the results are captured from the guest. Moreover, analyzing performance bottlenecks in an XDP program is impossible in a virtio environment, since the cost of running the program is tiny compared to the overhead of virtio packet processing, so performance testing can only be done on a physical NIC with another server generating traffic. Furthermore, ongoing changes to the user space control plane of production applications cannot be run on the test servers, leaving bpf programs stubbed out for testing. Last but not least, the upstream llvm changes are validated by the bpf backend testsuite, which has no ability to test the generated code. To improve this situation, introduce a BPF_PROG_TEST_RUN command to test and performance-benchmark bpf programs. Joint work with Daniel Borkmann. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/linux/bpf.h | 7 ++ include/uapi/linux/bpf.h | 12 ++++ kernel/bpf/syscall.c | 27 +++++++- net/Makefile | 2 +- net/bpf/Makefile | 1 + net/bpf/test_run.c | 172 +++++++++++++++++++++++++++++++++++++++++++++++ net/core/filter.c | 5 ++ 7 files changed, 223 insertions(+), 3 deletions(-) create mode 100644 net/bpf/Makefile create mode 100644 net/bpf/test_run.c (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2ae39a3e9ead..bbb513da5075 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -169,6 +169,8 @@ struct bpf_verifier_ops { const struct bpf_insn *src, struct bpf_insn *dst, struct bpf_prog *prog); + int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); }; struct bpf_prog_type_list { @@ -233,6 +235,11 @@ typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); +int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); +int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); + #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 28317a04c34d..a1d95386f562 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -81,6 +81,7 @@ enum bpf_cmd { BPF_OBJ_GET, BPF_PROG_ATTACH, BPF_PROG_DETACH, + BPF_PROG_TEST_RUN, }; enum bpf_map_type { @@ -189,6 +190,17 @@ union bpf_attr { __u32 attach_type; __u32 attach_flags; }; + + struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ + __u32 prog_fd; + __u32 retval; + __u32 data_size_in; + __u32 data_size_out; + __aligned_u64 data_in; + __aligned_u64 data_out; + __u32 repeat; + __u32 duration; + } test; } __attribute__((aligned(8))); /* BPF helper function descriptions: diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c35ebfe6d84d..ab0cf4c43690 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -973,6 +973,28 @@ static int bpf_prog_detach(const union bpf_attr *attr) } #endif /* CONFIG_CGROUP_BPF */ +#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration + +static int bpf_prog_test_run(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_prog *prog;
int ret = -ENOTSUPP; + + if (CHECK_ATTR(BPF_PROG_TEST_RUN)) + return -EINVAL; + + prog = bpf_prog_get(attr->test.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->aux->ops->test_run) + ret = prog->aux->ops->test_run(prog, attr, uattr); + + bpf_prog_put(prog); + return ret; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1039,7 +1061,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; - #ifdef CONFIG_CGROUP_BPF case BPF_PROG_ATTACH: err = bpf_prog_attach(&attr); @@ -1048,7 +1069,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_prog_detach(&attr); break; #endif - + case BPF_PROG_TEST_RUN: + err = bpf_prog_test_run(&attr, uattr); + break; default: err = -EINVAL; break; diff --git a/net/Makefile b/net/Makefile index 9b681550e3a3..9086ffbb5085 100644 --- a/net/Makefile +++ b/net/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_NET) += $(tmp-y) # LLC has to be linked before the files in net/802/ obj-$(CONFIG_LLC) += llc/ -obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ +obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ bpf/ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_XFRM) += xfrm/ diff --git a/net/bpf/Makefile b/net/bpf/Makefile new file mode 100644 index 000000000000..27b2992a0692 --- /dev/null +++ b/net/bpf/Makefile @@ -0,0 +1 @@ +obj-y := test_run.o diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c new file mode 100644 index 000000000000..8a6d0a37c30c --- /dev/null +++ b/net/bpf/test_run.c @@ -0,0 +1,172 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include + +static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx) +{ + u32 ret; + + preempt_disable(); + rcu_read_lock(); + ret = BPF_PROG_RUN(prog, ctx); + rcu_read_unlock(); + preempt_enable(); + + return ret; +} + +static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) +{ + u64 time_start, time_spent = 0; + u32 ret = 0, i; + + if (!repeat) + repeat = 1; + time_start = ktime_get_ns(); + for (i = 0; i < repeat; i++) { + ret = bpf_test_run_one(prog, ctx); + if (need_resched()) { + if (signal_pending(current)) + break; + time_spent += ktime_get_ns() - time_start; + cond_resched(); + time_start = ktime_get_ns(); + } + } + time_spent += ktime_get_ns() - time_start; + do_div(time_spent, repeat); + *time = time_spent > U32_MAX ? 
U32_MAX : (u32)time_spent; + + return ret; +} + +static int bpf_test_finish(union bpf_attr __user *uattr, const void *data, + u32 size, u32 retval, u32 duration) +{ + void __user *data_out = u64_to_user_ptr(uattr->test.data_out); + int err = -EFAULT; + + if (data_out && copy_to_user(data_out, data, size)) + goto out; + if (copy_to_user(&uattr->test.data_size_out, &size, sizeof(size))) + goto out; + if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval))) + goto out; + if (copy_to_user(&uattr->test.duration, &duration, sizeof(duration))) + goto out; + err = 0; +out: + return err; +} + +static void *bpf_test_init(const union bpf_attr *kattr, u32 size, + u32 headroom, u32 tailroom) +{ + void __user *data_in = u64_to_user_ptr(kattr->test.data_in); + void *data; + + if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom) + return ERR_PTR(-EINVAL); + + data = kzalloc(size + headroom + tailroom, GFP_USER); + if (!data) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(data + headroom, data_in, size)) { + kfree(data); + return ERR_PTR(-EFAULT); + } + return data; +} + +int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + bool is_l2 = false, is_direct_pkt_access = false; + u32 size = kattr->test.data_size_in; + u32 repeat = kattr->test.repeat; + u32 retval, duration; + struct sk_buff *skb; + void *data; + int ret; + + data = bpf_test_init(kattr, size, NET_SKB_PAD, + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); + if (IS_ERR(data)) + return PTR_ERR(data); + + switch (prog->type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + is_l2 = true; + /* fall through */ + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_XMIT: + is_direct_pkt_access = true; + break; + default: + break; + } + + skb = build_skb(data, 0); + if (!skb) { + kfree(data); + return -ENOMEM; + } + + skb_reserve(skb, NET_SKB_PAD); + __skb_put(skb, size); + skb->protocol = eth_type_trans(skb, current->nsproxy->net_ns->loopback_dev); + skb_reset_network_header(skb); + + if (is_l2) + __skb_push(skb, ETH_HLEN); + if (is_direct_pkt_access) + bpf_compute_data_end(skb); + retval = bpf_test_run(prog, skb, repeat, &duration); + if (!is_l2) + __skb_push(skb, ETH_HLEN); + size = skb->len; + /* bpf program can never convert linear skb to non-linear */ + if (WARN_ON_ONCE(skb_is_nonlinear(skb))) + size = skb_headlen(skb); + ret = bpf_test_finish(uattr, skb->data, size, retval, duration); + kfree_skb(skb); + return ret; +} + +int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + u32 size = kattr->test.data_size_in; + u32 repeat = kattr->test.repeat; + struct xdp_buff xdp = {}; + u32 retval, duration; + void *data; + int ret; + + data = bpf_test_init(kattr, size, XDP_PACKET_HEADROOM, 0); + if (IS_ERR(data)) + return PTR_ERR(data); + + xdp.data_hard_start = data; + xdp.data = data + XDP_PACKET_HEADROOM; + xdp.data_end = xdp.data + size; + + retval = bpf_test_run(prog, &xdp, repeat, &duration); + if (xdp.data != data + XDP_PACKET_HEADROOM) + size = xdp.data_end - xdp.data; + ret = bpf_test_finish(uattr, xdp.data, size, retval, duration); + kfree(data); + return ret; +} diff --git a/net/core/filter.c b/net/core/filter.c index dfb9f61a2fd5..15e9a81ffebe 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3309,24 +3309,28 @@ static const struct bpf_verifier_ops tc_cls_act_ops = { .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = 
tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, + .test_run = bpf_prog_test_run_skb, }; static const struct bpf_verifier_ops xdp_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, + .test_run = bpf_prog_test_run_xdp, }; static const struct bpf_verifier_ops cg_skb_ops = { .get_func_proto = cg_skb_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, + .test_run = bpf_prog_test_run_skb, }; static const struct bpf_verifier_ops lwt_inout_ops = { .get_func_proto = lwt_inout_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, + .test_run = bpf_prog_test_run_skb, }; static const struct bpf_verifier_ops lwt_xmit_ops = { @@ -3334,6 +3338,7 @@ static const struct bpf_verifier_ops lwt_xmit_ops = { .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, + .test_run = bpf_prog_test_run_skb, }; static const struct bpf_verifier_ops cg_sock_ops = { -- cgit v1.2.3 From 3d8417d79e0da6a47ff29932ef80486be78af56e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 31 Mar 2017 11:47:39 +0200 Subject: udp: use sk_protocol instead of pcflag to detect udplite sockets In the udp_sock struct, the 'forward_deficit' and 'pcflag' fields share the same cacheline. While the first is dirtied by udp_recvmsg, the latter is read, possibly several times, by the bottom half processing to discriminate between udp and udplite sockets. With this patch, sk->sk_protocol is used to check whether the socket is really a udplite one, avoiding some cache misses per packet and improving the performance under udp_flood with small packets by up to 10%. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/udp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index c0f530809d1f..6cb4061a720d 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -115,6 +115,6 @@ static inline bool udp_get_no_check6_rx(struct sock *sk) #define udp_portaddr_for_each_entry_rcu(__sk, list) \ hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node) -#define IS_UDPLITE(__sk) (udp_sk(__sk)->pcflag) +#define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE) #endif /* _LINUX_UDP_H */ -- cgit v1.2.3 From bf17aa36c0f199f5b254262e77eaefda7da0f50b Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 1 Mar 2017 18:22:01 -0800 Subject: nvme: Correct NVMF enum values to match NVMe-oF rev 1.0 The enum values for QPTYPE, PRTYPE and CMS are off by 1 from the values defined in figure 42 of the NVM Express over Fabrics 1.0 specification: http://www.nvmexpress.org/wp-content/uploads/NVMe_over_Fabrics_1_0_Gold_20160605-1.pdf Fix our enums to match the final spec.
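With the corrected values, a consumer can decode the PRTYPE field of a discovery log page entry's TSAS directly against the spec; an illustrative sketch (nvmf_prtype_name() is a hypothetical helper, not part of this patch):

	static const char *nvmf_prtype_name(u8 prtype)
	{
		switch (prtype) {
		case NVMF_RDMA_PRTYPE_NOT_SPECIFIED:
			return "not specified";
		case NVMF_RDMA_PRTYPE_IB:
			return "infiniband";
		case NVMF_RDMA_PRTYPE_ROCE:
			return "roce";
		case NVMF_RDMA_PRTYPE_ROCEV2:
			return "roce-v2";
		case NVMF_RDMA_PRTYPE_IWARP:
			return "iwarp";
		default:
			return "unknown";
		}
	}
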
Signed-off-by: Roland Dreier Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- include/linux/nvme.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index c43d435d4225..9061780b141f 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -64,26 +64,26 @@ enum { * RDMA_QPTYPE field */ enum { - NVMF_RDMA_QPTYPE_CONNECTED = 0, /* Reliable Connected */ - NVMF_RDMA_QPTYPE_DATAGRAM = 1, /* Reliable Datagram */ + NVMF_RDMA_QPTYPE_CONNECTED = 1, /* Reliable Connected */ + NVMF_RDMA_QPTYPE_DATAGRAM = 2, /* Reliable Datagram */ }; /* RDMA QP Service Type codes for Discovery Log Page entry TSAS * RDMA_QPTYPE field */ enum { - NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 0, /* No Provider Specified */ - NVMF_RDMA_PRTYPE_IB = 1, /* InfiniBand */ - NVMF_RDMA_PRTYPE_ROCE = 2, /* InfiniBand RoCE */ - NVMF_RDMA_PRTYPE_ROCEV2 = 3, /* InfiniBand RoCEV2 */ - NVMF_RDMA_PRTYPE_IWARP = 4, /* IWARP */ + NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 1, /* No Provider Specified */ + NVMF_RDMA_PRTYPE_IB = 2, /* InfiniBand */ + NVMF_RDMA_PRTYPE_ROCE = 3, /* InfiniBand RoCE */ + NVMF_RDMA_PRTYPE_ROCEV2 = 4, /* InfiniBand RoCEV2 */ + NVMF_RDMA_PRTYPE_IWARP = 5, /* IWARP */ }; /* RDMA Connection Management Service Type codes for Discovery Log Page * entry TSAS RDMA_CMS field */ enum { - NVMF_RDMA_CMS_RDMA_CM = 0, /* Sockets based enpoint addressing */ + NVMF_RDMA_CMS_RDMA_CM = 1, /* Sockets based endpoint addressing */ }; #define NVMF_AQ_DEPTH 32 -- cgit v1.2.3 From 5ba6bcbc335771c37d05b88cbfcad5441b57130b Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 7 Mar 2017 06:15:15 -0800 Subject: hwmon: Constify str parameter of hwmon_ops->read_string The read_string callback is supposed to retrieve a pointer to a constant string. Signed-off-by: Jean Delvare Reviewed-by: Peter Huewe Signed-off-by: Guenter Roeck --- drivers/hwmon/hwmon.c | 2 +- include/linux/hwmon.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c index 28375d59cc36..dd6e17c1076b 100644 --- a/drivers/hwmon/hwmon.c +++ b/drivers/hwmon/hwmon.c @@ -186,7 +186,7 @@ static ssize_t hwmon_attr_show_string(struct device *dev, char *buf) { struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr); - char *s; + const char *s; int ret; ret = hattr->ops->read_string(dev, hattr->type, hattr->attr, diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 88b673749121..ceb751987c40 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -337,7 +337,7 @@ struct hwmon_ops { int (*read)(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, long *val); int (*read_string)(struct device *dev, enum hwmon_sensor_types type, - u32 attr, int channel, char **str); + u32 attr, int channel, const char **str); int (*write)(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, long val); }; -- cgit v1.2.3 From 27c0e3748e41ca79171ffa3e97415a20af6facd0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Feb 2017 18:42:24 -0500 Subject: [iov_iter] new primitive: iov_iter_revert() opposite to iov_iter_advance(); the caller is responsible for never using it to move back past the initial position.
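
A minimal usage sketch (illustrative only; my_dev and my_dev_submit are hypothetical): a caller that consumed bytes from an iterator and then hit an error can rewind by exactly the amount it consumed, never past where it started:

	static ssize_t example_write(struct my_dev *dev, struct iov_iter *from)
	{
		size_t copied = copy_from_iter(dev->buf, dev->buflen, from);

		if (my_dev_submit(dev, copied) < 0) {
			/* Undo only what we consumed above. */
			iov_iter_revert(from, copied);
			return -EIO;
		}
		return copied;
	}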
Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- include/linux/uio.h | 6 ++++- lib/iov_iter.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 804e34c6f981..f2d36a3d3005 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -39,7 +39,10 @@ struct iov_iter { }; union { unsigned long nr_segs; - int idx; + struct { + int idx; + int start_idx; + }; }; }; @@ -81,6 +84,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to); size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes); void iov_iter_advance(struct iov_iter *i, size_t bytes); +void iov_iter_revert(struct iov_iter *i, size_t bytes); int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes); size_t iov_iter_single_seg_count(const struct iov_iter *i); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, diff --git a/lib/iov_iter.c b/lib/iov_iter.c index e68604ae3ced..60abc44385b7 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -786,6 +786,68 @@ void iov_iter_advance(struct iov_iter *i, size_t size) } EXPORT_SYMBOL(iov_iter_advance); +void iov_iter_revert(struct iov_iter *i, size_t unroll) +{ + if (!unroll) + return; + i->count += unroll; + if (unlikely(i->type & ITER_PIPE)) { + struct pipe_inode_info *pipe = i->pipe; + int idx = i->idx; + size_t off = i->iov_offset; + while (1) { + size_t n = off - pipe->bufs[idx].offset; + if (unroll < n) { + off -= (n - unroll); + break; + } + unroll -= n; + if (!unroll && idx == i->start_idx) { + off = 0; + break; + } + if (!idx--) + idx = pipe->buffers - 1; + off = pipe->bufs[idx].offset + pipe->bufs[idx].len; + } + i->iov_offset = off; + i->idx = idx; + pipe_truncate(i); + return; + } + if (unroll <= i->iov_offset) { + i->iov_offset -= unroll; + return; + } + unroll -= i->iov_offset; + if (i->type & ITER_BVEC) { + const struct bio_vec *bvec = i->bvec; + while (1) { + size_t n = (--bvec)->bv_len; + i->nr_segs++; + if (unroll <= n) { + i->bvec = bvec; + i->iov_offset = n - unroll; + return; + } + unroll -= n; + } + } else { /* same logics for iovec and kvec */ + const struct iovec *iov = i->iov; + while (1) { + size_t n = (--iov)->iov_len; + i->nr_segs++; + if (unroll <= n) { + i->iov = iov; + i->iov_offset = n - unroll; + return; + } + unroll -= n; + } + } +} +EXPORT_SYMBOL(iov_iter_revert); + /* * Return the count of just the current iov_iter segment. */ @@ -839,6 +901,7 @@ void iov_iter_pipe(struct iov_iter *i, int direction, i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); i->iov_offset = 0; i->count = count; + i->start_idx = i->idx; } EXPORT_SYMBOL(iov_iter_pipe); -- cgit v1.2.3 From 0e056eb5530da802c07f080d6bbd43c50e799efd Mon Sep 17 00:00:00 2001 From: "mchehab@s-opensource.com" Date: Thu, 30 Mar 2017 17:11:36 -0300 Subject: kernel-api.rst: fix a series of errors when parsing C files ./lib/string.c:134: WARNING: Inline emphasis start-string without end-string. ./mm/filemap.c:522: WARNING: Inline interpreted text or phrase reference start-string without end-string. ./mm/filemap.c:1283: ERROR: Unexpected indentation. ./mm/filemap.c:3003: WARNING: Inline interpreted text or phrase reference start-string without end-string. ./mm/vmalloc.c:1544: WARNING: Inline emphasis start-string without end-string. ./mm/page_alloc.c:4245: ERROR: Unexpected indentation. ./ipc/util.c:676: ERROR: Unexpected indentation. 
./drivers/pci/irq.c:35: WARNING: Block quote ends without a blank line; unexpected unindent. ./security/security.c:109: ERROR: Unexpected indentation. ./security/security.c:110: WARNING: Definition list ends without a blank line; unexpected unindent. ./block/genhd.c:275: WARNING: Inline strong start-string without end-string. ./block/genhd.c:283: WARNING: Inline strong start-string without end-string. ./include/linux/clk.h:134: WARNING: Inline emphasis start-string without end-string. ./include/linux/clk.h:134: WARNING: Inline emphasis start-string without end-string. ./ipc/util.c:477: ERROR: Unknown target name: "s". Signed-off-by: Mauro Carvalho Chehab Acked-by: Bjorn Helgaas Signed-off-by: Jonathan Corbet --- block/genhd.c | 7 ++++--- drivers/pci/irq.c | 2 +- include/linux/clk.h | 4 ++-- ipc/util.c | 12 +++++++----- lib/string.c | 2 +- mm/filemap.c | 18 ++++++++++-------- mm/page_alloc.c | 3 ++- mm/vmalloc.c | 2 +- security/security.c | 12 ++++++++---- 9 files changed, 36 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index b26a5ea115d0..a53bfd19a0ec 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -271,16 +271,17 @@ void blkdev_show(struct seq_file *seqf, off_t offset) /** * register_blkdev - register a new block device * - * @major: the requested major device number [1..255]. If @major=0, try to + * @major: the requested major device number [1..255]. If @major = 0, try to * allocate any unused major number. * @name: the name of the new block device as a zero terminated string * * The @name must be unique within the system. * - * The return value depends on the @major input parameter. + * The return value depends on the @major input parameter: + * * - if a major device number was requested in range [1..255] then the * function returns zero on success, or a negative error code - * - if any unused major number was requested with @major=0 parameter + * - if any unused major number was requested with @major = 0 parameter * then the return value is the allocated major number in range * [1..255] or a negative error code otherwise */ diff --git a/drivers/pci/irq.c b/drivers/pci/irq.c index 6684f153ab57..f9f2a0324ecc 100644 --- a/drivers/pci/irq.c +++ b/drivers/pci/irq.c @@ -31,7 +31,7 @@ static void pci_note_irq_problem(struct pci_dev *pdev, const char *reason) * driver). * * Returns: - * a suggestion for fixing it (although the driver is not required to + * a suggestion for fixing it (although the driver is not required to * act on this). */ enum pci_lost_interrupt_reason pci_lost_interrupt(struct pci_dev *pdev) diff --git a/include/linux/clk.h b/include/linux/clk.h index e9d36b3e49de..024cd07870d0 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -132,8 +132,8 @@ int clk_get_phase(struct clk *clk); * @q: clk compared against p * * Returns true if the two struct clk pointers both point to the same hardware - * clock node. Put differently, returns true if struct clk *p and struct clk *q - * share the same struct clk_core object. + * clock node. Put differently, returns true if @p and @q + * share the same &struct clk_core object. * * Returns false otherwise. Note that two NULL clks are treated as matching. */ diff --git a/ipc/util.c b/ipc/util.c index 798cad18dd87..3459a16a9df9 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -474,7 +474,7 @@ void ipc_rcu_free(struct rcu_head *head) * Check user, group, other permissions for access * to ipc resources. 
return 0 if allowed * - * @flag will most probably be 0 or S_...UGO from + * @flag will most probably be 0 or ``S_...UGO`` from */ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flag) { @@ -672,10 +672,12 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) * * This function does some common audit and permissions check for some IPC_XXX * cmd and is called from semctl_down, shmctl_down and msgctl_down. - * It must be called without any lock held and - * - retrieves the ipc with the given id in the given table. - * - performs some audit and permission check, depending on the given cmd - * - returns a pointer to the ipc object or otherwise, the corresponding error. + * It must be called without any lock held and: + * + * - retrieves the ipc with the given id in the given table. + * - performs some audit and permission check, depending on the given cmd + * - returns a pointer to the ipc object or otherwise, the corresponding + * error. * * Call holding the both the rwsem and the rcu read lock. */ diff --git a/lib/string.c b/lib/string.c index ed83562a53ae..b5c9a1168d3a 100644 --- a/lib/string.c +++ b/lib/string.c @@ -131,7 +131,7 @@ EXPORT_SYMBOL(strncpy); * @src: Where to copy the string from * @size: size of destination buffer * - * Compatible with *BSD: the result is always a valid + * Compatible with ``*BSD``: the result is always a valid * NUL-terminated string that fits in the buffer (unless, * of course, the buffer size is zero). It does not pad * out the result like strncpy() does. diff --git a/mm/filemap.c b/mm/filemap.c index 1694623a6289..c5808b7a5fb1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -519,7 +519,7 @@ EXPORT_SYMBOL(filemap_write_and_wait); * * Write out and wait upon file offsets lstart->lend, inclusive. * - * Note that `lend' is inclusive (describes the last byte to be written) so + * Note that @lend is inclusive (describes the last byte to be written) so * that this function can be used to write to the very end-of-file (end = -1). */ int filemap_write_and_wait_range(struct address_space *mapping, @@ -1277,12 +1277,14 @@ EXPORT_SYMBOL(find_lock_entry); * * PCG flags modify how the page is returned. * - * FGP_ACCESSED: the page will be marked accessed - * FGP_LOCK: Page is return locked - * FGP_CREAT: If page is not present then a new page is allocated using - * @gfp_mask and added to the page cache and the VM's LRU - * list. The page is returned locked and with an increased - * refcount. Otherwise, %NULL is returned. + * @fgp_flags can be: + * + * - FGP_ACCESSED: the page will be marked accessed + * - FGP_LOCK: Page is return locked + * - FGP_CREAT: If page is not present then a new page is allocated using + * @gfp_mask and added to the page cache and the VM's LRU + * list. The page is returned locked and with an increased + * refcount. Otherwise, NULL is returned. * * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even * if the GFP flags specified for FGP_CREAT are atomic. @@ -3001,7 +3003,7 @@ EXPORT_SYMBOL(generic_file_write_iter); * @gfp_mask: memory allocation flags (and I/O mode) * * The address_space is to try to release any data against the page - * (presumably at page->private). If the release was successful, return `1'. + * (presumably at page->private). If the release was successful, return '1'. * Otherwise return zero. 
* * This may also be called if PG_fscache is set on a page, indicating that the diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eaa64d2ffdc5..c1b68edaf106 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4242,7 +4242,8 @@ EXPORT_SYMBOL(free_pages_exact); * nr_free_zone_pages() counts the number of counts pages which are beyond the * high watermark within all zones at or below a given zone index. For each * zone, the number of pages is calculated as: - * managed_pages - high_pages + * + * nr_free_zone_pages = managed_pages - high_pages */ static unsigned long nr_free_zone_pages(int offset) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b4024d688f38..c24db06f15c4 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1540,7 +1540,7 @@ void vfree_atomic(const void *addr) * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling * conventions for vfree() arch-depenedent would be a really bad idea) * - * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node) + * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) */ void vfree(const void *addr) { diff --git a/security/security.c b/security/security.c index d0e07f269b2d..23555c5504f6 100644 --- a/security/security.c +++ b/security/security.c @@ -103,10 +103,14 @@ static int lsm_append(char *new, char **result) * to avoid security registration races. This method may also be used * to check if your LSM is currently loaded during kernel initialization. * - * Return true if: - * -The passed LSM is the one chosen by user at boot time, - * -or the passed LSM is configured as the default and the user did not - * choose an alternate LSM at boot time. + * Returns: + * + * true if: + * + * - The passed LSM is the one chosen by user at boot time, + * - or the passed LSM is configured as the default and the user did not + * choose an alternate LSM at boot time. + * * Otherwise, return false. */ int __init security_module_enable(const char *module) -- cgit v1.2.3 From fff292914d3a2f1efd05ca71c2ba72a3c663201e Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 31 Mar 2017 15:20:48 +0300 Subject: security, keys: convert key.usage from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This helps avoid accidental refcounter overflows that might lead to use-after-free situations.
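
For reference, the converted call pattern as it appears in the hunks below; unlike atomic_inc(), refcount_inc() saturates and warns on overflow instead of wrapping to zero, which is what closes the overflow-to-use-after-free window:

	refcount_set(&key->usage, 1);			/* at construction */
	if (!refcount_inc_not_zero(&key->usage))	/* key may be dying */
		/* skip it */;
	if (refcount_dec_and_test(&key->usage))		/* last reference gone */
		schedule_work(&key_gc_work);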
Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Acked-by: David Howells Signed-off-by: James Morris --- include/linux/key.h | 5 +++-- security/keys/gc.c | 2 +- security/keys/key.c | 6 +++--- security/keys/keyring.c | 8 ++++---- security/keys/proc.c | 2 +- security/keys/request_key_auth.c | 2 +- 6 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/key.h b/include/linux/key.h index e45212f2777e..9d9fac583dd3 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -23,6 +23,7 @@ #include #include #include +#include #ifdef __KERNEL__ #include @@ -135,7 +136,7 @@ static inline bool is_key_possessed(const key_ref_t key_ref) * - Kerberos TGTs and tickets */ struct key { - atomic_t usage; /* number of references */ + refcount_t usage; /* number of references */ key_serial_t serial; /* key serial number */ union { struct list_head graveyard_link; @@ -242,7 +243,7 @@ extern void key_put(struct key *key); static inline struct key *__key_get(struct key *key) { - atomic_inc(&key->usage); + refcount_inc(&key->usage); return key; } diff --git a/security/keys/gc.c b/security/keys/gc.c index addf060399e0..44789256c88c 100644 --- a/security/keys/gc.c +++ b/security/keys/gc.c @@ -220,7 +220,7 @@ continue_scanning: key = rb_entry(cursor, struct key, serial_node); cursor = rb_next(cursor); - if (atomic_read(&key->usage) == 0) + if (refcount_read(&key->usage) == 0) goto found_unreferenced_key; if (unlikely(gc_state & KEY_GC_REAPING_DEAD_1)) { diff --git a/security/keys/key.c b/security/keys/key.c index 346fbf201c22..ff9244392d35 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -285,7 +285,7 @@ struct key *key_alloc(struct key_type *type, const char *desc, if (!key->index_key.description) goto no_memory_3; - atomic_set(&key->usage, 1); + refcount_set(&key->usage, 1); init_rwsem(&key->sem); lockdep_set_class(&key->sem, &type->lock_class); key->index_key.type = type; @@ -621,7 +621,7 @@ void key_put(struct key *key) if (key) { key_check(key); - if (atomic_dec_and_test(&key->usage)) + if (refcount_dec_and_test(&key->usage)) schedule_work(&key_gc_work); } } @@ -656,7 +656,7 @@ not_found: found: /* pretend it doesn't exist if it is awaiting deletion */ - if (atomic_read(&key->usage) == 0) + if (refcount_read(&key->usage) == 0) goto not_found; /* this races with key_put(), but that doesn't matter since key_put() diff --git a/security/keys/keyring.c b/security/keys/keyring.c index c91e4e0cea08..3d95f7d02ba1 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -1033,7 +1033,7 @@ struct key *find_keyring_by_name(const char *name, bool skip_perm_check) /* we've got a match but we might end up racing with * key_cleanup() if the keyring is currently 'dead' * (ie. 
it has a zero usage count) */ - if (!atomic_inc_not_zero(&keyring->usage)) + if (!refcount_inc_not_zero(&keyring->usage)) continue; keyring->last_used_at = current_kernel_time().tv_sec; goto out; @@ -1250,14 +1250,14 @@ int key_link(struct key *keyring, struct key *key) struct assoc_array_edit *edit; int ret; - kenter("{%d,%d}", keyring->serial, atomic_read(&keyring->usage)); + kenter("{%d,%d}", keyring->serial, refcount_read(&keyring->usage)); key_check(keyring); key_check(key); ret = __key_link_begin(keyring, &key->index_key, &edit); if (ret == 0) { - kdebug("begun {%d,%d}", keyring->serial, atomic_read(&keyring->usage)); + kdebug("begun {%d,%d}", keyring->serial, refcount_read(&keyring->usage)); ret = __key_link_check_restriction(keyring, key); if (ret == 0) ret = __key_link_check_live_key(keyring, key); @@ -1266,7 +1266,7 @@ int key_link(struct key *keyring, struct key *key) __key_link_end(keyring, &key->index_key, edit); } - kleave(" = %d {%d,%d}", ret, keyring->serial, atomic_read(&keyring->usage)); + kleave(" = %d {%d,%d}", ret, keyring->serial, refcount_read(&keyring->usage)); return ret; } EXPORT_SYMBOL(key_link); diff --git a/security/keys/proc.c b/security/keys/proc.c index b9f531c9e4fa..69199f18bfb3 100644 --- a/security/keys/proc.c +++ b/security/keys/proc.c @@ -252,7 +252,7 @@ static int proc_keys_show(struct seq_file *m, void *v) showflag(key, 'U', KEY_FLAG_USER_CONSTRUCT), showflag(key, 'N', KEY_FLAG_NEGATIVE), showflag(key, 'i', KEY_FLAG_INVALIDATED), - atomic_read(&key->usage), + refcount_read(&key->usage), xbuf, key->perm, from_kuid_munged(seq_user_ns(m), key->uid), diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index 6bbe2f535f08..0f062156dfb2 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -213,7 +213,7 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, if (ret < 0) goto error_inst; - kleave(" = {%d,%d}", authkey->serial, atomic_read(&authkey->usage)); + kleave(" = {%d,%d}", authkey->serial, refcount_read(&authkey->usage)); return authkey; auth_key_revoked: -- cgit v1.2.3 From 3209f68b3ca4667069923a325c88b21131bfdf9f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 31 Mar 2017 18:32:17 +0100 Subject: statx: Include a mask for stx_attributes in struct statx Include a mask in struct statx to indicate which bits of stx_attributes the filesystem actually supports. This would also be useful if we add another system call that allows you to do a 'bulk attribute set' and pass in a statx struct with the masks appropriately set to say what you want to set.
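
A short userspace sketch of what the mask enables (illustrative only, assuming a statx() syscall wrapper is available): callers can now tell "bit is clear" apart from "bit is not reported by this filesystem":

	struct statx stx;

	if (statx(AT_FDCWD, "file", 0, STATX_BASIC_STATS, &stx) == 0) {
		if (!(stx.stx_attributes_mask & STATX_ATTR_ENCRYPTED))
			puts("encryption state not reported");
		else if (stx.stx_attributes & STATX_ATTR_ENCRYPTED)
			puts("encrypted");
		else
			puts("not encrypted");
	}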
Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/ext4/inode.c | 6 ++++++ fs/stat.c | 1 + include/linux/stat.h | 1 + include/uapi/linux/stat.h | 4 ++-- samples/statx/test-statx.c | 12 ++++++++---- 5 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5d02b922afa3..b9ffa9f4191f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5413,6 +5413,12 @@ int ext4_getattr(const struct path *path, struct kstat *stat, if (flags & EXT4_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_COMPRESSED | + STATX_ATTR_ENCRYPTED | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); + generic_fillattr(inode, stat); return 0; } diff --git a/fs/stat.c b/fs/stat.c index 0c7e6cdc435c..c6c963b2546b 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -527,6 +527,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_ino = stat->ino; tmp.stx_size = stat->size; tmp.stx_blocks = stat->blocks; + tmp.stx_attributes_mask = stat->attributes_mask; tmp.stx_atime.tv_sec = stat->atime.tv_sec; tmp.stx_atime.tv_nsec = stat->atime.tv_nsec; tmp.stx_btime.tv_sec = stat->btime.tv_sec; diff --git a/include/linux/stat.h b/include/linux/stat.h index c76e524fb34b..64b6b3aece21 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -26,6 +26,7 @@ struct kstat { unsigned int nlink; uint32_t blksize; /* Preferred I/O size */ u64 attributes; + u64 attributes_mask; #define KSTAT_ATTR_FS_IOC_FLAGS \ (STATX_ATTR_COMPRESSED | \ STATX_ATTR_IMMUTABLE | \ diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 0869b9eaa8ce..d538897b8e08 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -114,7 +114,7 @@ struct statx { __u64 stx_ino; /* Inode number */ __u64 stx_size; /* File size */ __u64 stx_blocks; /* Number of 512-byte blocks allocated */ - __u64 __spare1[1]; + __u64 stx_attributes_mask; /* Mask to show what's supported in stx_attributes */ /* 0x40 */ struct statx_timestamp stx_atime; /* Last access time */ struct statx_timestamp stx_btime; /* File creation time */ @@ -155,7 +155,7 @@ struct statx { #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ /* - * Attributes to be found in stx_attributes + * Attributes to be found in stx_attributes and masked in stx_attributes_mask. 
* * These give information about the features or the state of a file that might * be of use to ordinary userspace programs such as GUIs or ls rather than diff --git a/samples/statx/test-statx.c b/samples/statx/test-statx.c index 8571d766331d..d4d77b09412c 100644 --- a/samples/statx/test-statx.c +++ b/samples/statx/test-statx.c @@ -141,8 +141,8 @@ static void dump_statx(struct statx *stx) if (stx->stx_mask & STATX_BTIME) print_time(" Birth: ", &stx->stx_btime); - if (stx->stx_attributes) { - unsigned char bits; + if (stx->stx_attributes_mask) { + unsigned char bits, mbits; int loop, byte; static char attr_representation[64 + 1] = @@ -160,14 +160,18 @@ static void dump_statx(struct statx *stx) printf("Attributes: %016llx (", stx->stx_attributes); for (byte = 64 - 8; byte >= 0; byte -= 8) { bits = stx->stx_attributes >> byte; + mbits = stx->stx_attributes_mask >> byte; for (loop = 7; loop >= 0; loop--) { int bit = byte + loop; - if (bits & 0x80) + if (!(mbits & 0x80)) + putchar('.'); /* Not supported */ + else if (bits & 0x80) putchar(attr_representation[63 - bit]); else - putchar('-'); + putchar('-'); /* Not set */ bits <<= 1; + mbits <<= 1; } if (byte) putchar(' '); -- cgit v1.2.3 From 2475a2b6c877a0c8d1ca42c3f2b30f8ce518ac0b Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Mon, 3 Apr 2017 19:51:42 +1000 Subject: drivers/of/base.c: Add of_property_read_u64_index There is of_property_read_u32_index but no u64 variant. This patch adds one similar to the u32 version for u64. Signed-off-by: Alistair Popple Acked-by: Rob Herring Signed-off-by: Michael Ellerman --- drivers/of/base.c | 31 +++++++++++++++++++++++++++++++ include/linux/of.h | 3 +++ 2 files changed, 34 insertions(+) (limited to 'include/linux') diff --git a/drivers/of/base.c b/drivers/of/base.c index d7c4629a3a2d..0ea16bd3c8f1 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -1212,6 +1212,37 @@ int of_property_read_u32_index(const struct device_node *np, } EXPORT_SYMBOL_GPL(of_property_read_u32_index); +/** + * of_property_read_u64_index - Find and read a u64 from a multi-value property. + * + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * @index: index of the u64 in the list of values + * @out_value: pointer to return value, modified only if no error. + * + * Search for a property in a device node and read nth 64-bit value from + * it. Returns 0 on success, -EINVAL if the property does not exist, + * -ENODATA if property does not have a value, and -EOVERFLOW if the + * property data isn't large enough. + * + * The out_value is modified only if a valid u64 value can be decoded. + */ +int of_property_read_u64_index(const struct device_node *np, + const char *propname, + u32 index, u64 *out_value) +{ + const u64 *val = of_find_property_value_of_size(np, propname, + ((index + 1) * sizeof(*out_value)), + 0, NULL); + + if (IS_ERR(val)) + return PTR_ERR(val); + + *out_value = be64_to_cpup(((__be64 *)val) + index); + return 0; +} +EXPORT_SYMBOL_GPL(of_property_read_u64_index); + /** * of_property_read_variable_u8_array - Find and read an array of u8 from a * property, with bounds on the minimum and maximum array size. 
diff --git a/include/linux/of.h b/include/linux/of.h index 21e6323de0f3..d08788daae5c 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -292,6 +292,9 @@ extern int of_property_count_elems_of_size(const struct device_node *np, extern int of_property_read_u32_index(const struct device_node *np, const char *propname, u32 index, u32 *out_value); +extern int of_property_read_u64_index(const struct device_node *np, + const char *propname, + u32 index, u64 *out_value); extern int of_property_read_variable_u8_array(const struct device_node *np, const char *propname, u8 *out_values, size_t sz_min, size_t sz_max); -- cgit v1.2.3 From 469ff8f7d46d75b36de68a0411a2ce80109ad00b Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Mon, 25 Apr 2016 11:30:39 -0700 Subject: KEYS: Use a typedef for restrict_link function pointers This pointer type needs to be returned from a lookup function, and without a typedef the syntax gets cumbersome. Signed-off-by: Mat Martineau --- Documentation/security/keys.txt | 5 +---- include/linux/key.h | 16 +++++++--------- security/keys/key.c | 8 ++------ security/keys/keyring.c | 4 +--- 4 files changed, 11 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt index 0e03baf271bd..4502237b12a7 100644 --- a/Documentation/security/keys.txt +++ b/Documentation/security/keys.txt @@ -1032,10 +1032,7 @@ payload contents" for more information. struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, const struct cred *cred, key_perm_t perm, - int (*restrict_link)(struct key *, - const struct key_type *, - unsigned long, - const union key_payload *), + key_restrict_link_func_t restrict_link, unsigned long flags, struct key *dest); diff --git a/include/linux/key.h b/include/linux/key.h index 9d9fac583dd3..3bb327043869 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -127,6 +127,10 @@ static inline bool is_key_possessed(const key_ref_t key_ref) return (unsigned long) key_ref & 1UL; } +typedef int (*key_restrict_link_func_t)(struct key *keyring, + const struct key_type *type, + const union key_payload *payload); + /*****************************************************************************/ /* * authentication token / access credential / keyring @@ -215,9 +219,7 @@ struct key { * overrides this, allowing the kernel to add extra keys without * restriction. 
*/ - int (*restrict_link)(struct key *keyring, - const struct key_type *type, - const union key_payload *payload); + key_restrict_link_func_t restrict_link; }; extern struct key *key_alloc(struct key_type *type, @@ -226,9 +228,7 @@ extern struct key *key_alloc(struct key_type *type, const struct cred *cred, key_perm_t perm, unsigned long flags, - int (*restrict_link)(struct key *, - const struct key_type *, - const union key_payload *)); + key_restrict_link_func_t restrict_link); #define KEY_ALLOC_IN_QUOTA 0x0000 /* add to quota, reject if would overrun */ @@ -304,9 +304,7 @@ extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid const struct cred *cred, key_perm_t perm, unsigned long flags, - int (*restrict_link)(struct key *, - const struct key_type *, - const union key_payload *), + key_restrict_link_func_t restrict_link, struct key *dest); extern int restrict_link_reject(struct key *keyring, diff --git a/security/keys/key.c b/security/keys/key.c index b4958b36fa27..08dfa13f6a85 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -225,9 +225,7 @@ serial_exists: struct key *key_alloc(struct key_type *type, const char *desc, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, - int (*restrict_link)(struct key *, - const struct key_type *, - const union key_payload *)) + key_restrict_link_func_t restrict_link) { struct key_user *user = NULL; struct key *key; @@ -806,9 +804,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, struct key *keyring, *key = NULL; key_ref_t key_ref; int ret; - int (*restrict_link)(struct key *, - const struct key_type *, - const union key_payload *) = NULL; + key_restrict_link_func_t restrict_link = NULL; /* look up the key type to see if it's one of the registered kernel * types */ diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 3d95f7d02ba1..1b29ac759bf7 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -492,9 +492,7 @@ static long keyring_read(const struct key *keyring, struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, - int (*restrict_link)(struct key *, - const struct key_type *, - const union key_payload *), + key_restrict_link_func_t restrict_link, struct key *dest) { struct key *keyring; -- cgit v1.2.3 From aaf66c883813f0078e3dafe7d20d1461321ac14f Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Tue, 30 Aug 2016 11:33:13 -0700 Subject: KEYS: Split role of the keyring pointer for keyring restrict functions The first argument to the restrict_link_func_t functions was a keyring pointer. These functions are called by the key subsystem with this argument set to the destination keyring, but restrict_link_by_signature expects a pointer to the relevant trusted keyring. Restrict functions may need something other than a single struct key pointer to allow or reject key linkage, so the data used to make that decision (such as the trust keyring) is moved to a new, fourth argument. The first argument is now always the destination keyring. 
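
A sketch of a restriction check written against the new arrangement (my_restrict is hypothetical; its shape matches the updated restrict_link_func_t in the hunks below):

	static int my_restrict(struct key *dest_keyring,
			       const struct key_type *type,
			       const union key_payload *payload,
			       struct key *trust_keyring)
	{
		/* Decision data (here the trust keyring) now arrives in the
		 * fourth argument; the first is always the destination. */
		return restrict_link_by_signature(dest_keyring, type,
						  payload, trust_keyring);
	}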
Signed-off-by: Mat Martineau --- Documentation/security/keys.txt | 8 ++++---- certs/system_keyring.c | 18 +++++++++++------- crypto/asymmetric_keys/restrict.c | 8 +++++--- include/crypto/public_key.h | 5 +++-- include/keys/system_keyring.h | 6 ++++-- include/linux/key.h | 8 +++++--- security/keys/key.c | 5 +++-- security/keys/keyring.c | 6 ++++-- 8 files changed, 39 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt index 4502237b12a7..bb575ab80207 100644 --- a/Documentation/security/keys.txt +++ b/Documentation/security/keys.txt @@ -1054,10 +1054,10 @@ payload contents" for more information. can be verified by a key the kernel already has. When called, the restriction function will be passed the keyring being - added to, the key flags value and the type and payload of the key being - added. Note that when a new key is being created, this is called between - payload preparsing and actual key creation. The function should return 0 - to allow the link or an error to reject it. + added to, the key type, the payload of the key being added, and data to be + used in the restriction check. Note that when a new key is being created, + this is called between payload preparsing and actual key creation. The + function should return 0 to allow the link or an error to reject it. A convenience function, restrict_link_reject, exists to always return -EPERM to in this case. diff --git a/certs/system_keyring.c b/certs/system_keyring.c index 50979d6dcecd..e39cce68dcfa 100644 --- a/certs/system_keyring.c +++ b/certs/system_keyring.c @@ -32,11 +32,13 @@ extern __initconst const unsigned long system_certificate_list_size; * Restrict the addition of keys into a keyring based on the key-to-be-added * being vouched for by a key in the built in system keyring. */ -int restrict_link_by_builtin_trusted(struct key *keyring, +int restrict_link_by_builtin_trusted(struct key *dest_keyring, const struct key_type *type, - const union key_payload *payload) + const union key_payload *payload, + struct key *restriction_key) { - return restrict_link_by_signature(builtin_trusted_keys, type, payload); + return restrict_link_by_signature(dest_keyring, type, payload, + builtin_trusted_keys); } #ifdef CONFIG_SECONDARY_TRUSTED_KEYRING @@ -49,20 +51,22 @@ int restrict_link_by_builtin_trusted(struct key *keyring, * keyrings. */ int restrict_link_by_builtin_and_secondary_trusted( - struct key *keyring, + struct key *dest_keyring, const struct key_type *type, - const union key_payload *payload) + const union key_payload *payload, + struct key *restrict_key) { /* If we have a secondary trusted keyring, then that contains a link * through to the builtin keyring and the search will follow that link. 
*/ if (type == &key_type_keyring && - keyring == secondary_trusted_keys && + dest_keyring == secondary_trusted_keys && payload == &builtin_trusted_keys->payload) /* Allow the builtin keyring to be added to the secondary */ return 0; - return restrict_link_by_signature(secondary_trusted_keys, type, payload); + return restrict_link_by_signature(dest_keyring, type, payload, + secondary_trusted_keys); } #endif diff --git a/crypto/asymmetric_keys/restrict.c b/crypto/asymmetric_keys/restrict.c index 19d1afb9890f..a3afbf783255 100644 --- a/crypto/asymmetric_keys/restrict.c +++ b/crypto/asymmetric_keys/restrict.c @@ -56,9 +56,10 @@ __setup("ca_keys=", ca_keys_setup); /** * restrict_link_by_signature - Restrict additions to a ring of public keys - * @trust_keyring: A ring of keys that can be used to vouch for the new cert. + * @dest_keyring: Keyring being linked to. * @type: The type of key being added. * @payload: The payload of the new key. + * @trust_keyring: A ring of keys that can be used to vouch for the new cert. * * Check the new certificate against the ones in the trust keyring. If one of * those is the signing key and validates the new certificate, then mark the @@ -69,9 +70,10 @@ __setup("ca_keys=", ca_keys_setup); * signature check fails or the key is blacklisted and some other error if * there is a matching certificate but the signature check cannot be performed. */ -int restrict_link_by_signature(struct key *trust_keyring, +int restrict_link_by_signature(struct key *dest_keyring, const struct key_type *type, - const union key_payload *payload) + const union key_payload *payload, + struct key *trust_keyring) { const struct public_key_signature *sig; struct key *key; diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h index 882ca0e1e7a5..ec0262fa08f8 100644 --- a/include/crypto/public_key.h +++ b/include/crypto/public_key.h @@ -50,9 +50,10 @@ struct key; struct key_type; union key_payload; -extern int restrict_link_by_signature(struct key *trust_keyring, +extern int restrict_link_by_signature(struct key *dest_keyring, const struct key_type *type, - const union key_payload *payload); + const union key_payload *payload, + struct key *trust_keyring); extern int verify_signature(const struct key *key, const struct public_key_signature *sig); diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h index 0d8762622ab9..359c2f936004 100644 --- a/include/keys/system_keyring.h +++ b/include/keys/system_keyring.h @@ -18,7 +18,8 @@ extern int restrict_link_by_builtin_trusted(struct key *keyring, const struct key_type *type, - const union key_payload *payload); + const union key_payload *payload, + struct key *restriction_key); #else #define restrict_link_by_builtin_trusted restrict_link_reject @@ -28,7 +29,8 @@ extern int restrict_link_by_builtin_trusted(struct key *keyring, extern int restrict_link_by_builtin_and_secondary_trusted( struct key *keyring, const struct key_type *type, - const union key_payload *payload); + const union key_payload *payload, + struct key *restriction_key); #else #define restrict_link_by_builtin_and_secondary_trusted restrict_link_by_builtin_trusted #endif diff --git a/include/linux/key.h b/include/linux/key.h index 3bb327043869..c59d1008c4fc 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -127,9 +127,10 @@ static inline bool is_key_possessed(const key_ref_t key_ref) return (unsigned long) key_ref & 1UL; } -typedef int (*key_restrict_link_func_t)(struct key *keyring, +typedef int (*key_restrict_link_func_t)(struct key 
*dest_keyring, const struct key_type *type, - const union key_payload *payload); + const union key_payload *payload, + struct key *restriction_key); /*****************************************************************************/ /* @@ -309,7 +310,8 @@ extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid extern int restrict_link_reject(struct key *keyring, const struct key_type *type, - const union key_payload *payload); + const union key_payload *payload, + struct key *restriction_key); extern int keyring_clear(struct key *keyring); diff --git a/security/keys/key.c b/security/keys/key.c index 08dfa13f6a85..27fc1bb40034 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -499,7 +499,7 @@ int key_instantiate_and_link(struct key *key, if (keyring) { if (keyring->restrict_link) { ret = keyring->restrict_link(keyring, key->type, - &prep.payload); + &prep.payload, NULL); if (ret < 0) goto error; } @@ -851,7 +851,8 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, index_key.desc_len = strlen(index_key.description); if (restrict_link) { - ret = restrict_link(keyring, index_key.type, &prep.payload); + ret = restrict_link(keyring, index_key.type, &prep.payload, + NULL); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_free_prep; diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 1b29ac759bf7..2ccc66ec35d7 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -517,6 +517,7 @@ EXPORT_SYMBOL(keyring_alloc); * @keyring: The keyring being added to. * @type: The type of key being added. * @payload: The payload of the key intended to be added. + * @data: Additional data for evaluating restriction. * * Reject the addition of any links to a keyring. It can be overridden by * passing KEY_ALLOC_BYPASS_RESTRICTION to key_instantiate_and_link() when @@ -527,7 +528,8 @@ EXPORT_SYMBOL(keyring_alloc); */ int restrict_link_reject(struct key *keyring, const struct key_type *type, - const union key_payload *payload) + const union key_payload *payload, + struct key *restriction_key) { return -EPERM; } @@ -1220,7 +1222,7 @@ static int __key_link_check_restriction(struct key *keyring, struct key *key) { if (!keyring->restrict_link) return 0; - return keyring->restrict_link(keyring, key->type, &key->payload); + return keyring->restrict_link(keyring, key->type, &key->payload, NULL); } /** -- cgit v1.2.3 From e9cc0f689a7c0c9be6fed6861b3a3f49ad0e7a52 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Mon, 27 Jun 2016 16:10:59 -0700 Subject: KEYS: Add a key restriction struct Key link restrictions require restriction-specific data as well as a restriction-specific function pointer. As a first step toward replacing the restrict_link pointer in struct key, define a more general key_restriction structure that captures the required function, key, and key type pointers. Key type modules should not be pinned on account of this key type pointer because the pointer will be cleared by the garbage collector if the key type is unregistered. 
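
A sketch of how a lookup function might fill one in (illustrative; the allocation and the choice of check function are assumptions, not taken from this patch):

	struct key_restriction *restriction;

	restriction = kzalloc(sizeof(*restriction), GFP_KERNEL);
	if (!restriction)
		return ERR_PTR(-ENOMEM);
	restriction->check = restrict_link_by_signature;
	restriction->key = trust_keyring;		/* restriction-specific data */
	restriction->keytype = &key_type_asymmetric;	/* cleared by the gc if the
							 * key type is unregistered */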
Signed-off-by: Mat Martineau --- include/linux/key.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/key.h b/include/linux/key.h index c59d1008c4fc..a06649f3223d 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -132,6 +132,12 @@ typedef int (*key_restrict_link_func_t)(struct key *dest_keyring, const union key_payload *payload, struct key *restriction_key); +struct key_restriction { + key_restrict_link_func_t check; + struct key *key; + struct key_type *keytype; +}; + /*****************************************************************************/ /* * authentication token / access credential / keyring -- cgit v1.2.3 From 3071f13d75f627ed8648535815a0506d50cbc6ed Mon Sep 17 00:00:00 2001 From: Agustin Vega-Frias Date: Fri, 31 Mar 2017 14:13:43 -0400 Subject: perf: qcom: Add L3 cache PMU driver This adds a new dynamic PMU to the Perf Events framework to program and control the L3 cache PMUs in some Qualcomm Technologies SOCs. The driver supports a distributed cache architecture where the overall cache for a socket is comprised of multiple slices each with its own PMU. Access to each individual PMU is provided even though all CPUs share all the slices. User space needs to aggregate the individual counts to provide a global picture. The driver exports formatting and event information to sysfs so it can be used by the perf user space tools with the syntaxes: perf stat -a -e l3cache_0_0/read-miss/ perf stat -a -e l3cache_0_0/event=0x21/ Acked-by: Mark Rutland Signed-off-by: Agustin Vega-Frias [will: fixed sparse issues] Signed-off-by: Will Deacon --- Documentation/perf/qcom_l3_pmu.txt | 25 ++ drivers/perf/Kconfig | 10 + drivers/perf/Makefile | 1 + drivers/perf/qcom_l3_pmu.c | 849 +++++++++++++++++++++++++++++++++++++ include/linux/cpuhotplug.h | 1 + 5 files changed, 886 insertions(+) create mode 100644 Documentation/perf/qcom_l3_pmu.txt create mode 100644 drivers/perf/qcom_l3_pmu.c (limited to 'include/linux') diff --git a/Documentation/perf/qcom_l3_pmu.txt b/Documentation/perf/qcom_l3_pmu.txt new file mode 100644 index 000000000000..96b3a9444a0d --- /dev/null +++ b/Documentation/perf/qcom_l3_pmu.txt @@ -0,0 +1,25 @@ +Qualcomm Datacenter Technologies L3 Cache Performance Monitoring Unit (PMU) +=========================================================================== + +This driver supports the L3 cache PMUs found in Qualcomm Datacenter Technologies +Centriq SoCs. The L3 cache on these SOCs is composed of multiple slices, shared +by all cores within a socket. Each slice is exposed as a separate uncore perf +PMU with device name l3cache_<socket>_<instance>. User space is responsible +for aggregating across slices. + +The driver provides a description of its available events and configuration +options in sysfs, see /sys/devices/l3cache*. Given that these are uncore PMUs +the driver also exposes a "cpumask" sysfs attribute which contains a mask +consisting of one CPU per socket which will be used to handle all the PMU +events on that socket. + +The hardware implements 32bit event counters and has a flat 8bit event space +exposed via the "event" format attribute. In addition to the 32bit physical +counters the driver supports virtual 64bit hardware counters by using hardware +counter chaining. This feature is exposed via the "lc" (long counter) format +flag. E.g.: + + perf stat -e l3cache_0_0/read-miss,lc/ + +Given that these are uncore PMUs the driver does not support sampling, therefore +"perf record" will not work.
Per-task perf sessions are not supported. diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 93651907874f..c436e0d303e7 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -21,6 +21,16 @@ config QCOM_L2_PMU Adds the L2 cache PMU into the perf events subsystem for monitoring L2 cache events. +config QCOM_L3_PMU + bool "Qualcomm Technologies L3-cache PMU" + depends on ARCH_QCOM && ARM64 && PERF_EVENTS && ACPI + select QCOM_IRQ_COMBINER + help + Provides support for the L3 cache performance monitor unit (PMU) + in Qualcomm Technologies processors. + Adds the L3 cache PMU into the perf events subsystem for + monitoring L3 cache events. + config XGENE_PMU depends on PERF_EVENTS && ARCH_XGENE bool "APM X-Gene SoC PMU" diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index ef24833c94a8..ef0c6b210345 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_ARM_PMU) += arm_pmu.o obj-$(CONFIG_QCOM_L2_PMU) += qcom_l2_pmu.o +obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o diff --git a/drivers/perf/qcom_l3_pmu.c b/drivers/perf/qcom_l3_pmu.c new file mode 100644 index 000000000000..7f6b62b29e9d --- /dev/null +++ b/drivers/perf/qcom_l3_pmu.c @@ -0,0 +1,849 @@ +/* + * Driver for the L3 cache PMUs in Qualcomm Technologies chips. + * + * The driver supports a distributed cache architecture where the overall + * cache for a socket is comprised of multiple slices each with its own PMU. + * Access to each individual PMU is provided even though all CPUs share all + * the slices. User space needs to aggregate to individual counts to provide + * a global picture. + * + * See Documentation/perf/qcom_l3_pmu.txt for more details. + * + * Copyright (c) 2015-2017, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * General constants + */ + +/* Number of counters on each PMU */ +#define L3_NUM_COUNTERS 8 +/* Mask for the event type field within perf_event_attr.config and EVTYPE reg */ +#define L3_EVTYPE_MASK 0xFF +/* + * Bit position of the 'long counter' flag within perf_event_attr.config. + * Reserve some space between the event type and this flag to allow expansion + * in the event type field. 
+ */ +#define L3_EVENT_LC_BIT 32 + +/* + * Register offsets + */ + +/* Perfmon registers */ +#define L3_HML3_PM_CR 0x000 +#define L3_HML3_PM_EVCNTR(__cntr) (0x420 + ((__cntr) & 0x7) * 8) +#define L3_HML3_PM_CNTCTL(__cntr) (0x120 + ((__cntr) & 0x7) * 8) +#define L3_HML3_PM_EVTYPE(__cntr) (0x220 + ((__cntr) & 0x7) * 8) +#define L3_HML3_PM_FILTRA 0x300 +#define L3_HML3_PM_FILTRB 0x308 +#define L3_HML3_PM_FILTRC 0x310 +#define L3_HML3_PM_FILTRAM 0x304 +#define L3_HML3_PM_FILTRBM 0x30C +#define L3_HML3_PM_FILTRCM 0x314 + +/* Basic counter registers */ +#define L3_M_BC_CR 0x500 +#define L3_M_BC_SATROLL_CR 0x504 +#define L3_M_BC_CNTENSET 0x508 +#define L3_M_BC_CNTENCLR 0x50C +#define L3_M_BC_INTENSET 0x510 +#define L3_M_BC_INTENCLR 0x514 +#define L3_M_BC_GANG 0x718 +#define L3_M_BC_OVSR 0x740 +#define L3_M_BC_IRQCTL 0x96C + +/* + * Bit field definitions + */ + +/* L3_HML3_PM_CR */ +#define PM_CR_RESET (0) + +/* L3_HML3_PM_XCNTCTL/L3_HML3_PM_CNTCTLx */ +#define PMCNT_RESET (0) + +/* L3_HML3_PM_EVTYPEx */ +#define EVSEL(__val) ((__val) & L3_EVTYPE_MASK) + +/* Reset value for all the filter registers */ +#define PM_FLTR_RESET (0) + +/* L3_M_BC_CR */ +#define BC_RESET (1UL << 1) +#define BC_ENABLE (1UL << 0) + +/* L3_M_BC_SATROLL_CR */ +#define BC_SATROLL_CR_RESET (0) + +/* L3_M_BC_CNTENSET */ +#define PMCNTENSET(__cntr) (1UL << ((__cntr) & 0x7)) + +/* L3_M_BC_CNTENCLR */ +#define PMCNTENCLR(__cntr) (1UL << ((__cntr) & 0x7)) +#define BC_CNTENCLR_RESET (0xFF) + +/* L3_M_BC_INTENSET */ +#define PMINTENSET(__cntr) (1UL << ((__cntr) & 0x7)) + +/* L3_M_BC_INTENCLR */ +#define PMINTENCLR(__cntr) (1UL << ((__cntr) & 0x7)) +#define BC_INTENCLR_RESET (0xFF) + +/* L3_M_BC_GANG */ +#define GANG_EN(__cntr) (1UL << ((__cntr) & 0x7)) +#define BC_GANG_RESET (0) + +/* L3_M_BC_OVSR */ +#define PMOVSRCLR(__cntr) (1UL << ((__cntr) & 0x7)) +#define PMOVSRCLR_RESET (0xFF) + +/* L3_M_BC_IRQCTL */ +#define PMIRQONMSBEN(__cntr) (1UL << ((__cntr) & 0x7)) +#define BC_IRQCTL_RESET (0x0) + +/* + * Events + */ + +#define L3_EVENT_CYCLES 0x01 +#define L3_EVENT_READ_HIT 0x20 +#define L3_EVENT_READ_MISS 0x21 +#define L3_EVENT_READ_HIT_D 0x22 +#define L3_EVENT_READ_MISS_D 0x23 +#define L3_EVENT_WRITE_HIT 0x24 +#define L3_EVENT_WRITE_MISS 0x25 + +/* + * Decoding of settings from perf_event_attr + * + * The config format for perf events is: + * - config: bits 0-7: event type + * bit 32: HW counter size requested, 0: 32 bits, 1: 64 bits + */ + +static inline u32 get_event_type(struct perf_event *event) +{ + return (event->attr.config) & L3_EVTYPE_MASK; +} + +static inline bool event_uses_long_counter(struct perf_event *event) +{ + return !!(event->attr.config & BIT_ULL(L3_EVENT_LC_BIT)); +} + +static inline int event_num_counters(struct perf_event *event) +{ + return event_uses_long_counter(event) ? 2 : 1; +} + +/* + * Main PMU, inherits from the core perf PMU type + */ +struct l3cache_pmu { + struct pmu pmu; + struct hlist_node node; + void __iomem *regs; + struct perf_event *events[L3_NUM_COUNTERS]; + unsigned long used_mask[BITS_TO_LONGS(L3_NUM_COUNTERS)]; + cpumask_t cpumask; +}; + +#define to_l3cache_pmu(p) (container_of(p, struct l3cache_pmu, pmu)) + +/* + * Type used to group hardware counter operations + * + * Used to implement two types of hardware counters, standard (32bits) and + * long (64bits). The hardware supports counter chaining which we use to + * implement long counters. This support is exposed via the 'lc' flag field + * in perf_event_attr.config. 
+ */ +struct l3cache_event_ops { + /* Called to start event monitoring */ + void (*start)(struct perf_event *event); + /* Called to stop event monitoring */ + void (*stop)(struct perf_event *event, int flags); + /* Called to update the perf_event */ + void (*update)(struct perf_event *event); +}; + +/* + * Implementation of long counter operations + * + * 64bit counters are implemented by chaining two of the 32bit physical + * counters. The PMU only supports chaining of adjacent even/odd pairs + * and for simplicity the driver always configures the odd counter to + * count the overflows of the lower-numbered even counter. Note that since + * the resulting hardware counter is 64bits no IRQs are required to maintain + * the software counter which is also 64bits. + */ + +static void qcom_l3_cache__64bit_counter_start(struct perf_event *event) +{ + struct l3cache_pmu *l3pmu = to_l3cache_pmu(event->pmu); + int idx = event->hw.idx; + u32 evsel = get_event_type(event); + u32 gang; + + /* Set the odd counter to count the overflows of the even counter */ + gang = readl_relaxed(l3pmu->regs + L3_M_BC_GANG); + gang |= GANG_EN(idx + 1); + writel_relaxed(gang, l3pmu->regs + L3_M_BC_GANG); + + /* Initialize the hardware counters and reset prev_count*/ + local64_set(&event->hw.prev_count, 0); + writel_relaxed(0, l3pmu->regs + L3_HML3_PM_EVCNTR(idx + 1)); + writel_relaxed(0, l3pmu->regs + L3_HML3_PM_EVCNTR(idx)); + + /* + * Set the event types, the upper half must use zero and the lower + * half the actual event type + */ + writel_relaxed(EVSEL(0), l3pmu->regs + L3_HML3_PM_EVTYPE(idx + 1)); + writel_relaxed(EVSEL(evsel), l3pmu->regs + L3_HML3_PM_EVTYPE(idx)); + + /* Finally, enable the counters */ + writel_relaxed(PMCNT_RESET, l3pmu->regs + L3_HML3_PM_CNTCTL(idx + 1)); + writel_relaxed(PMCNTENSET(idx + 1), l3pmu->regs + L3_M_BC_CNTENSET); + writel_relaxed(PMCNT_RESET, l3pmu->regs + L3_HML3_PM_CNTCTL(idx)); + writel_relaxed(PMCNTENSET(idx), l3pmu->regs + L3_M_BC_CNTENSET); +} + +static void qcom_l3_cache__64bit_counter_stop(struct perf_event *event, + int flags) +{ + struct l3cache_pmu *l3pmu = to_l3cache_pmu(event->pmu); + int idx = event->hw.idx; + u32 gang = readl_relaxed(l3pmu->regs + L3_M_BC_GANG); + + /* Disable the counters */ + writel_relaxed(PMCNTENCLR(idx), l3pmu->regs + L3_M_BC_CNTENCLR); + writel_relaxed(PMCNTENCLR(idx + 1), l3pmu->regs + L3_M_BC_CNTENCLR); + + /* Disable chaining */ + writel_relaxed(gang & ~GANG_EN(idx + 1), l3pmu->regs + L3_M_BC_GANG); +} + +static void qcom_l3_cache__64bit_counter_update(struct perf_event *event) +{ + struct l3cache_pmu *l3pmu = to_l3cache_pmu(event->pmu); + int idx = event->hw.idx; + u32 hi, lo; + u64 prev, new; + + do { + prev = local64_read(&event->hw.prev_count); + do { + hi = readl_relaxed(l3pmu->regs + L3_HML3_PM_EVCNTR(idx + 1)); + lo = readl_relaxed(l3pmu->regs + L3_HML3_PM_EVCNTR(idx)); + } while (hi != readl_relaxed(l3pmu->regs + L3_HML3_PM_EVCNTR(idx + 1))); + new = ((u64)hi << 32) | lo; + } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); + + local64_add(new - prev, &event->count); +} + +static const struct l3cache_event_ops event_ops_long = { + .start = qcom_l3_cache__64bit_counter_start, + .stop = qcom_l3_cache__64bit_counter_stop, + .update = qcom_l3_cache__64bit_counter_update, +}; + +/* + * Implementation of standard counter operations + * + * 32bit counters use a single physical counter and a hardware feature that + * asserts the overflow IRQ on the toggling of the most significant bit in + * the counter. 
This feature allows the counters to be left free-running + * without needing the usual reprogramming required to properly handle races + * during concurrent calls to update. + */ + +static void qcom_l3_cache__32bit_counter_start(struct perf_event *event) +{ + struct l3cache_pmu *l3pmu = to_l3cache_pmu(event->pmu); + int idx = event->hw.idx; + u32 evsel = get_event_type(event); + u32 irqctl = readl_relaxed(l3pmu->regs + L3_M_BC_IRQCTL); + + /* Set the counter to assert the overflow IRQ on MSB toggling */ + writel_relaxed(irqctl | PMIRQONMSBEN(idx), l3pmu->regs + L3_M_BC_IRQCTL); + + /* Initialize the hardware counter and reset prev_count*/ + local64_set(&event->hw.prev_count, 0); + writel_relaxed(0, l3pmu->regs + L3_HML3_PM_EVCNTR(idx)); + + /* Set the event type */ + writel_relaxed(EVSEL(evsel), l3pmu->regs + L3_HML3_PM_EVTYPE(idx)); + + /* Enable interrupt generation by this counter */ + writel_relaxed(PMINTENSET(idx), l3pmu->regs + L3_M_BC_INTENSET); + + /* Finally, enable the counter */ + writel_relaxed(PMCNT_RESET, l3pmu->regs + L3_HML3_PM_CNTCTL(idx)); + writel_relaxed(PMCNTENSET(idx), l3pmu->regs + L3_M_BC_CNTENSET); +} + +static void qcom_l3_cache__32bit_counter_stop(struct perf_event *event, + int flags) +{ + struct l3cache_pmu *l3pmu = to_l3cache_pmu(event->pmu); + int idx = event->hw.idx; + u32 irqctl = readl_relaxed(l3pmu->regs + L3_M_BC_IRQCTL); + + /* Disable the counter */ + writel_relaxed(PMCNTENCLR(idx), l3pmu->regs + L3_M_BC_CNTENCLR); + + /* Disable interrupt generation by this counter */ + writel_relaxed(PMINTENCLR(idx), l3pmu->regs + L3_M_BC_INTENCLR); + + /* Set the counter to not assert the overflow IRQ on MSB toggling */ + writel_relaxed(irqctl & ~PMIRQONMSBEN(idx), l3pmu->regs + L3_M_BC_IRQCTL); +} + +static void qcom_l3_cache__32bit_counter_update(struct perf_event *event) +{ + struct l3cache_pmu *l3pmu = to_l3cache_pmu(event->pmu); + int idx = event->hw.idx; + u32 prev, new; + + do { + prev = local64_read(&event->hw.prev_count); + new = readl_relaxed(l3pmu->regs + L3_HML3_PM_EVCNTR(idx)); + } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); + + local64_add(new - prev, &event->count); +} + +static const struct l3cache_event_ops event_ops_std = { + .start = qcom_l3_cache__32bit_counter_start, + .stop = qcom_l3_cache__32bit_counter_stop, + .update = qcom_l3_cache__32bit_counter_update, +}; + +/* Retrieve the appropriate operations for the given event */ +static +const struct l3cache_event_ops *l3cache_event_get_ops(struct perf_event *event) +{ + if (event_uses_long_counter(event)) + return &event_ops_long; + else + return &event_ops_std; +} + +/* + * Top level PMU functions. 
+ */ + +static inline void qcom_l3_cache__init(struct l3cache_pmu *l3pmu) +{ + int i; + + writel_relaxed(BC_RESET, l3pmu->regs + L3_M_BC_CR); + + /* + * Use writel for the first programming command to ensure the basic + * counter unit is stopped before proceeding + */ + writel(BC_SATROLL_CR_RESET, l3pmu->regs + L3_M_BC_SATROLL_CR); + + writel_relaxed(BC_CNTENCLR_RESET, l3pmu->regs + L3_M_BC_CNTENCLR); + writel_relaxed(BC_INTENCLR_RESET, l3pmu->regs + L3_M_BC_INTENCLR); + writel_relaxed(PMOVSRCLR_RESET, l3pmu->regs + L3_M_BC_OVSR); + writel_relaxed(BC_GANG_RESET, l3pmu->regs + L3_M_BC_GANG); + writel_relaxed(BC_IRQCTL_RESET, l3pmu->regs + L3_M_BC_IRQCTL); + writel_relaxed(PM_CR_RESET, l3pmu->regs + L3_HML3_PM_CR); + + for (i = 0; i < L3_NUM_COUNTERS; ++i) { + writel_relaxed(PMCNT_RESET, l3pmu->regs + L3_HML3_PM_CNTCTL(i)); + writel_relaxed(EVSEL(0), l3pmu->regs + L3_HML3_PM_EVTYPE(i)); + } + + writel_relaxed(PM_FLTR_RESET, l3pmu->regs + L3_HML3_PM_FILTRA); + writel_relaxed(PM_FLTR_RESET, l3pmu->regs + L3_HML3_PM_FILTRAM); + writel_relaxed(PM_FLTR_RESET, l3pmu->regs + L3_HML3_PM_FILTRB); + writel_relaxed(PM_FLTR_RESET, l3pmu->regs + L3_HML3_PM_FILTRBM); + writel_relaxed(PM_FLTR_RESET, l3pmu->regs + L3_HML3_PM_FILTRC); + writel_relaxed(PM_FLTR_RESET, l3pmu->regs + L3_HML3_PM_FILTRCM); + + /* + * Use writel here to ensure all programming commands are done + * before proceeding + */ + writel(BC_ENABLE, l3pmu->regs + L3_M_BC_CR); +} + +static irqreturn_t qcom_l3_cache__handle_irq(int irq_num, void *data) +{ + struct l3cache_pmu *l3pmu = data; + /* Read the overflow status register */ + long status = readl_relaxed(l3pmu->regs + L3_M_BC_OVSR); + int idx; + + if (status == 0) + return IRQ_NONE; + + /* Clear the bits we read on the overflow status register */ + writel_relaxed(status, l3pmu->regs + L3_M_BC_OVSR); + + for_each_set_bit(idx, &status, L3_NUM_COUNTERS) { + struct perf_event *event; + const struct l3cache_event_ops *ops; + + event = l3pmu->events[idx]; + if (!event) + continue; + + /* + * Since the IRQ is not enabled for events using long counters + * we should never see one of those here, however, be consistent + * and use the ops indirections like in the other operations. + */ + + ops = l3cache_event_get_ops(event); + ops->update(event); + } + + return IRQ_HANDLED; +} + +/* + * Implementation of abstract pmu functionality required by + * the core perf events code. + */ + +static void qcom_l3_cache__pmu_enable(struct pmu *pmu) +{ + struct l3cache_pmu *l3pmu = to_l3cache_pmu(pmu); + + /* Ensure the other programming commands are observed before enabling */ + wmb(); + + writel_relaxed(BC_ENABLE, l3pmu->regs + L3_M_BC_CR); +} + +static void qcom_l3_cache__pmu_disable(struct pmu *pmu) +{ + struct l3cache_pmu *l3pmu = to_l3cache_pmu(pmu); + + writel_relaxed(0, l3pmu->regs + L3_M_BC_CR); + + /* Ensure the basic counter unit is stopped before proceeding */ + wmb(); +} + +/* + * We must NOT create groups containing events from multiple hardware PMUs, + * although mixing different software and hardware PMUs is allowed. 
+ */ +static bool qcom_l3_cache__validate_event_group(struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + struct perf_event *sibling; + int counters = 0; + + if (leader->pmu != event->pmu && !is_software_event(leader)) + return false; + + counters = event_num_counters(event); + counters += event_num_counters(leader); + + list_for_each_entry(sibling, &leader->sibling_list, group_entry) { + if (is_software_event(sibling)) + continue; + if (sibling->pmu != event->pmu) + return false; + counters += event_num_counters(sibling); + } + + /* + * If the group requires more counters than the HW has, it + * cannot ever be scheduled. + */ + return counters <= L3_NUM_COUNTERS; +} + +static int qcom_l3_cache__event_init(struct perf_event *event) +{ + struct l3cache_pmu *l3pmu = to_l3cache_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + + /* + * Is the event for this PMU? + */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* + * There are no per-counter mode filters in the PMU. + */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_hv || event->attr.exclude_idle) + return -EINVAL; + + /* + * Sampling not supported since these events are not core-attributable. + */ + if (hwc->sample_period) + return -EINVAL; + + /* + * Task mode not available, we run the counters as socket counters, + * not attributable to any CPU and therefore cannot attribute per-task. + */ + if (event->cpu < 0) + return -EINVAL; + + /* Validate the group */ + if (!qcom_l3_cache__validate_event_group(event)) + return -EINVAL; + + hwc->idx = -1; + + /* + * Many perf core operations (eg. events rotation) operate on a + * single CPU context. This is obvious for CPU PMUs, where one + * expects the same sets of events being observed on all CPUs, + * but can lead to issues for off-core PMUs, like this one, where + * each event could be theoretically assigned to a different CPU. + * To mitigate this, we enforce CPU assignment to one designated + * processor (the one described in the "cpumask" attribute exported + * by the PMU device). perf user space tools honor this and avoid + * opening more than one copy of the events. + */ + event->cpu = cpumask_first(&l3pmu->cpumask); + + return 0; +} + +static void qcom_l3_cache__event_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + const struct l3cache_event_ops *ops = l3cache_event_get_ops(event); + + hwc->state = 0; + ops->start(event); +} + +static void qcom_l3_cache__event_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + const struct l3cache_event_ops *ops = l3cache_event_get_ops(event); + + if (hwc->state & PERF_HES_STOPPED) + return; + + ops->stop(event, flags); + if (flags & PERF_EF_UPDATE) + ops->update(event); + hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; +} + +static int qcom_l3_cache__event_add(struct perf_event *event, int flags) +{ + struct l3cache_pmu *l3pmu = to_l3cache_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int order = event_uses_long_counter(event) ? 1 : 0; + int idx; + + /* + * Try to allocate a counter. + */ + idx = bitmap_find_free_region(l3pmu->used_mask, L3_NUM_COUNTERS, order); + if (idx < 0) + /* The counters are all in use. */ + return -EAGAIN; + + hwc->idx = idx; + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + l3pmu->events[idx] = event; + + if (flags & PERF_EF_START) + qcom_l3_cache__event_start(event, 0); + + /* Propagate changes to the userspace mapping. 
*/ + perf_event_update_userpage(event); + + return 0; +} + +static void qcom_l3_cache__event_del(struct perf_event *event, int flags) +{ + struct l3cache_pmu *l3pmu = to_l3cache_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int order = event_uses_long_counter(event) ? 1 : 0; + + /* Stop and clean up */ + qcom_l3_cache__event_stop(event, flags | PERF_EF_UPDATE); + l3pmu->events[hwc->idx] = NULL; + bitmap_release_region(l3pmu->used_mask, hwc->idx, order); + + /* Propagate changes to the userspace mapping. */ + perf_event_update_userpage(event); +} + +static void qcom_l3_cache__event_read(struct perf_event *event) +{ + const struct l3cache_event_ops *ops = l3cache_event_get_ops(event); + + ops->update(event); +} + +/* + * Add sysfs attributes + * + * We export: + * - formats, used by perf user space and other tools to configure events + * - events, used by perf user space and other tools to create events + * symbolically, e.g.: + * perf stat -a -e l3cache_0_0/event=read-miss/ ls + * perf stat -a -e l3cache_0_0/event=0x21/ ls + * - cpumask, used by perf user space and other tools to know on which CPUs + * to open the events + */ + +/* formats */ + +static ssize_t l3cache_pmu_format_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_ext_attribute *eattr; + + eattr = container_of(attr, struct dev_ext_attribute, attr); + return sprintf(buf, "%s\n", (char *) eattr->var); +} + +#define L3CACHE_PMU_FORMAT_ATTR(_name, _config) \ + (&((struct dev_ext_attribute[]) { \ + { .attr = __ATTR(_name, 0444, l3cache_pmu_format_show, NULL), \ + .var = (void *) _config, } \ + })[0].attr.attr) + +static struct attribute *qcom_l3_cache_pmu_formats[] = { + L3CACHE_PMU_FORMAT_ATTR(event, "config:0-7"), + L3CACHE_PMU_FORMAT_ATTR(lc, "config:" __stringify(L3_EVENT_LC_BIT)), + NULL, +}; + +static struct attribute_group qcom_l3_cache_pmu_format_group = { + .name = "format", + .attrs = qcom_l3_cache_pmu_formats, +}; + +/* events */ + +static ssize_t l3cache_pmu_event_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct perf_pmu_events_attr *pmu_attr; + + pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); + return sprintf(page, "event=0x%02llx\n", pmu_attr->id); +} + +#define L3CACHE_EVENT_ATTR(_name, _id) \ + (&((struct perf_pmu_events_attr[]) { \ + { .attr = __ATTR(_name, 0444, l3cache_pmu_event_show, NULL), \ + .id = _id, } \ + })[0].attr.attr) + +static struct attribute *qcom_l3_cache_pmu_events[] = { + L3CACHE_EVENT_ATTR(cycles, L3_EVENT_CYCLES), + L3CACHE_EVENT_ATTR(read-hit, L3_EVENT_READ_HIT), + L3CACHE_EVENT_ATTR(read-miss, L3_EVENT_READ_MISS), + L3CACHE_EVENT_ATTR(read-hit-d-side, L3_EVENT_READ_HIT_D), + L3CACHE_EVENT_ATTR(read-miss-d-side, L3_EVENT_READ_MISS_D), + L3CACHE_EVENT_ATTR(write-hit, L3_EVENT_WRITE_HIT), + L3CACHE_EVENT_ATTR(write-miss, L3_EVENT_WRITE_MISS), + NULL +}; + +static struct attribute_group qcom_l3_cache_pmu_events_group = { + .name = "events", + .attrs = qcom_l3_cache_pmu_events, +}; + +/* cpumask */ + +static ssize_t qcom_l3_cache_pmu_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct l3cache_pmu *l3pmu = to_l3cache_pmu(dev_get_drvdata(dev)); + + return cpumap_print_to_pagebuf(true, buf, &l3pmu->cpumask); +} + +static DEVICE_ATTR(cpumask, 0444, qcom_l3_cache_pmu_cpumask_show, NULL); + +static struct attribute *qcom_l3_cache_pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group qcom_l3_cache_pmu_cpumask_attr_group = { + 
.attrs = qcom_l3_cache_pmu_cpumask_attrs, +}; + +/* + * Per PMU device attribute groups + */ +static const struct attribute_group *qcom_l3_cache_pmu_attr_grps[] = { + &qcom_l3_cache_pmu_format_group, + &qcom_l3_cache_pmu_events_group, + &qcom_l3_cache_pmu_cpumask_attr_group, + NULL, +}; + +/* + * Probing functions and data. + */ + +static int qcom_l3_cache_pmu_online_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct l3cache_pmu *l3pmu = hlist_entry_safe(node, struct l3cache_pmu, node); + + /* If there is not a CPU/PMU association pick this CPU */ + if (cpumask_empty(&l3pmu->cpumask)) + cpumask_set_cpu(cpu, &l3pmu->cpumask); + + return 0; +} + +static int qcom_l3_cache_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct l3cache_pmu *l3pmu = hlist_entry_safe(node, struct l3cache_pmu, node); + unsigned int target; + + if (!cpumask_test_and_clear_cpu(cpu, &l3pmu->cpumask)) + return 0; + target = cpumask_any_but(cpu_online_mask, cpu); + if (target >= nr_cpu_ids) + return 0; + perf_pmu_migrate_context(&l3pmu->pmu, cpu, target); + cpumask_set_cpu(target, &l3pmu->cpumask); + return 0; +} + +static int qcom_l3_cache_pmu_probe(struct platform_device *pdev) +{ + struct l3cache_pmu *l3pmu; + struct acpi_device *acpi_dev; + struct resource *memrc; + int ret; + char *name; + + /* Initialize the PMU data structures */ + + acpi_dev = ACPI_COMPANION(&pdev->dev); + if (!acpi_dev) + return -ENODEV; + + l3pmu = devm_kzalloc(&pdev->dev, sizeof(*l3pmu), GFP_KERNEL); + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "l3cache_%s_%s", + acpi_dev->parent->pnp.unique_id, acpi_dev->pnp.unique_id); + if (!l3pmu || !name) + return -ENOMEM; + + l3pmu->pmu = (struct pmu) { + .task_ctx_nr = perf_invalid_context, + + .pmu_enable = qcom_l3_cache__pmu_enable, + .pmu_disable = qcom_l3_cache__pmu_disable, + .event_init = qcom_l3_cache__event_init, + .add = qcom_l3_cache__event_add, + .del = qcom_l3_cache__event_del, + .start = qcom_l3_cache__event_start, + .stop = qcom_l3_cache__event_stop, + .read = qcom_l3_cache__event_read, + + .attr_groups = qcom_l3_cache_pmu_attr_grps, + }; + + memrc = platform_get_resource(pdev, IORESOURCE_MEM, 0); + l3pmu->regs = devm_ioremap_resource(&pdev->dev, memrc); + if (IS_ERR(l3pmu->regs)) { + dev_err(&pdev->dev, "Can't map PMU @%pa\n", &memrc->start); + return PTR_ERR(l3pmu->regs); + } + + qcom_l3_cache__init(l3pmu); + + ret = platform_get_irq(pdev, 0); + if (ret <= 0) + return ret; + + ret = devm_request_irq(&pdev->dev, ret, qcom_l3_cache__handle_irq, 0, + name, l3pmu); + if (ret) { + dev_err(&pdev->dev, "Request for IRQ failed for slice @%pa\n", + &memrc->start); + return ret; + } + + /* Add this instance to the list used by the offline callback */ + ret = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE, &l3pmu->node); + if (ret) { + dev_err(&pdev->dev, "Error %d registering hotplug", ret); + return ret; + } + + ret = perf_pmu_register(&l3pmu->pmu, name, -1); + if (ret < 0) { + dev_err(&pdev->dev, "Failed to register L3 cache PMU (%d)\n", ret); + return ret; + } + + dev_info(&pdev->dev, "Registered %s, type: %d\n", name, l3pmu->pmu.type); + + return 0; +} + +static const struct acpi_device_id qcom_l3_cache_pmu_acpi_match[] = { + { "QCOM8081", }, + { } +}; +MODULE_DEVICE_TABLE(acpi, qcom_l3_cache_pmu_acpi_match); + +static struct platform_driver qcom_l3_cache_pmu_driver = { + .driver = { + .name = "qcom-l3cache-pmu", + .acpi_match_table = ACPI_PTR(qcom_l3_cache_pmu_acpi_match), + }, + .probe = qcom_l3_cache_pmu_probe, +}; + +static int __init 
register_qcom_l3_cache_pmu_driver(void) +{ + int ret; + + /* Install a hook to update the reader CPU in case it goes offline */ + ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE, + "perf/qcom/l3cache:online", + qcom_l3_cache_pmu_online_cpu, + qcom_l3_cache_pmu_offline_cpu); + if (ret) + return ret; + + return platform_driver_register(&qcom_l3_cache_pmu_driver); +} +device_initcall(register_qcom_l3_cache_pmu_driver); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 62d240e962f0..cfcfab37d9c4 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -137,6 +137,7 @@ enum cpuhp_state { CPUHP_AP_PERF_ARM_CCN_ONLINE, CPUHP_AP_PERF_ARM_L2X0_ONLINE, CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE, + CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE, CPUHP_AP_WORKQUEUE_ONLINE, CPUHP_AP_RCUTREE_ONLINE, CPUHP_AP_ONLINE_DYN, -- cgit v1.2.3 From b80f0f6c9ed3958ff4002b6135f43a1ef312a610 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 3 Apr 2017 12:57:35 -0400 Subject: ftrace: Have init/main.c call ftrace directly to free init memory Relying on free_reserved_area() to call ftrace to free init memory proved to not be sufficient. The issue is that on x86, when debug_pagealloc is enabled, the init memory is not freed, but simply set as not present. Since ftrace was uninformed of this, starting function tracing still tries to update pages that are not present according to the page tables, causing ftrace to bug, as well as killing the kernel itself. Instead of relying on free_reserved_area(), have init/main.c call ftrace directly just before it frees the init memory. Then it needs to use __init_begin and __init_end to know where the init memory location is. Looking at all archs (and testing what I can), it appears that this should work for each of them. 
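As a rough sketch of the idea (not the exact kernel code; the record walk and locking are simplified, and ftrace_free_rec() is a hypothetical helper standing in for the real bookkeeping), the new entry point boils down to discarding every ftrace record whose address falls inside the init section:

/* Sketch: forget all mcount/fentry call sites located in init memory. */
void __init ftrace_free_init_mem(void)
{
	unsigned long start = (unsigned long)(&__init_begin);
	unsigned long end = (unsigned long)(&__init_end);
	struct ftrace_page *pg;
	struct dyn_ftrace *rec;

	mutex_lock(&ftrace_lock);
	do_for_each_ftrace_rec(pg, rec) {
		if (rec->ip >= start && rec->ip < end)
			ftrace_free_rec(rec);	/* hypothetical helper */
	} while_for_each_ftrace_rec();
	mutex_unlock(&ftrace_lock);
}

Because kernel_init() now calls this before free_initmem(), ftrace has dropped its references by the time the pages are unmapped (or merely marked not-present under debug_pagealloc).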
Reported-by: kernel test robot Reported-by: Fengguang Wu Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 +++--- init/main.c | 1 + kernel/trace/ftrace.c | 7 ++++--- mm/page_alloc.c | 3 --- 4 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0276a2c487e6..ef7123219f14 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -147,9 +147,9 @@ struct ftrace_ops_hash { struct mutex regex_lock; }; -void ftrace_free_mem(void *start, void *end); +void ftrace_free_init_mem(void); #else -static inline void ftrace_free_mem(void *start, void *end) { } +static inline void ftrace_free_init_mem(void) { } #endif /* @@ -266,7 +266,7 @@ static inline int ftrace_nr_registered_ops(void) } static inline void clear_ftrace_function(void) { } static inline void ftrace_kill(void) { } -static inline void ftrace_free_mem(void *start, void *end) { } +static inline void ftrace_free_init_mem(void) { } #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_STACK_TRACER diff --git a/init/main.c b/init/main.c index c0137b916aa1..0e8849f74561 100644 --- a/init/main.c +++ b/init/main.c @@ -962,6 +962,7 @@ static int __ref kernel_init(void *unused) kernel_init_freeable(); /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); + ftrace_free_init_mem(); free_initmem(); mark_readonly(); system_state = SYSTEM_RUNNING; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index aff7a2c08387..8efd9fe7aec0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -36,6 +36,7 @@ #include +#include #include #include "trace_output.h" @@ -5279,10 +5280,10 @@ void ftrace_module_init(struct module *mod) } #endif /* CONFIG_MODULES */ -void ftrace_free_mem(void *start_ptr, void *end_ptr) +void __init ftrace_free_init_mem(void) { - unsigned long start = (unsigned long)start_ptr; - unsigned long end = (unsigned long)end_ptr; + unsigned long start = (unsigned long)(&__init_begin); + unsigned long end = (unsigned long)(&__init_end); struct ftrace_page **last_pg = &ftrace_pages_start; struct ftrace_page *pg; struct dyn_ftrace *rec; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eee82bfb7cd8..178bf9c2a2cb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6606,9 +6606,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) void *pos; unsigned long pages = 0; - /* This may be .init text, inform ftrace to remove it */ - ftrace_free_mem(start, end); - start = (void *)PAGE_ALIGN((unsigned long)start); end = (void *)((unsigned long)end & PAGE_MASK); for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { -- cgit v1.2.3 From 877c57d0d0cac2c8fc661f708d8ee3fa7aa8d28b Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Fri, 24 Mar 2017 11:45:49 +0200 Subject: tpm_crb: request and relinquish locality 0 This commit adds support for requesting and relinquishing locality 0 in tpm_crb for the duration of command transmission. In order to achieve this, two new callbacks are added to struct tpm_class_ops: - request_locality - relinquish_locality With the CRB interface, you first set either the requestAccess or the relinquish bit in the TPM_LOC_CTRL_x register and then wait for the locAssigned and tpmRegValidSts bits to be set in the TPM_LOC_STATE_x register. The reason why we are doing this is to make sure that the driver will work properly with Intel TXT, which uses locality 2. There's no explicit guarantee that it would relinquish this locality.
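As a usage sketch of the new hooks (a hypothetical foo_tpm driver, not part of this patch; the register names and layout are placeholders), a driver opts in by filling the two new tpm_class_ops callbacks, and tpm_transmit() then brackets each command with them whenever no locality is currently held:

struct foo_priv {
	struct {
		u32 loc_ctrl[5];	/* one per locality; made-up layout */
	} __iomem *regs;
};

static int foo_tpm_request_locality(struct tpm_chip *chip, int loc)
{
	struct foo_priv *priv = dev_get_drvdata(&chip->dev);

	/* interface-specific: ask for access to locality @loc ... */
	iowrite32(FOO_LOC_REQUEST, &priv->regs->loc_ctrl[loc]);
	/* ... and poll for the grant here, returning -ETIME on timeout */

	return loc;	/* >= 0: granted locality, cached in chip->locality */
}

static void foo_tpm_relinquish_locality(struct tpm_chip *chip, int loc)
{
	struct foo_priv *priv = dev_get_drvdata(&chip->dev);

	iowrite32(FOO_LOC_RELINQUISH, &priv->regs->loc_ctrl[loc]);
}

static const struct tpm_class_ops foo_tpm_ops = {
	/* .recv, .send, .status, etc. as before */
	.request_locality = foo_tpm_request_locality,
	.relinquish_locality = foo_tpm_relinquish_locality,
};

Both callbacks are optional: a driver that leaves them NULL keeps the old behavior, since tpm_transmit() checks the pointers before calling them.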
In a more general sense, this commit enables tpm_crb to be a well-behaved citizen in a multi-locality environment. Signed-off-by: Jarkko Sakkinen Reviewed-by: Jerry Snitselaar Tested-by: Jerry Snitselaar --- drivers/char/tpm/tpm-chip.c | 1 + drivers/char/tpm/tpm-interface.c | 16 ++++++++++++++++ drivers/char/tpm/tpm.h | 3 +++ drivers/char/tpm/tpm_crb.c | 41 ++++++++++++++++++++++++++++++++++++++++ include/linux/tpm.h | 3 ++- 5 files changed, 63 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c index aade6995f310..a321bd57f3e9 100644 --- a/drivers/char/tpm/tpm-chip.c +++ b/drivers/char/tpm/tpm-chip.c @@ -231,6 +231,7 @@ struct tpm_chip *tpm_chip_alloc(struct device *pdev, goto out; } + chip->locality = -1; return chip; out: diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c index 16abbf9cb53a..158c1db83f05 100644 --- a/drivers/char/tpm/tpm-interface.c +++ b/drivers/char/tpm/tpm-interface.c @@ -389,6 +389,7 @@ ssize_t tpm_transmit(struct tpm_chip *chip, struct tpm_space *space, ssize_t len = 0; u32 count, ordinal; unsigned long stop; + bool need_locality; if (!tpm_validate_command(chip, space, buf, bufsiz)) return -EINVAL; @@ -412,6 +413,16 @@ ssize_t tpm_transmit(struct tpm_chip *chip, struct tpm_space *space, if (chip->dev.parent) pm_runtime_get_sync(chip->dev.parent); + /* Store the decision as chip->locality will be changed. */ + need_locality = chip->locality == -1; + + if (need_locality && chip->ops->request_locality) { + rc = chip->ops->request_locality(chip, 0); + if (rc < 0) + goto out_no_locality; + chip->locality = rc; + } + rc = tpm2_prepare_space(chip, space, ordinal, buf); if (rc) goto out; @@ -471,6 +482,11 @@ out_recv: rc = tpm2_commit_space(chip, space, ordinal, buf, &len); out: + if (need_locality && chip->ops->relinquish_locality) { + chip->ops->relinquish_locality(chip, chip->locality); + chip->locality = -1; + } +out_no_locality: if (chip->dev.parent) pm_runtime_put_sync(chip->dev.parent); diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h index 5eacb3fd2ed2..4b4c8dee3096 100644 --- a/drivers/char/tpm/tpm.h +++ b/drivers/char/tpm/tpm.h @@ -228,6 +228,9 @@ struct tpm_chip { struct tpm_space work_space; u32 nr_commands; u32 *cc_attrs_tbl; + + /* active locality */ + int locality; }; #define to_tpm_chip(d) container_of(d, struct tpm_chip, dev) diff --git a/drivers/char/tpm/tpm_crb.c b/drivers/char/tpm/tpm_crb.c index 9f3160912152..d91e47dc2d79 100644 --- a/drivers/char/tpm/tpm_crb.c +++ b/drivers/char/tpm/tpm_crb.c @@ -34,6 +34,16 @@ enum crb_defaults { CRB_ACPI_START_INDEX = 1, }; +enum crb_loc_ctrl { + CRB_LOC_CTRL_REQUEST_ACCESS = BIT(0), + CRB_LOC_CTRL_RELINQUISH = BIT(1), +}; + +enum crb_loc_state { + CRB_LOC_STATE_LOC_ASSIGNED = BIT(1), + CRB_LOC_STATE_TPM_REG_VALID_STS = BIT(7), +}; + enum crb_ctrl_req { CRB_CTRL_REQ_CMD_READY = BIT(0), CRB_CTRL_REQ_GO_IDLE = BIT(1), @@ -172,6 +182,35 @@ static int __maybe_unused crb_cmd_ready(struct device *dev, return 0; } +static int crb_request_locality(struct tpm_chip *chip, int loc) +{ + struct crb_priv *priv = dev_get_drvdata(&chip->dev); + u32 value = CRB_LOC_STATE_LOC_ASSIGNED | + CRB_LOC_STATE_TPM_REG_VALID_STS; + + if (!priv->regs_h) + return 0; + + iowrite32(CRB_LOC_CTRL_REQUEST_ACCESS, &priv->regs_h->loc_ctrl); + if (!crb_wait_for_reg_32(&priv->regs_h->loc_state, value, value, + TPM2_TIMEOUT_C)) { + dev_warn(&chip->dev, "TPM_LOC_STATE_x.requestAccess timed out\n"); + return -ETIME; + } + + return 0; +}
+static void crb_relinquish_locality(struct tpm_chip *chip, int loc) +{ + struct crb_priv *priv = dev_get_drvdata(&chip->dev); + + if (!priv->regs_h) + return; + + iowrite32(CRB_LOC_CTRL_RELINQUISH, &priv->regs_h->loc_ctrl); +} + static u8 crb_status(struct tpm_chip *chip) { struct crb_priv *priv = dev_get_drvdata(&chip->dev); @@ -278,6 +317,8 @@ static const struct tpm_class_ops tpm_crb = { .send = crb_send, .cancel = crb_cancel, .req_canceled = crb_req_canceled, + .request_locality = crb_request_locality, + .relinquish_locality = crb_relinquish_locality, .req_complete_mask = CRB_DRV_STS_COMPLETE, .req_complete_val = CRB_DRV_STS_COMPLETE, }; diff --git a/include/linux/tpm.h b/include/linux/tpm.h index da158f06e0b2..5a090f5ab335 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -48,7 +48,8 @@ struct tpm_class_ops { u8 (*status) (struct tpm_chip *chip); bool (*update_timeouts)(struct tpm_chip *chip, unsigned long *timeout_cap); - + int (*request_locality)(struct tpm_chip *chip, int loc); + void (*relinquish_locality)(struct tpm_chip *chip, int loc); }; #if defined(CONFIG_TCG_TPM) || defined(CONFIG_TCG_TPM_MODULE) -- cgit v1.2.3 From f65fd1aa4f9881d5540192d11f7b8ed2fec936db Mon Sep 17 00:00:00 2001 From: Sasha Neftin Date: Mon, 3 Apr 2017 16:02:50 -0500 Subject: PCI: Avoid FLR for Intel 82579 NICs Per Intel Specification Update 335553-002 (see link below), some 82579 network adapters advertise a Function Level Reset (FLR) capability, but they can hang when an FLR is triggered. To reproduce the problem, attach the device to a VM, then detach and try to attach again. Add a quirk to prevent the use of FLR on these devices. [bhelgaas: changelog, comments] Link: http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/82579lm-82579v-gigabit-network-connection-spec-update.pdf Signed-off-by: Sasha Neftin Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 6 ++++++ drivers/pci/quirks.c | 8 ++++++++ include/linux/pci.h | 2 ++ 3 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 7904d02ffdb9..bef14777bb30 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -3781,6 +3781,9 @@ static int pcie_flr(struct pci_dev *dev, int probe) if (!(cap & PCI_EXP_DEVCAP_FLR)) return -ENOTTY; + if (dev->dev_flags & PCI_DEV_FLAGS_NO_FLR_RESET) + return -ENOTTY; + if (probe) return 0; @@ -3801,6 +3804,9 @@ static int pci_af_flr(struct pci_dev *dev, int probe) if (!pos) return -ENOTTY; + if (dev->dev_flags & PCI_DEV_FLAGS_NO_FLR_RESET) + return -ENOTTY; + pci_read_config_byte(dev, pos + PCI_AF_CAP, &cap); if (!(cap & PCI_AF_CAP_TP) || !(cap & PCI_AF_CAP_FLR)) return -ENOTTY; diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index f754453fe754..823271b13d12 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -4633,3 +4633,11 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid); + +/* FLR may cause some 82579 devices to hang. 
*/ +static void quirk_intel_no_flr(struct pci_dev *dev) +{ + dev->dev_flags |= PCI_DEV_FLAGS_NO_FLR_RESET; +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x1502, quirk_intel_no_flr); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x1503, quirk_intel_no_flr); diff --git a/include/linux/pci.h b/include/linux/pci.h index eb3da1a04e6c..22cad2c66d59 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -178,6 +178,8 @@ enum pci_dev_flags { PCI_DEV_FLAGS_NO_PM_RESET = (__force pci_dev_flags_t) (1 << 7), /* Get VPD from function 0 VPD */ PCI_DEV_FLAGS_VPD_REF_F0 = (__force pci_dev_flags_t) (1 << 8), + /* Do not use FLR even if device advertises PCI_AF_CAP */ + PCI_DEV_FLAGS_NO_FLR_RESET = (__force pci_dev_flags_t) (1 << 10), }; enum pci_irq_reroute_variant { -- cgit v1.2.3 From c8b5d129ee293bcf972e7279ac996bb8a138505c Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Mon, 3 Apr 2017 15:50:03 +1000 Subject: net: usbnet: support 64bit stats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for the net stats64 counters to the usbnet core. With that in place, put the hooks into every usbnet driver to use it. This is a straightforward addition of 64bit counters for the RX and TX packet and byte counts. It is done in the same style as for the other net drivers that support stats64. Note that the other stats fields remain as 32bit sized values (error counts, etc). The motivation for adding this is that it is not particularly difficult to get the RX and TX byte counts to wrap on 32bit platforms. Signed-off-by: Greg Ungerer Acked-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/asix_devices.c | 3 +++ drivers/net/usb/ax88172a.c | 1 + drivers/net/usb/ax88179_178a.c | 1 + drivers/net/usb/cdc_mbim.c | 1 + drivers/net/usb/cdc_ncm.c | 1 + drivers/net/usb/dm9601.c | 1 + drivers/net/usb/int51x1.c | 1 + drivers/net/usb/mcs7830.c | 1 + drivers/net/usb/qmi_wwan.c | 1 + drivers/net/usb/rndis_host.c | 1 + drivers/net/usb/sierra_net.c | 1 + drivers/net/usb/smsc75xx.c | 1 + drivers/net/usb/smsc95xx.c | 1 + drivers/net/usb/sr9700.c | 1 + drivers/net/usb/sr9800.c | 1 + drivers/net/usb/usbnet.c | 55 ++++++++++++++++++++++++++++++++++++--- drivers/net/wireless/rndis_wlan.c | 1 + include/linux/usb/usbnet.h | 4 +++ 18 files changed, 73 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c index 38456d0bcfd2..a3aa0a27dfe5 100644 --- a/drivers/net/usb/asix_devices.c +++ b/drivers/net/usb/asix_devices.c @@ -206,6 +206,7 @@ static const struct net_device_ops ax88172_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, + .ndo_get_stats64 = usbnet_get_stats64, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_do_ioctl = asix_ioctl, @@ -591,6 +592,7 @@ static const struct net_device_ops ax88772_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, + .ndo_get_stats64 = usbnet_get_stats64, .ndo_set_mac_address = asix_set_mac_address, .ndo_validate_addr = eth_validate_addr, .ndo_do_ioctl = asix_ioctl, @@ -1044,6 +1046,7 @@ static const struct net_device_ops ax88178_netdev_ops = { .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, + .ndo_get_stats64 = usbnet_get_stats64, .ndo_set_mac_address = asix_set_mac_address, .ndo_validate_addr = eth_validate_addr,
.ndo_set_rx_mode = asix_set_multicast, diff --git a/drivers/net/usb/ax88172a.c b/drivers/net/usb/ax88172a.c index 6308386b09df..501576f53854 100644 --- a/drivers/net/usb/ax88172a.c +++ b/drivers/net/usb/ax88172a.c @@ -143,6 +143,7 @@ static const struct net_device_ops ax88172a_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, + .ndo_get_stats64 = usbnet_get_stats64, .ndo_set_mac_address = asix_set_mac_address, .ndo_validate_addr = eth_validate_addr, .ndo_do_ioctl = ax88172a_ioctl, diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index 4a0ae7ce83f6..51cf60092a18 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -959,6 +959,7 @@ static const struct net_device_ops ax88179_netdev_ops = { .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, + .ndo_get_stats64 = usbnet_get_stats64, .ndo_change_mtu = ax88179_change_mtu, .ndo_set_mac_address = ax88179_set_mac_addr, .ndo_validate_addr = eth_validate_addr, diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c index 3a98f3762a4c..a6b997cffd3b 100644 --- a/drivers/net/usb/cdc_mbim.c +++ b/drivers/net/usb/cdc_mbim.c @@ -100,6 +100,7 @@ static const struct net_device_ops cdc_mbim_netdev_ops = { .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, + .ndo_get_stats64 = usbnet_get_stats64, .ndo_change_mtu = cdc_ncm_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index b6c1d3abad96..bb3f71f9fbde 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -753,6 +753,7 @@ static const struct net_device_ops cdc_ncm_netdev_ops = { .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, + .ndo_get_stats64 = usbnet_get_stats64, .ndo_change_mtu = cdc_ncm_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, diff --git a/drivers/net/usb/dm9601.c b/drivers/net/usb/dm9601.c index fea1b64ca26a..b91f92e4e5f2 100644 --- a/drivers/net/usb/dm9601.c +++ b/drivers/net/usb/dm9601.c @@ -343,6 +343,7 @@ static const struct net_device_ops dm9601_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, + .ndo_get_stats64 = usbnet_get_stats64, .ndo_validate_addr = eth_validate_addr, .ndo_do_ioctl = dm9601_ioctl, .ndo_set_rx_mode = dm9601_set_multicast, diff --git a/drivers/net/usb/int51x1.c b/drivers/net/usb/int51x1.c index 4ff70b22c6ee..5a43b77a6b9c 100644 --- a/drivers/net/usb/int51x1.c +++ b/drivers/net/usb/int51x1.c @@ -144,6 +144,7 @@ static const struct net_device_ops int51x1_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, + .ndo_get_stats64 = usbnet_get_stats64, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_set_rx_mode = int51x1_set_multicast, diff --git a/drivers/net/usb/mcs7830.c b/drivers/net/usb/mcs7830.c index 5771ff261fa8..5a47e5510ca8 100644 --- a/drivers/net/usb/mcs7830.c +++ b/drivers/net/usb/mcs7830.c @@ -475,6 +475,7 @@ static const struct net_device_ops mcs7830_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, + .ndo_get_stats64 = usbnet_get_stats64, .ndo_validate_addr 
= eth_validate_addr, .ndo_do_ioctl = mcs7830_ioctl, .ndo_set_rx_mode = mcs7830_set_multicast, diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 629fe64cd74a..adbed261cc8a 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -542,6 +542,7 @@ static const struct net_device_ops qmi_wwan_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, + .ndo_get_stats64 = usbnet_get_stats64, .ndo_set_mac_address = qmi_wwan_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/usb/rndis_host.c b/drivers/net/usb/rndis_host.c index c5b21138b7eb..e96e2e5673d7 100644 --- a/drivers/net/usb/rndis_host.c +++ b/drivers/net/usb/rndis_host.c @@ -291,6 +291,7 @@ static const struct net_device_ops rndis_netdev_ops = { .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, + .ndo_get_stats64 = usbnet_get_stats64, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/usb/sierra_net.c b/drivers/net/usb/sierra_net.c index c8f60b887c22..2110ab3513f0 100644 --- a/drivers/net/usb/sierra_net.c +++ b/drivers/net/usb/sierra_net.c @@ -199,6 +199,7 @@ static const struct net_device_ops sierra_net_device_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, + .ndo_get_stats64 = usbnet_get_stats64, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c index 1ab0ff43c6a2..1ce01dbd494f 100644 --- a/drivers/net/usb/smsc75xx.c +++ b/drivers/net/usb/smsc75xx.c @@ -1381,6 +1381,7 @@ static const struct net_device_ops smsc75xx_netdev_ops = { .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, + .ndo_get_stats64 = usbnet_get_stats64, .ndo_change_mtu = smsc75xx_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c index 4a8bf960cbb9..c2f67cecdf5b 100644 --- a/drivers/net/usb/smsc95xx.c +++ b/drivers/net/usb/smsc95xx.c @@ -1248,6 +1248,7 @@ static const struct net_device_ops smsc95xx_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, + .ndo_get_stats64 = usbnet_get_stats64, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_do_ioctl = smsc95xx_ioctl, diff --git a/drivers/net/usb/sr9700.c b/drivers/net/usb/sr9700.c index 950a3a9466bd..317287f4409c 100644 --- a/drivers/net/usb/sr9700.c +++ b/drivers/net/usb/sr9700.c @@ -308,6 +308,7 @@ static const struct net_device_ops sr9700_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, + .ndo_get_stats64 = usbnet_get_stats64, .ndo_validate_addr = eth_validate_addr, .ndo_do_ioctl = sr9700_ioctl, .ndo_set_rx_mode = sr9700_set_multicast, diff --git a/drivers/net/usb/sr9800.c b/drivers/net/usb/sr9800.c index a696b628782c..9277a0f228df 100644 --- a/drivers/net/usb/sr9800.c +++ b/drivers/net/usb/sr9800.c @@ -679,6 +679,7 @@ static const struct net_device_ops sr9800_netdev_ops = { .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, .ndo_change_mtu = usbnet_change_mtu, + .ndo_get_stats64 = usbnet_get_stats64, .ndo_set_mac_address = sr_set_mac_address, .ndo_validate_addr 
= eth_validate_addr, .ndo_do_ioctl = sr_ioctl, diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 13d4ec5f6f34..9890656af735 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -316,6 +316,7 @@ static void __usbnet_status_stop_force(struct usbnet *dev) */ void usbnet_skb_return (struct usbnet *dev, struct sk_buff *skb) { + struct pcpu_sw_netstats *stats64 = this_cpu_ptr(dev->stats64); int status; if (test_bit(EVENT_RX_PAUSED, &dev->flags)) { @@ -327,8 +328,10 @@ void usbnet_skb_return (struct usbnet *dev, struct sk_buff *skb) if (skb->protocol == 0) skb->protocol = eth_type_trans (skb, dev->net); - dev->net->stats.rx_packets++; - dev->net->stats.rx_bytes += skb->len; + u64_stats_update_begin(&stats64->syncp); + stats64->rx_packets++; + stats64->rx_bytes += skb->len; + u64_stats_update_end(&stats64->syncp); netif_dbg(dev, rx_status, dev->net, "< rx, len %zu, type 0x%x\n", skb->len + sizeof (struct ethhdr), skb->protocol); @@ -981,6 +984,37 @@ int usbnet_set_link_ksettings(struct net_device *net, } EXPORT_SYMBOL_GPL(usbnet_set_link_ksettings); +void usbnet_get_stats64(struct net_device *net, struct rtnl_link_stats64 *stats) +{ + struct usbnet *dev = netdev_priv(net); + unsigned int start; + int cpu; + + netdev_stats_to_stats64(stats, &net->stats); + + for_each_possible_cpu(cpu) { + struct pcpu_sw_netstats *stats64; + u64 rx_packets, rx_bytes; + u64 tx_packets, tx_bytes; + + stats64 = per_cpu_ptr(dev->stats64, cpu); + + do { + start = u64_stats_fetch_begin_irq(&stats64->syncp); + rx_packets = stats64->rx_packets; + rx_bytes = stats64->rx_bytes; + tx_packets = stats64->tx_packets; + tx_bytes = stats64->tx_bytes; + } while (u64_stats_fetch_retry_irq(&stats64->syncp, start)); + + stats->rx_packets += rx_packets; + stats->rx_bytes += rx_bytes; + stats->tx_packets += tx_packets; + stats->tx_bytes += tx_bytes; + } +} +EXPORT_SYMBOL_GPL(usbnet_get_stats64); + u32 usbnet_get_link (struct net_device *net) { struct usbnet *dev = netdev_priv(net); @@ -1212,8 +1246,12 @@ static void tx_complete (struct urb *urb) struct usbnet *dev = entry->dev; if (urb->status == 0) { - dev->net->stats.tx_packets += entry->packets; - dev->net->stats.tx_bytes += entry->length; + struct pcpu_sw_netstats *stats64 = this_cpu_ptr(dev->stats64); + + u64_stats_update_begin(&stats64->syncp); + stats64->tx_packets += entry->packets; + stats64->tx_bytes += entry->length; + u64_stats_update_end(&stats64->syncp); } else { dev->net->stats.tx_errors++; @@ -1570,6 +1608,7 @@ void usbnet_disconnect (struct usb_interface *intf) usb_free_urb(dev->interrupt); kfree(dev->padding_pkt); + free_percpu(dev->stats64); free_netdev(net); } EXPORT_SYMBOL_GPL(usbnet_disconnect); @@ -1581,6 +1620,7 @@ static const struct net_device_ops usbnet_netdev_ops = { .ndo_tx_timeout = usbnet_tx_timeout, .ndo_set_rx_mode = usbnet_set_rx_mode, .ndo_change_mtu = usbnet_change_mtu, + .ndo_get_stats64 = usbnet_get_stats64, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, }; @@ -1642,6 +1682,11 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) dev->intf = udev; dev->driver_info = info; dev->driver_name = name; + + dev->stats64 = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->stats64) + goto out0; + dev->msg_enable = netif_msg_init (msg_level, NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK); init_waitqueue_head(&dev->wait); @@ -1781,6 +1826,8 @@ out1: */ cancel_work_sync(&dev->kevent); del_timer_sync(&dev->delay); + free_percpu(dev->stats64); +out0: 
free_netdev(net); out: return status; diff --git a/drivers/net/wireless/rndis_wlan.c b/drivers/net/wireless/rndis_wlan.c index 785334f7a538..3b68eaffb48c 100644 --- a/drivers/net/wireless/rndis_wlan.c +++ b/drivers/net/wireless/rndis_wlan.c @@ -3392,6 +3392,7 @@ static const struct net_device_ops rndis_wlan_netdev_ops = { .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, + .ndo_get_stats64 = usbnet_get_stats64, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_set_rx_mode = rndis_wlan_set_multicast_list, diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index e2b56917450f..7dffa5624ea6 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -64,6 +64,8 @@ struct usbnet { struct usb_anchor deferred; struct tasklet_struct bh; + struct pcpu_sw_netstats __percpu *stats64; + struct work_struct kevent; unsigned long flags; # define EVENT_TX_HALT 0 @@ -278,5 +280,7 @@ extern int usbnet_status_start(struct usbnet *dev, gfp_t mem_flags); extern void usbnet_status_stop(struct usbnet *dev); extern void usbnet_update_max_qlen(struct usbnet *dev); +extern void usbnet_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats); #endif /* __LINUX_USB_USBNET_H */ -- cgit v1.2.3 From f9dc4d1f0d6f75c102ee13c0a939d9ae880a3c1e Mon Sep 17 00:00:00 2001 From: Ram Amrani Date: Mon, 3 Apr 2017 12:21:13 +0300 Subject: qed: Manage with less memory regions for RoCE It's possible that some configurations would prevent the driver from utilizing all the Memory Regions due to a lack of ILT lines. In such a case, calculate how many memory regions would have to be dropped due to the limit, and manage without those. Signed-off-by: Ram Amrani Signed-off-by: Yuval Mintz Signed-off-by: David S.
Miller --- drivers/net/ethernet/qlogic/qed/qed_cxt.c | 113 +++++++++++++++++++++-------- drivers/net/ethernet/qlogic/qed/qed_cxt.h | 15 +++- drivers/net/ethernet/qlogic/qed/qed_dev.c | 33 ++++++++- drivers/net/ethernet/qlogic/qed/qed_main.c | 1 - include/linux/qed/qed_if.h | 1 - 5 files changed, 125 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c index 8db023422bfd..485b8b22ec7a 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c +++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c @@ -543,7 +543,22 @@ static u32 qed_ilt_get_dynamic_line_cnt(struct qed_hwfn *p_hwfn, return lines_to_skip; } -int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn) +static struct qed_ilt_client_cfg *qed_cxt_set_cli(struct qed_ilt_client_cfg + *p_cli) +{ + p_cli->active = false; + p_cli->first.val = 0; + p_cli->last.val = 0; + return p_cli; +} + +static struct qed_ilt_cli_blk *qed_cxt_set_blk(struct qed_ilt_cli_blk *p_blk) +{ + p_blk->total_size = 0; + return p_blk; +} + +int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn, u32 *line_count) { struct qed_cxt_mngr *p_mngr = p_hwfn->p_cxt_mngr; u32 curr_line, total, i, task_size, line; @@ -567,7 +582,8 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn) p_hwfn->my_id, p_hwfn->p_cxt_mngr->pf_start_line); /* CDUC */ - p_cli = &p_mngr->clients[ILT_CLI_CDUC]; + p_cli = qed_cxt_set_cli(&p_mngr->clients[ILT_CLI_CDUC]); + curr_line = p_mngr->pf_start_line; /* CDUC PF */ @@ -576,7 +592,7 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn) /* get the counters for the CDUC and QM clients */ qed_cxt_cdu_iids(p_mngr, &cdu_iids); - p_blk = &p_cli->pf_blks[CDUC_BLK]; + p_blk = qed_cxt_set_blk(&p_cli->pf_blks[CDUC_BLK]); total = cdu_iids.pf_cids * CONN_CXT_SIZE(p_hwfn); @@ -590,7 +606,7 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn) ILT_CLI_CDUC); /* CDUC VF */ - p_blk = &p_cli->vf_blks[CDUC_BLK]; + p_blk = qed_cxt_set_blk(&p_cli->vf_blks[CDUC_BLK]); total = cdu_iids.per_vf_cids * CONN_CXT_SIZE(p_hwfn); qed_ilt_cli_blk_fill(p_cli, p_blk, curr_line, @@ -604,7 +620,7 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn) ILT_CLI_CDUC); /* CDUT PF */ - p_cli = &p_mngr->clients[ILT_CLI_CDUT]; + p_cli = qed_cxt_set_cli(&p_mngr->clients[ILT_CLI_CDUT]); p_cli->first.val = curr_line; /* first the 'working' task memory */ @@ -613,7 +629,7 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn) if (!p_seg || p_seg->count == 0) continue; - p_blk = &p_cli->pf_blks[CDUT_SEG_BLK(i)]; + p_blk = qed_cxt_set_blk(&p_cli->pf_blks[CDUT_SEG_BLK(i)]); total = p_seg->count * p_mngr->task_type_size[p_seg->type]; qed_ilt_cli_blk_fill(p_cli, p_blk, curr_line, total, p_mngr->task_type_size[p_seg->type]); @@ -628,7 +644,8 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn) if (!p_seg || p_seg->count == 0) continue; - p_blk = &p_cli->pf_blks[CDUT_FL_SEG_BLK(i, PF)]; + p_blk = + qed_cxt_set_blk(&p_cli->pf_blks[CDUT_FL_SEG_BLK(i, PF)]); if (!p_seg->has_fl_mem) { /* The segment is active (total size pf 'working' @@ -673,7 +690,7 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn) /* 'working' memory */ total = p_seg->count * p_mngr->task_type_size[p_seg->type]; - p_blk = &p_cli->vf_blks[CDUT_SEG_BLK(0)]; + p_blk = qed_cxt_set_blk(&p_cli->vf_blks[CDUT_SEG_BLK(0)]); qed_ilt_cli_blk_fill(p_cli, p_blk, curr_line, total, p_mngr->task_type_size[p_seg->type]); @@ -682,7 +699,8 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn) ILT_CLI_CDUT); /* 'init' memory */ - p_blk = 
&p_cli->vf_blks[CDUT_FL_SEG_BLK(0, VF)]; + p_blk = + qed_cxt_set_blk(&p_cli->vf_blks[CDUT_FL_SEG_BLK(0, VF)]); if (!p_seg->has_fl_mem) { /* see comment above */ line = p_cli->vf_blks[CDUT_SEG_BLK(0)].start_line; @@ -710,8 +728,8 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn) } /* QM */ - p_cli = &p_mngr->clients[ILT_CLI_QM]; - p_blk = &p_cli->pf_blks[0]; + p_cli = qed_cxt_set_cli(&p_mngr->clients[ILT_CLI_QM]); + p_blk = qed_cxt_set_blk(&p_cli->pf_blks[0]); qed_cxt_qm_iids(p_hwfn, &qm_iids); total = qed_qm_pf_mem_size(p_hwfn->rel_pf_id, qm_iids.cids, @@ -735,7 +753,7 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn) p_cli->pf_total_lines = curr_line - p_blk->start_line; /* SRC */ - p_cli = &p_mngr->clients[ILT_CLI_SRC]; + p_cli = qed_cxt_set_cli(&p_mngr->clients[ILT_CLI_SRC]); qed_cxt_src_iids(p_mngr, &src_iids); /* Both the PF and VFs searcher connections are stored in the per PF @@ -749,7 +767,7 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn) total = roundup_pow_of_two(local_max); - p_blk = &p_cli->pf_blks[0]; + p_blk = qed_cxt_set_blk(&p_cli->pf_blks[0]); qed_ilt_cli_blk_fill(p_cli, p_blk, curr_line, total * sizeof(struct src_ent), sizeof(struct src_ent)); @@ -760,11 +778,11 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn) } /* TM PF */ - p_cli = &p_mngr->clients[ILT_CLI_TM]; + p_cli = qed_cxt_set_cli(&p_mngr->clients[ILT_CLI_TM]); qed_cxt_tm_iids(p_hwfn, p_mngr, &tm_iids); total = tm_iids.pf_cids + tm_iids.pf_tids_total; if (total) { - p_blk = &p_cli->pf_blks[0]; + p_blk = qed_cxt_set_blk(&p_cli->pf_blks[0]); qed_ilt_cli_blk_fill(p_cli, p_blk, curr_line, total * TM_ELEM_SIZE, TM_ELEM_SIZE); @@ -776,7 +794,7 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn) /* TM VF */ total = tm_iids.per_vf_cids + tm_iids.per_vf_tids; if (total) { - p_blk = &p_cli->vf_blks[0]; + p_blk = qed_cxt_set_blk(&p_cli->vf_blks[0]); qed_ilt_cli_blk_fill(p_cli, p_blk, curr_line, total * TM_ELEM_SIZE, TM_ELEM_SIZE); @@ -793,8 +811,8 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn) total = qed_cxt_get_srq_count(p_hwfn); if (total) { - p_cli = &p_mngr->clients[ILT_CLI_TSDM]; - p_blk = &p_cli->pf_blks[SRQ_BLK]; + p_cli = qed_cxt_set_cli(&p_mngr->clients[ILT_CLI_TSDM]); + p_blk = qed_cxt_set_blk(&p_cli->pf_blks[SRQ_BLK]); qed_ilt_cli_blk_fill(p_cli, p_blk, curr_line, total * SRQ_CXT_SIZE, SRQ_CXT_SIZE); @@ -803,13 +821,50 @@ int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn) p_cli->pf_total_lines = curr_line - p_blk->start_line; } + *line_count = curr_line - p_hwfn->p_cxt_mngr->pf_start_line; + if (curr_line - p_hwfn->p_cxt_mngr->pf_start_line > - RESC_NUM(p_hwfn, QED_ILT)) { - DP_ERR(p_hwfn, "too many ilt lines...#lines=%d\n", - curr_line - p_hwfn->p_cxt_mngr->pf_start_line); + RESC_NUM(p_hwfn, QED_ILT)) return -EINVAL; + + return 0; +} + +u32 qed_cxt_cfg_ilt_compute_excess(struct qed_hwfn *p_hwfn, u32 used_lines) +{ + struct qed_ilt_client_cfg *p_cli; + u32 excess_lines, available_lines; + struct qed_cxt_mngr *p_mngr; + u32 ilt_page_size, elem_size; + struct qed_tid_seg *p_seg; + int i; + + available_lines = RESC_NUM(p_hwfn, QED_ILT); + excess_lines = used_lines - available_lines; + + if (!excess_lines) + return 0; + + if (p_hwfn->hw_info.personality != QED_PCI_ETH_ROCE) + return 0; + + p_mngr = p_hwfn->p_cxt_mngr; + p_cli = &p_mngr->clients[ILT_CLI_CDUT]; + ilt_page_size = ILT_PAGE_IN_BYTES(p_cli->p_size.val); + + for (i = 0; i < NUM_TASK_PF_SEGMENTS; i++) { + p_seg = qed_cxt_tid_seg_info(p_hwfn, i); + if (!p_seg || p_seg->count == 0) + continue; + + elem_size = 
p_mngr->task_type_size[p_seg->type]; + if (!elem_size) + continue; + + return (ilt_page_size / elem_size) * excess_lines; } + DP_NOTICE(p_hwfn, "failed computing excess ILT lines\n"); return 0; } @@ -1893,13 +1948,12 @@ int qed_cxt_get_cid_info(struct qed_hwfn *p_hwfn, struct qed_cxt_info *p_info) } static void qed_rdma_set_pf_params(struct qed_hwfn *p_hwfn, - struct qed_rdma_pf_params *p_params) + struct qed_rdma_pf_params *p_params, + u32 num_tasks) { - u32 num_cons, num_tasks, num_qps, num_mrs, num_srqs; + u32 num_cons, num_qps, num_srqs; enum protocol_type proto; - num_mrs = min_t(u32, RDMA_MAX_TIDS, p_params->num_mrs); - num_tasks = num_mrs; /* each mr uses a single task id */ num_srqs = min_t(u32, 32 * 1024, p_params->num_srqs); switch (p_hwfn->hw_info.personality) { @@ -1928,7 +1982,7 @@ static void qed_rdma_set_pf_params(struct qed_hwfn *p_hwfn, } } -int qed_cxt_set_pf_params(struct qed_hwfn *p_hwfn) +int qed_cxt_set_pf_params(struct qed_hwfn *p_hwfn, u32 rdma_tasks) { /* Set the number of required CORE connections */ u32 core_cids = 1; /* SPQ */ @@ -1940,9 +1994,10 @@ int qed_cxt_set_pf_params(struct qed_hwfn *p_hwfn) switch (p_hwfn->hw_info.personality) { case QED_PCI_ETH_ROCE: { - qed_rdma_set_pf_params(p_hwfn, - &p_hwfn-> - pf_params.rdma_pf_params); + qed_rdma_set_pf_params(p_hwfn, + &p_hwfn-> + pf_params.rdma_pf_params, + rdma_tasks); /* no need for break since RoCE coexist with Ethernet */ } case QED_PCI_ETH: diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.h b/drivers/net/ethernet/qlogic/qed/qed_cxt.h index 8b010324268a..f34b2889f4bb 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_cxt.h +++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.h @@ -105,19 +105,28 @@ u32 qed_cxt_get_proto_cid_count(struct qed_hwfn *p_hwfn, * @brief qed_cxt_set_pf_params - Set the PF params for cxt init * * @param p_hwfn - * + * @param rdma_tasks - requested maximum * @return int */ -int qed_cxt_set_pf_params(struct qed_hwfn *p_hwfn); +int qed_cxt_set_pf_params(struct qed_hwfn *p_hwfn, u32 rdma_tasks); /** * @brief qed_cxt_cfg_ilt_compute - compute ILT init parameters * * @param p_hwfn + * @param last_line * * @return int */ -int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn); +int qed_cxt_cfg_ilt_compute(struct qed_hwfn *p_hwfn, u32 *last_line); + +/** + * @brief qed_cxt_cfg_ilt_compute_excess - how many lines can be decreased + * + * @param p_hwfn + * @param used_lines + */ +u32 qed_cxt_cfg_ilt_compute_excess(struct qed_hwfn *p_hwfn, u32 used_lines); /** * @brief qed_cxt_mngr_alloc - Allocate and init the context manager struct diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index b48c80ec4e5b..249878533fd9 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -848,8 +848,10 @@ int qed_resc_alloc(struct qed_dev *cdev) #ifdef CONFIG_QED_LL2 struct qed_ll2_info *p_ll2_info; #endif + u32 rdma_tasks, excess_tasks; struct qed_consq *p_consq; struct qed_eq *p_eq; + u32 line_count; int i, rc = 0; if (IS_VF(cdev)) @@ -871,7 +873,7 @@ int qed_resc_alloc(struct qed_dev *cdev) /* Set the HW cid/tid numbers (in the contest manager) * Must be done prior to any further computations. 
*/ - rc = qed_cxt_set_pf_params(p_hwfn); + rc = qed_cxt_set_pf_params(p_hwfn, RDMA_MAX_TIDS); if (rc) goto alloc_err; @@ -883,9 +885,32 @@ int qed_resc_alloc(struct qed_dev *cdev) qed_init_qm_info(p_hwfn); /* Compute the ILT client partition */ - rc = qed_cxt_cfg_ilt_compute(p_hwfn); - if (rc) - goto alloc_err; + rc = qed_cxt_cfg_ilt_compute(p_hwfn, &line_count); + if (rc) { + DP_NOTICE(p_hwfn, + "too many ILT lines; re-computing with less lines\n"); + /* In case there are not enough ILT lines we reduce the + * number of RDMA tasks and re-compute. + */ + excess_tasks = + qed_cxt_cfg_ilt_compute_excess(p_hwfn, line_count); + if (!excess_tasks) + goto alloc_err; + + rdma_tasks = RDMA_MAX_TIDS - excess_tasks; + rc = qed_cxt_set_pf_params(p_hwfn, rdma_tasks); + if (rc) + goto alloc_err; + + rc = qed_cxt_cfg_ilt_compute(p_hwfn, &line_count); + if (rc) { + DP_ERR(p_hwfn, + "failed ILT compute. Requested too many lines: %u\n", + line_count); + + goto alloc_err; + } + } /* CID map / ILT shadow table / T2 * The talbes sizes are determined by the computations above diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index d4edb993b1b0..634e7a2433a9 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -877,7 +877,6 @@ static void qed_update_pf_params(struct qed_dev *cdev, params->rdma_pf_params.num_qps = QED_ROCE_QPS; params->rdma_pf_params.min_dpis = QED_ROCE_DPIS; /* divide by 3 the MRs to avoid MF ILT overflow */ - params->rdma_pf_params.num_mrs = RDMA_MAX_TIDS; params->rdma_pf_params.gl_pi = QED_ROCE_PROTOCOL_INDEX; } diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 8e0065c52857..625f80f08f91 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -263,7 +263,6 @@ struct qed_rdma_pf_params { * the doorbell BAR). */ u32 min_dpis; /* number of requested DPIs */ - u32 num_mrs; /* number of requested memory regions */ u32 num_qps; /* number of requested Queue Pairs */ u32 num_srqs; /* number of requested SRQ */ u8 roce_edpm_mode; /* see QED_ROCE_EDPM_MODE_ENABLE */ -- cgit v1.2.3 From 815429b39d94c64f6d05eed9e7c1a9bdfdd5bd70 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Wed, 29 Mar 2017 19:30:17 +0900 Subject: extcon: Add new extcon_register_notifier_all() to monitor all external connectors The extcon core already provides the extcon_register_notifier() function in order to register a notifier block which is used to monitor the state change of one specific external connector such as EXTCON_USB, EXTCON_USB_HOST and so on. The extcon consumer uses this function. An extcon consumer might instead need to monitor all of the supported external connectors of the extcon device. In that case, the extcon consumer would need a separate notifier_block structure for each external connector. This patch adds the new extcon_register_notifier_all() function, with which an extcon consumer can monitor the state change of all supported external connectors by using only one notifier_block structure.
- List of new added functions: int extcon_register_notifier_all(struct extcon_dev *edev, struct notifier_block *nb); int extcon_unregister_notifier_all(struct extcon_dev *edev, struct notifier_block *nb); int devm_extcon_register_notifier_all(struct device *dev, struct extcon_dev *edev, struct notifier_block *nb); void devm_extcon_unregister_notifier_all(struct device *dev, struct extcon_dev *edev, struct notifier_block *nb); Suggested-by: Hans de Goede Signed-off-by: Chanwoo Choi Tested-by: Hans de Goede Acked-by: Hans de Goede --- drivers/extcon/devres.c | 61 +++++++++++++++++++++++++++++++++++++++++++++ drivers/extcon/extcon.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++ drivers/extcon/extcon.h | 3 +++ include/linux/extcon.h | 21 ++++++++++++---- 4 files changed, 146 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/extcon/devres.c b/drivers/extcon/devres.c index b40eb1805927..186fd735eb28 100644 --- a/drivers/extcon/devres.c +++ b/drivers/extcon/devres.c @@ -50,6 +50,13 @@ static void devm_extcon_dev_notifier_unreg(struct device *dev, void *res) extcon_unregister_notifier(this->edev, this->id, this->nb); } +static void devm_extcon_dev_notifier_all_unreg(struct device *dev, void *res) +{ + struct extcon_dev_notifier_devres *this = res; + + extcon_unregister_notifier_all(this->edev, this->nb); +} + /** * devm_extcon_dev_allocate - Allocate managed extcon device * @dev: device owning the extcon device being created @@ -214,3 +221,57 @@ void devm_extcon_unregister_notifier(struct device *dev, devm_extcon_dev_match, edev)); } EXPORT_SYMBOL(devm_extcon_unregister_notifier); + +/** + * devm_extcon_register_notifier_all() + * - Resource-managed extcon_register_notifier_all() + * @dev: device to allocate extcon device + * @edev: the extcon device that has the external connecotr. + * @nb: a notifier block to be registered. + * + * This function manages automatically the notifier of extcon device using + * device resource management and simplify the control of unregistering + * the notifier of extcon device. To get more information, refer that function. + * + * Returns 0 if success or negaive error number if failure. + */ +int devm_extcon_register_notifier_all(struct device *dev, struct extcon_dev *edev, + struct notifier_block *nb) +{ + struct extcon_dev_notifier_devres *ptr; + int ret; + + ptr = devres_alloc(devm_extcon_dev_notifier_all_unreg, sizeof(*ptr), + GFP_KERNEL); + if (!ptr) + return -ENOMEM; + + ret = extcon_register_notifier_all(edev, nb); + if (ret) { + devres_free(ptr); + return ret; + } + + ptr->edev = edev; + ptr->nb = nb; + devres_add(dev, ptr); + + return 0; +} +EXPORT_SYMBOL(devm_extcon_register_notifier_all); + +/** + * devm_extcon_unregister_notifier_all() + * - Resource-managed extcon_unregister_notifier_all() + * @dev: device to allocate extcon device + * @edev: the extcon device that has the external connecotr. + * @nb: a notifier block to be registered. 
+ */ +void devm_extcon_unregister_notifier_all(struct device *dev, + struct extcon_dev *edev, + struct notifier_block *nb) +{ + WARN_ON(devres_release(dev, devm_extcon_dev_notifier_all_unreg, + devm_extcon_dev_match, edev)); +} +EXPORT_SYMBOL(devm_extcon_unregister_notifier_all); diff --git a/drivers/extcon/extcon.c b/drivers/extcon/extcon.c index 09ac5e70c2f3..e7750545469f 100644 --- a/drivers/extcon/extcon.c +++ b/drivers/extcon/extcon.c @@ -448,8 +448,19 @@ int extcon_sync(struct extcon_dev *edev, unsigned int id) spin_lock_irqsave(&edev->lock, flags); state = !!(edev->state & BIT(index)); + + /* + * Call functions in a raw notifier chain for the specific + * external connector. + */ raw_notifier_call_chain(&edev->nh[index], state, edev); + /* + * Call functions in a raw notifier chain for all supported + * external connectors. + */ + raw_notifier_call_chain(&edev->nh_all, state, edev); + /* This could be in interrupt handler */ prop_buf = (char *)get_zeroed_page(GFP_ATOMIC); if (!prop_buf) { @@ -954,6 +965,59 @@ int extcon_unregister_notifier(struct extcon_dev *edev, unsigned int id, } EXPORT_SYMBOL_GPL(extcon_unregister_notifier); +/** + * extcon_register_notifier_all() - Register a notifier block for all connectors + * @edev: the extcon device that has the external connector. + * @nb: a notifier block to be registered. + * + * This function registers a notifier block in order to receive the state + * changes of all supported external connectors of the extcon device. + * The second parameter given to the callback of nb (val) is the current + * state, and the third parameter is the edev pointer. + * + * Returns 0 on success or an error number on failure. + */ +int extcon_register_notifier_all(struct extcon_dev *edev, + struct notifier_block *nb) +{ + unsigned long flags; + int ret; + + if (!edev || !nb) + return -EINVAL; + + spin_lock_irqsave(&edev->lock, flags); + ret = raw_notifier_chain_register(&edev->nh_all, nb); + spin_unlock_irqrestore(&edev->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(extcon_register_notifier_all); + +/** + * extcon_unregister_notifier_all() - Unregister a notifier block from extcon. + * @edev: the extcon device that has the external connector. + * @nb: a notifier block to be unregistered. + * + * Returns 0 on success or an error number on failure. + */ +int extcon_unregister_notifier_all(struct extcon_dev *edev, + struct notifier_block *nb) +{ + unsigned long flags; + int ret; + + if (!edev || !nb) + return -EINVAL; + + spin_lock_irqsave(&edev->lock, flags); + ret = raw_notifier_chain_unregister(&edev->nh_all, nb); + spin_unlock_irqrestore(&edev->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(extcon_unregister_notifier_all); + static struct attribute *extcon_attrs[] = { &dev_attr_state.attr, &dev_attr_name.attr, @@ -1212,6 +1276,8 @@ int extcon_dev_register(struct extcon_dev *edev) for (index = 0; index < edev->max_supported; index++) RAW_INIT_NOTIFIER_HEAD(&edev->nh[index]); + RAW_INIT_NOTIFIER_HEAD(&edev->nh_all); + dev_set_drvdata(&edev->dev, edev); edev->state = 0; diff --git a/drivers/extcon/extcon.h b/drivers/extcon/extcon.h index 993ddccafe11..dddddcfa0587 100644 --- a/drivers/extcon/extcon.h +++ b/drivers/extcon/extcon.h @@ -21,6 +21,8 @@ * @dev: Device of this extcon. * @state: Attach/detach state of this extcon. Do not provide at * register-time. + * @nh_all: Notifier for the state change events for all supported + * external connectors from this extcon.
* @nh: Notifier for the state change events from this extcon * @entry: To support list of extcon devices so that users can * search for extcon devices based on the extcon name. @@ -43,6 +45,7 @@ struct extcon_dev { /* Internal data. Please do not set. */ struct device dev; + struct raw_notifier_head nh_all; struct raw_notifier_head *nh; struct list_head entry; int max_supported; diff --git a/include/linux/extcon.h b/include/linux/extcon.h index 7010fb01a81a..7e206a9f88db 100644 --- a/include/linux/extcon.h +++ b/include/linux/extcon.h @@ -236,11 +236,11 @@ extern int extcon_set_property_capability(struct extcon_dev *edev, unsigned int id, unsigned int prop); /* - * Following APIs are to monitor every action of a notifier. - * Registrar gets notified for every external port of a connection device. - * Probably this could be used to debug an action of notifier; however, - * we do not recommend to use this for normal 'notifiee' device drivers who - * want to be notified by a specific external port of the notifier. + * The following APIs are used to monitor the status changes of the external connectors. + * extcon_register_notifier(*edev, id, *nb) : Register a notifier block + * for a specific external connector of the extcon. + * extcon_register_notifier_all(*edev, *nb) : Register a notifier block + * for all supported external connectors of the extcon. */ extern int extcon_register_notifier(struct extcon_dev *edev, unsigned int id, struct notifier_block *nb); @@ -253,6 +253,17 @@ extern void devm_extcon_unregister_notifier(struct device *dev, struct extcon_dev *edev, unsigned int id, struct notifier_block *nb); +extern int extcon_register_notifier_all(struct extcon_dev *edev, + struct notifier_block *nb); +extern int extcon_unregister_notifier_all(struct extcon_dev *edev, + struct notifier_block *nb); +extern int devm_extcon_register_notifier_all(struct device *dev, + struct extcon_dev *edev, + struct notifier_block *nb); +extern void devm_extcon_unregister_notifier_all(struct device *dev, + struct extcon_dev *edev, + struct notifier_block *nb); + /* * Following API gets the extcon device from the devicetree. * This function uses a devicetree phandle to get the extcon device directly. -- cgit v1.2.3 From e96a7705e7d3fef96aec9b590c63b2f6f7d2ba22 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Thu, 23 Mar 2017 15:56:08 +0100 Subject: sched/rtmutex/deadline: Fix a PI crash for deadline tasks A crash happened while I was playing with deadline PI rtmutex. BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 IP: [] rt_mutex_get_top_task+0x1f/0x30 PGD 232a75067 PUD 230947067 PMD 0 Oops: 0000 [#1] SMP CPU: 1 PID: 10994 Comm: a.out Not tainted Call Trace: [] enqueue_task+0x2c/0x80 [] activate_task+0x23/0x30 [] pull_dl_task+0x1d5/0x260 [] pre_schedule_dl+0x16/0x20 [] __schedule+0xd3/0x900 [] schedule+0x29/0x70 [] __rt_mutex_slowlock+0x4b/0xc0 [] rt_mutex_slowlock+0xd1/0x190 [] rt_mutex_timed_lock+0x53/0x60 [] futex_lock_pi.isra.18+0x28c/0x390 [] do_futex+0x190/0x5b0 [] SyS_futex+0x80/0x180 This is because rt_mutex_enqueue_pi() and rt_mutex_dequeue_pi() are only protected by pi_lock when operating on pi waiters, while rt_mutex_get_top_task() will access them with the rq lock held but without holding pi_lock. In order to tackle it, we introduce a new "pi_top_task" pointer cached in task_struct, and add a new rt_mutex_update_top_task() to update its value; it can be called by rt_mutex_setprio(), which holds both the owner's pi_lock and the rq lock. Thus "pi_top_task" can be safely accessed by enqueue_task_dl() under the rq lock.
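As an illustration only (not code from this patch; the locals elided below exist in the real call sites), the locking rule this cache establishes looks like:

	struct rq_flags rf;
	struct rq *rq;
	unsigned long flags;

	/* Writer: update the cached pointer with both locks held,
	 * as rt_mutex_setprio() does:
	 */
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	rq = __task_rq_lock(p, &rf);
	rt_mutex_update_top_task(p);		/* writes p->pi_top_task */
	__task_rq_unlock(rq, &rf);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

	/* Reader: either lock alone now suffices, which is what lets
	 * enqueue_task_dl() dereference it under the rq lock only:
	 */
	struct task_struct *top = rt_mutex_get_top_task(p);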
Originally-From: Peter Zijlstra Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Steven Rostedt Reviewed-by: Thomas Gleixner Cc: juri.lelli@arm.com Cc: bigeasy@linutronix.de Cc: mathieu.desnoyers@efficios.com Cc: jdesfossez@efficios.com Cc: bristot@redhat.com Link: http://lkml.kernel.org/r/20170323150216.157682758@infradead.org Signed-off-by: Thomas Gleixner --- include/linux/init_task.h | 1 + include/linux/sched.h | 2 ++ include/linux/sched/rt.h | 1 + kernel/fork.c | 1 + kernel/locking/rtmutex.c | 29 +++++++++++++++++++++-------- kernel/sched/core.c | 2 ++ 6 files changed, 28 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 91d9049f0039..2c487e0879d5 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -181,6 +181,7 @@ extern struct cred init_cred; #ifdef CONFIG_RT_MUTEXES # define INIT_RT_MUTEXES(tsk) \ .pi_waiters = RB_ROOT, \ + .pi_top_task = NULL, \ .pi_waiters_leftmost = NULL, #else # define INIT_RT_MUTEXES(tsk) diff --git a/include/linux/sched.h b/include/linux/sched.h index d67eee84fd43..1ea2eee7bc4f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -775,6 +775,8 @@ struct task_struct { /* PI waiters blocked on a rt_mutex held by this task: */ struct rb_root pi_waiters; struct rb_node *pi_waiters_leftmost; + /* Updated under owner's pi_lock and rq lock */ + struct task_struct *pi_top_task; /* Deadlock detection and priority inheritance handling: */ struct rt_mutex_waiter *pi_blocked_on; #endif diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index 3bd668414f61..10ee7eeb0ee2 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -21,6 +21,7 @@ static inline int rt_task(struct task_struct *p) extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio); +extern void rt_mutex_update_top_task(struct task_struct *p); extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task); extern void rt_mutex_adjust_pi(struct task_struct *p); static inline bool tsk_is_pi_blocked(struct task_struct *tsk) diff --git a/kernel/fork.c b/kernel/fork.c index 6c463c80e93d..b30196a00b0d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1438,6 +1438,7 @@ static void rt_mutex_init_task(struct task_struct *p) #ifdef CONFIG_RT_MUTEXES p->pi_waiters = RB_ROOT; p->pi_waiters_leftmost = NULL; + p->pi_top_task = NULL; p->pi_blocked_on = NULL; #endif } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 71ecf0624410..bc05b104eaed 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -322,6 +322,19 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) RB_CLEAR_NODE(&waiter->pi_tree_entry); } +/* + * Must hold both p->pi_lock and task_rq(p)->lock. + */ +void rt_mutex_update_top_task(struct task_struct *p) +{ + if (!task_has_pi_waiters(p)) { + p->pi_top_task = NULL; + return; + } + + p->pi_top_task = task_top_pi_waiter(p)->task; +} + /* * Calculate task priority from the waiter tree priority * @@ -337,12 +350,12 @@ int rt_mutex_getprio(struct task_struct *task) task->normal_prio); } +/* + * Must hold either p->pi_lock or task_rq(p)->lock. 
+ */ struct task_struct *rt_mutex_get_top_task(struct task_struct *task) { - if (likely(!task_has_pi_waiters(task))) - return NULL; - - return task_top_pi_waiter(task)->task; + return task->pi_top_task; } /* @@ -351,12 +364,12 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task) */ int rt_mutex_get_effective_prio(struct task_struct *task, int newprio) { - if (!task_has_pi_waiters(task)) + struct task_struct *top_task = rt_mutex_get_top_task(task); + + if (!top_task) return newprio; - if (task_top_pi_waiter(task)->task->prio <= newprio) - return task_top_pi_waiter(task)->task->prio; - return newprio; + return min(top_task->prio, newprio); } /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ab9f6ac099a7..e1f44ec701c8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3713,6 +3713,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) goto out_unlock; } + rt_mutex_update_top_task(p); + trace_sched_pi_setprio(p, prio); oldprio = p->prio; -- cgit v1.2.3 From acd58620e415aee4a43a808d7d2fd87259ee0001 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Mar 2017 15:56:11 +0100 Subject: sched/rtmutex: Refactor rt_mutex_setprio() With the introduction of SCHED_DEADLINE the whole notion that priority is a single number is gone, therefore the @prio argument to rt_mutex_setprio() doesn't make sense anymore. So rework the code to pass a pi_task instead. Note this also fixes a problem with pi_top_task caching; previously we would not set the pointer (call rt_mutex_update_top_task) if the priority didn't change, which could lead to a stale pointer. As for the XXX, I think it's fine to use pi_task->prio, because if it differs from waiter->prio, a PI chain update is imminent. Signed-off-by: Peter Zijlstra (Intel) Cc: juri.lelli@arm.com Cc: bigeasy@linutronix.de Cc: xlpang@redhat.com Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@efficios.com Cc: jdesfossez@efficios.com Cc: bristot@redhat.com Link: http://lkml.kernel.org/r/20170323150216.303827095@infradead.org Signed-off-by: Thomas Gleixner --- include/linux/sched/rt.h | 24 ++++------ kernel/locking/rtmutex.c | 112 +++++++++++++---------------------------------- kernel/sched/core.c | 66 ++++++++++++++++++++++------ 3 files changed, 91 insertions(+), 111 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index 10ee7eeb0ee2..f93329aba31a 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -18,28 +18,20 @@ static inline int rt_task(struct task_struct *p) } #ifdef CONFIG_RT_MUTEXES -extern int rt_mutex_getprio(struct task_struct *p); -extern void rt_mutex_setprio(struct task_struct *p, int prio); -extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio); -extern void rt_mutex_update_top_task(struct task_struct *p); -extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task); +/* + * Must hold either p->pi_lock or task_rq(p)->lock.
+ */ +static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p) +{ + return p->pi_top_task; +} +extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task); extern void rt_mutex_adjust_pi(struct task_struct *p); static inline bool tsk_is_pi_blocked(struct task_struct *tsk) { return tsk->pi_blocked_on != NULL; } #else -static inline int rt_mutex_getprio(struct task_struct *p) -{ - return p->normal_prio; -} - -static inline int rt_mutex_get_effective_prio(struct task_struct *task, - int newprio) -{ - return newprio; -} - static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) { return NULL; diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 4b1015ef0dc7..00b49cdbb4e0 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -322,67 +322,16 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) RB_CLEAR_NODE(&waiter->pi_tree_entry); } -/* - * Must hold both p->pi_lock and task_rq(p)->lock. - */ -void rt_mutex_update_top_task(struct task_struct *p) -{ - if (!task_has_pi_waiters(p)) { - p->pi_top_task = NULL; - return; - } - - p->pi_top_task = task_top_pi_waiter(p)->task; -} - -/* - * Calculate task priority from the waiter tree priority - * - * Return task->normal_prio when the waiter tree is empty or when - * the waiter is not allowed to do priority boosting - */ -int rt_mutex_getprio(struct task_struct *task) -{ - if (likely(!task_has_pi_waiters(task))) - return task->normal_prio; - - return min(task_top_pi_waiter(task)->prio, - task->normal_prio); -} - -/* - * Must hold either p->pi_lock or task_rq(p)->lock. - */ -struct task_struct *rt_mutex_get_top_task(struct task_struct *task) -{ - return task->pi_top_task; -} - -/* - * Called by sched_setscheduler() to get the priority which will be - * effective after the change. - */ -int rt_mutex_get_effective_prio(struct task_struct *task, int newprio) +static void rt_mutex_adjust_prio(struct task_struct *p) { - struct task_struct *top_task = rt_mutex_get_top_task(task); + struct task_struct *pi_task = NULL; - if (!top_task) - return newprio; + lockdep_assert_held(&p->pi_lock); - return min(top_task->prio, newprio); -} + if (task_has_pi_waiters(p)) + pi_task = task_top_pi_waiter(p)->task; -/* - * Adjust the priority of a task, after its pi_waiters got modified. - * - * This can be both boosting and unboosting. task->pi_lock must be held. - */ -static void __rt_mutex_adjust_prio(struct task_struct *task) -{ - int prio = rt_mutex_getprio(task); - - if (task->prio != prio || dl_prio(prio)) - rt_mutex_setprio(task, prio); + rt_mutex_setprio(p, pi_task); } /* @@ -742,7 +691,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, */ rt_mutex_dequeue_pi(task, prerequeue_top_waiter); rt_mutex_enqueue_pi(task, waiter); - __rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(task); } else if (prerequeue_top_waiter == waiter) { /* @@ -758,7 +707,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, rt_mutex_dequeue_pi(task, waiter); waiter = rt_mutex_top_waiter(lock); rt_mutex_enqueue_pi(task, waiter); - __rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(task); } else { /* * Nothing changed. 
No need to do any priority @@ -966,7 +915,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, return -EDEADLK; raw_spin_lock(&task->pi_lock); - __rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; waiter->prio = task->prio; @@ -988,7 +937,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); - __rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { @@ -1040,13 +989,14 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, waiter = rt_mutex_top_waiter(lock); /* - * Remove it from current->pi_waiters. We do not adjust a - * possible priority boost right now. We execute wakeup in the - * boosted mode and go back to normal after releasing - * lock->wait_lock. + * Remove it from current->pi_waiters and deboost. + * + * We must in fact deboost here in order to ensure we call + * rt_mutex_setprio() to update p->pi_top_task before the + * task unblocks. */ rt_mutex_dequeue_pi(current, waiter); - __rt_mutex_adjust_prio(current); + rt_mutex_adjust_prio(current); /* * As we are waking up the top waiter, and the waiter stays @@ -1058,9 +1008,19 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, */ lock->owner = (void *) RT_MUTEX_HAS_WAITERS; - raw_spin_unlock(¤t->pi_lock); - + /* + * We deboosted before waking the top waiter task such that we don't + * run two tasks with the 'same' priority (and ensure the + * p->pi_top_task pointer points to a blocked task). This however can + * lead to priority inversion if we would get preempted after the + * deboost but before waking our donor task, hence the preempt_disable() + * before unlock. + * + * Pairs with preempt_enable() in rt_mutex_postunlock(); + */ + preempt_disable(); wake_q_add(wake_q, waiter->task); + raw_spin_unlock(¤t->pi_lock); } /* @@ -1095,7 +1055,7 @@ static void remove_waiter(struct rt_mutex *lock, if (rt_mutex_has_waiters(lock)) rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); - __rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(owner); /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); @@ -1134,8 +1094,7 @@ void rt_mutex_adjust_pi(struct task_struct *task) raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; - if (!waiter || (waiter->prio == task->prio && - !dl_prio(task->prio))) { + if (!waiter || (waiter->prio == task->prio && !dl_prio(task->prio))) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } @@ -1389,17 +1348,6 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, * Queue the next waiter for wakeup once we release the wait_lock. */ mark_wakeup_next_waiter(wake_q, lock); - - /* - * We should deboost before waking the top waiter task such that - * we don't run two tasks with the 'same' priority. This however - * can lead to prio-inversion if we would get preempted after - * the deboost but before waking our high-prio task, hence the - * preempt_disable before unlock. 
Pairs with preempt_enable() in - rt_mutex_postunlock(); - */ - preempt_disable(); - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return true; /* call rt_mutex_postunlock() */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e1f44ec701c8..37b152a782ae 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3671,10 +3671,25 @@ EXPORT_SYMBOL(default_wake_function); #ifdef CONFIG_RT_MUTEXES +static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) +{ + if (pi_task) + prio = min(prio, pi_task->prio); + + return prio; +} + +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + struct task_struct *pi_task = rt_mutex_get_top_task(p); + + return __rt_effective_prio(pi_task, prio); +} + /* * rt_mutex_setprio - set the current priority of a task - * @p: task - * @prio: prio value (kernel-internal form) + * @p: task to boost + * @pi_task: donor task * * This function changes the 'effective' priority of a task. It does * not touch ->normal_prio like __setscheduler(). @@ -3682,18 +3697,42 @@ EXPORT_SYMBOL(default_wake_function); * Used by the rt_mutex code to implement priority inheritance * logic. Call site only calls if the priority of the task changed. */ -void rt_mutex_setprio(struct task_struct *p, int prio) +void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { - int oldprio, queued, running, queue_flag = + int prio, oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *prev_class; struct rq_flags rf; struct rq *rq; - BUG_ON(prio > MAX_PRIO); + /* XXX used to be waiter->prio, not waiter->task->prio */ + prio = __rt_effective_prio(pi_task, p->normal_prio); + + /* + * If nothing changed, bail early. + */ + if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio)) + return; rq = __task_rq_lock(p, &rf); update_rq_clock(rq); + /* + * Set under pi_lock && rq->lock, such that the value can be used under + * either lock. + * + * Note that there is loads of trickiness in making this pointer cache + * work right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to + * ensure a task is de-boosted (pi_task is set to NULL) before the + * task is allowed to run again (and can exit). This ensures the pointer + * points to a blocked task -- which guarantees the task is present. + */ + p->pi_top_task = pi_task; + + /* + * For FIFO/RR we only need to set prio; if that matches, we're done. + */ + if (prio == p->prio && !dl_prio(prio)) + goto out_unlock; /* * Idle task boosting is a nono in general. There is one @@ -3713,9 +3752,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) goto out_unlock; } - rt_mutex_update_top_task(p); - - trace_sched_pi_setprio(p, prio); + trace_sched_pi_setprio(p, prio); /* broken */ oldprio = p->prio; if (oldprio == prio) @@ -3739,7 +3776,6 @@ * running task */ if (dl_prio(prio)) { - struct task_struct *pi_task = rt_mutex_get_top_task(p); if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; @@ -3777,6 +3813,11 @@ out_unlock: balance_callback(rq); preempt_enable(); } +#else +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + return prio; +} #endif void set_user_nice(struct task_struct *p, long nice) @@ -4023,10 +4064,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, * Keep a potential priority boosting if called from * sched_setscheduler().
*/ + p->prio = normal_prio(p); if (keep_boost) - p->prio = rt_mutex_get_effective_prio(p, normal_prio(p)); - else - p->prio = normal_prio(p); + p->prio = rt_effective_prio(p, p->prio); if (dl_prio(p->prio)) p->sched_class = &dl_sched_class; @@ -4313,7 +4353,7 @@ change: * the runqueue. This will be done when the task deboost * itself. */ - new_effective_prio = rt_mutex_get_effective_prio(p, newprio); + new_effective_prio = rt_effective_prio(p, newprio); if (new_effective_prio == oldprio) queue_flags &= ~DEQUEUE_MOVE; } -- cgit v1.2.3 From 6d56111c92d247bb64301029fe88365aa4caf16e Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 21 Mar 2017 22:05:22 +0100 Subject: KVM: arm/arm64: vgic: Fix GICC_PMR uaccess on GICv3 and clarify ABI As an oversight, for GICv2, we accidentally export the GICC_PMR register in the format of the GICH_VMCR.VMPriMask field in the lower 5 bits of a word, meaning that userspace must always use the lower 5 bits to communicate with the KVM device and must shift the value left by 3 places to obtain the actual priority mask level. Since GICv3 supports the full 8 bits of priority masking in the ICH_VMCR, we have to fix the value we export when emulating a GICv2 on top of a hardware GICv3 and exporting the emulated GICv2 state to userspace. Take the chance to clarify this aspect of the ABI. Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall --- Documentation/virtual/kvm/devices/arm-vgic.txt | 6 ++++++ include/linux/irqchip/arm-gic.h | 3 +++ virt/kvm/arm/vgic/vgic-mmio-v2.c | 20 ++++++++++++++++++-- virt/kvm/arm/vgic/vgic-v2.c | 8 ++++---- virt/kvm/arm/vgic/vgic.h | 9 ++++++++- 5 files changed, 39 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt index 76e61c883347..b2f60ca8b60c 100644 --- a/Documentation/virtual/kvm/devices/arm-vgic.txt +++ b/Documentation/virtual/kvm/devices/arm-vgic.txt @@ -83,6 +83,12 @@ Groups: Bits for undefined preemption levels are RAZ/WI. + For historical reasons and to provide ABI compatibility with userspace we + export the GICC_PMR register in the format of the GICH_VMCR.VMPriMask + field in the lower 5 bits of a word, meaning that userspace must always + use the lower 5 bits to communicate with the KVM device and must shift the + value left by 3 places to obtain the actual priority mask level. + Limitations: - Priorities are not implemented, and registers are RAZ/WI - Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2. diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index eafc965b3eb8..dc30f3d057eb 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -96,6 +96,9 @@ #define GICH_MISR_EOI (1 << 0) #define GICH_MISR_U (1 << 1) +#define GICV_PMR_PRIORITY_SHIFT 3 +#define GICV_PMR_PRIORITY_MASK (0x1f << GICV_PMR_PRIORITY_SHIFT) + #ifndef __ASSEMBLY__ #include diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c index a3ad7ff95c9b..0a4283ed9aa7 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c @@ -229,7 +229,15 @@ static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu, val = vmcr.ctlr; break; case GIC_CPU_PRIMASK: - val = vmcr.pmr; + /* + * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the + * PMR field as GICH_VMCR.VMPriMask rather than + * GICC_PMR.Priority, so we expose the upper five bits of + * priority mask to userspace using the lower bits in the + * unsigned long.
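+ * + * For example: the architectural priority mask 0xf8 (upper + * five bits set) is read through this ABI as 0x1f, and a + * userspace write of 0x1f becomes VMPriMask 0x1f, i.e. + * GICC_PMR 0xf8 (0x1f << GICV_PMR_PRIORITY_SHIFT).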
+ */ + val = (vmcr.pmr & GICV_PMR_PRIORITY_MASK) >> + GICV_PMR_PRIORITY_SHIFT; break; case GIC_CPU_BINPOINT: val = vmcr.bpr; @@ -262,7 +270,15 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu, vmcr.ctlr = val; break; case GIC_CPU_PRIMASK: - vmcr.pmr = val; + /* + * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the + * PMR field as GICH_VMCR.VMPriMask rather than + * GICC_PMR.Priority, so we expose the upper five bits of + * priority mask to userspace using the lower bits in the + * unsigned long. + */ + vmcr.pmr = (val << GICV_PMR_PRIORITY_SHIFT) & + GICV_PMR_PRIORITY_MASK; break; case GIC_CPU_BINPOINT: vmcr.bpr = val; diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index 94cf4b9b6471..b637d9c7afe3 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -206,8 +206,8 @@ void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) GICH_VMCR_ALIAS_BINPOINT_MASK; vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) & GICH_VMCR_BINPOINT_MASK; - vmcr |= (vmcrp->pmr << GICH_VMCR_PRIMASK_SHIFT) & - GICH_VMCR_PRIMASK_MASK; + vmcr |= ((vmcrp->pmr >> GICV_PMR_PRIORITY_SHIFT) << + GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK; vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = vmcr; } @@ -222,8 +222,8 @@ void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) GICH_VMCR_ALIAS_BINPOINT_SHIFT; vmcrp->bpr = (vmcr & GICH_VMCR_BINPOINT_MASK) >> GICH_VMCR_BINPOINT_SHIFT; - vmcrp->pmr = (vmcr & GICH_VMCR_PRIMASK_MASK) >> - GICH_VMCR_PRIMASK_SHIFT; + vmcrp->pmr = ((vmcr & GICH_VMCR_PRIMASK_MASK) >> + GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT; } void vgic_v2_enable(struct kvm_vcpu *vcpu) diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 91566f5aac9b..6cf557e9f718 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -81,11 +81,18 @@ static inline bool irq_is_pending(struct vgic_irq *irq) return irq->pending_latch || irq->line_level; } +/* + * This struct provides an intermediate representation of the fields contained + * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC + * state to userspace can generate either GICv2 or GICv3 CPU interface + * registers regardless of the hardware-backed GIC used. + */ struct vgic_vmcr { u32 ctlr; u32 abpr; u32 bpr; - u32 pmr; + u32 pmr; /* Priority mask field in the GICC_PMR and + * ICC_PMR_EL1 priority field format */ /* Below member variables are valid only for GICv3 */ u32 grpen0; u32 grpen1; -- cgit v1.2.3 From dabf54dd1c6369160f8d4c793a8613dfb4e7848a Mon Sep 17 00:00:00 2001 From: Yegor Yefremov Date: Fri, 17 Feb 2017 16:52:33 +0100 Subject: can: ti_hecc: Convert TI HECC driver to DT only driver This patch converts the TI HECC driver to a DT-only driver. This results in removing ti_hecc.h, which contained the now-obsolete platform data. The former transceiver_switch callback is now modelled via the regulator API.
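For illustration, a minimal sketch of the replacement pattern (the helper name is made up; this mirrors what the converted ti_hecc_transceiver_switch() does):

	/* reg was obtained with devm_regulator_get(dev, "xceiver") and may
	 * be NULL when the board has no switchable transceiver supply.
	 */
	static int hecc_xceiver_power(struct regulator *reg, bool on)
	{
		if (!reg)
			return 0;
		return on ? regulator_enable(reg) : regulator_disable(reg);
	}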
Signed-off-by: Anton Glukhov Signed-off-by: Yegor Yefremov Signed-off-by: Marc Kleine-Budde --- drivers/net/can/ti_hecc.c | 170 ++++++++++++++++++----------------- include/linux/can/platform/ti_hecc.h | 44 --------- 2 files changed, 88 insertions(+), 126 deletions(-) delete mode 100644 include/linux/can/platform/ti_hecc.h (limited to 'include/linux') diff --git a/drivers/net/can/ti_hecc.c b/drivers/net/can/ti_hecc.c index 6749b1829469..b8aac538275c 100644 --- a/drivers/net/can/ti_hecc.c +++ b/drivers/net/can/ti_hecc.c @@ -17,25 +17,6 @@ * */ -/* - * Your platform definitions should specify module ram offsets and interrupt - * number to use as follows: - * - * static struct ti_hecc_platform_data am3517_evm_hecc_pdata = { - * .scc_hecc_offset = 0, - * .scc_ram_offset = 0x3000, - * .hecc_ram_offset = 0x3000, - * .mbx_offset = 0x2000, - * .int_line = 0, - * .revision = 1, - * .transceiver_switch = hecc_phy_control, - * }; - * - * Please see include/linux/can/platform/ti_hecc.h for description of - * above fields. - * - */ - #include #include #include @@ -46,11 +27,13 @@ #include #include #include +#include +#include +#include #include #include #include -#include #define DRV_NAME "ti_hecc" #define HECC_MODULE_VERSION "0.7" @@ -214,15 +197,14 @@ struct ti_hecc_priv { struct net_device *ndev; struct clk *clk; void __iomem *base; - u32 scc_ram_offset; - u32 hecc_ram_offset; - u32 mbx_offset; - u32 int_line; + void __iomem *hecc_ram; + void __iomem *mbx; + bool use_hecc1int; spinlock_t mbx_lock; /* CANME register needs protection */ u32 tx_head; u32 tx_tail; u32 rx_next; - void (*transceiver_switch)(int); + struct regulator *reg_xceiver; }; static inline int get_tx_head_mb(struct ti_hecc_priv *priv) @@ -242,20 +224,18 @@ static inline int get_tx_head_prio(struct ti_hecc_priv *priv) static inline void hecc_write_lam(struct ti_hecc_priv *priv, u32 mbxno, u32 val) { - __raw_writel(val, priv->base + priv->hecc_ram_offset + mbxno * 4); + __raw_writel(val, priv->hecc_ram + mbxno * 4); } static inline void hecc_write_mbx(struct ti_hecc_priv *priv, u32 mbxno, u32 reg, u32 val) { - __raw_writel(val, priv->base + priv->mbx_offset + mbxno * 0x10 + - reg); + __raw_writel(val, priv->mbx + mbxno * 0x10 + reg); } static inline u32 hecc_read_mbx(struct ti_hecc_priv *priv, u32 mbxno, u32 reg) { - return __raw_readl(priv->base + priv->mbx_offset + mbxno * 0x10 + - reg); + return __raw_readl(priv->mbx + mbxno * 0x10 + reg); } static inline void hecc_write(struct ti_hecc_priv *priv, u32 reg, u32 val) @@ -311,11 +291,16 @@ static int ti_hecc_set_btc(struct ti_hecc_priv *priv) return 0; } -static void ti_hecc_transceiver_switch(const struct ti_hecc_priv *priv, - int on) +static int ti_hecc_transceiver_switch(const struct ti_hecc_priv *priv, + int on) { - if (priv->transceiver_switch) - priv->transceiver_switch(on); + if (!priv->reg_xceiver) + return 0; + + if (on) + return regulator_enable(priv->reg_xceiver); + else + return regulator_disable(priv->reg_xceiver); } static void ti_hecc_reset(struct net_device *ndev) @@ -409,7 +394,7 @@ static void ti_hecc_start(struct net_device *ndev) /* Prevent message over-write & Enable interrupts */ hecc_write(priv, HECC_CANOPC, HECC_SET_REG); - if (priv->int_line) { + if (priv->use_hecc1int) { hecc_write(priv, HECC_CANMIL, HECC_SET_REG); hecc_write(priv, HECC_CANGIM, HECC_CANGIM_DEF_MASK | HECC_CANGIM_I1EN | HECC_CANGIM_SIL); @@ -760,7 +745,7 @@ static irqreturn_t ti_hecc_interrupt(int irq, void *dev_id) unsigned long ack, flags; int_status = hecc_read(priv, - (priv->int_line) ? 
HECC_CANGIF1 : HECC_CANGIF0); + (priv->use_hecc1int) ? HECC_CANGIF1 : HECC_CANGIF0); if (!int_status) return IRQ_NONE; @@ -806,7 +791,7 @@ static irqreturn_t ti_hecc_interrupt(int irq, void *dev_id) } /* clear all interrupt conditions - read back to avoid spurious ints */ - if (priv->int_line) { + if (priv->use_hecc1int) { hecc_write(priv, HECC_CANGIF1, HECC_SET_REG); int_status = hecc_read(priv, HECC_CANGIF1); } else { @@ -872,58 +857,87 @@ static const struct net_device_ops ti_hecc_netdev_ops = { .ndo_change_mtu = can_change_mtu, }; +static const struct of_device_id ti_hecc_dt_ids[] = { + { + .compatible = "ti,am3517-hecc", + }, + { } +}; +MODULE_DEVICE_TABLE(of, ti_hecc_dt_ids); + static int ti_hecc_probe(struct platform_device *pdev) { struct net_device *ndev = (struct net_device *)0; struct ti_hecc_priv *priv; - struct ti_hecc_platform_data *pdata; - struct resource *mem, *irq; - void __iomem *addr; + struct device_node *np = pdev->dev.of_node; + struct resource *res, *irq; + struct regulator *reg_xceiver; int err = -ENODEV; - pdata = dev_get_platdata(&pdev->dev); - if (!pdata) { - dev_err(&pdev->dev, "No platform data\n"); - goto probe_exit; + if (!IS_ENABLED(CONFIG_OF) || !np) + return -EINVAL; + + reg_xceiver = devm_regulator_get(&pdev->dev, "xceiver"); + if (PTR_ERR(reg_xceiver) == -EPROBE_DEFER) + return -EPROBE_DEFER; + else if (IS_ERR(reg_xceiver)) + reg_xceiver = NULL; + + ndev = alloc_candev(sizeof(struct ti_hecc_priv), HECC_MAX_TX_MBOX); + if (!ndev) { + dev_err(&pdev->dev, "alloc_candev failed\n"); + return -ENOMEM; } + priv = netdev_priv(ndev); - mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!mem) { - dev_err(&pdev->dev, "No mem resources\n"); - goto probe_exit; + /* handle hecc memory */ + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "hecc"); + if (!res) { + dev_err(&pdev->dev, "can't get IORESOURCE_MEM hecc\n"); + return -EINVAL; } - irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0); - if (!irq) { - dev_err(&pdev->dev, "No irq resource\n"); - goto probe_exit; + + priv->base = devm_ioremap_resource(&pdev->dev, res); + if (!priv->base) { + dev_err(&pdev->dev, "hecc ioremap failed\n"); + return -ENOMEM; } - if (!request_mem_region(mem->start, resource_size(mem), pdev->name)) { - dev_err(&pdev->dev, "HECC region already claimed\n"); - err = -EBUSY; - goto probe_exit; + + /* handle hecc-ram memory */ + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "hecc-ram"); + if (!res) { + dev_err(&pdev->dev, "can't get IORESOURCE_MEM hecc-ram\n"); + return -EINVAL; } - addr = ioremap(mem->start, resource_size(mem)); - if (!addr) { - dev_err(&pdev->dev, "ioremap failed\n"); - err = -ENOMEM; - goto probe_exit_free_region; + + priv->hecc_ram = devm_ioremap_resource(&pdev->dev, res); + if (!priv->hecc_ram) { + dev_err(&pdev->dev, "hecc-ram ioremap failed\n"); + return -ENOMEM; } - ndev = alloc_candev(sizeof(struct ti_hecc_priv), HECC_MAX_TX_MBOX); - if (!ndev) { - dev_err(&pdev->dev, "alloc_candev failed\n"); - err = -ENOMEM; - goto probe_exit_iounmap; + /* handle mbx memory */ + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "mbx"); + if (!res) { + dev_err(&pdev->dev, "can't get IORESOURCE_MEM mbx\n"); + return -EINVAL; + } + + priv->mbx = devm_ioremap_resource(&pdev->dev, res); + if (!priv->mbx) { + dev_err(&pdev->dev, "mbx ioremap failed\n"); + return -ENOMEM; + } + + irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0); + if (!irq) { + dev_err(&pdev->dev, "No irq resource\n"); + goto probe_exit; } - priv = netdev_priv(ndev); priv->ndev = 
ndev; - priv->base = addr; - priv->scc_ram_offset = pdata->scc_ram_offset; - priv->hecc_ram_offset = pdata->hecc_ram_offset; - priv->mbx_offset = pdata->mbx_offset; - priv->int_line = pdata->int_line; - priv->transceiver_switch = pdata->transceiver_switch; + priv->reg_xceiver = reg_xceiver; + priv->use_hecc1int = of_property_read_bool(np, "ti,use-hecc1int"); priv->can.bittiming_const = &ti_hecc_bittiming_const; priv->can.do_set_mode = ti_hecc_do_set_mode; @@ -971,32 +985,23 @@ probe_exit_clk: clk_put(priv->clk); probe_exit_candev: free_candev(ndev); -probe_exit_iounmap: - iounmap(addr); -probe_exit_free_region: - release_mem_region(mem->start, resource_size(mem)); probe_exit: return err; } static int ti_hecc_remove(struct platform_device *pdev) { - struct resource *res; struct net_device *ndev = platform_get_drvdata(pdev); struct ti_hecc_priv *priv = netdev_priv(ndev); unregister_candev(ndev); clk_disable_unprepare(priv->clk); clk_put(priv->clk); - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - iounmap(priv->base); - release_mem_region(res->start, resource_size(res)); free_candev(ndev); return 0; } - #ifdef CONFIG_PM static int ti_hecc_suspend(struct platform_device *pdev, pm_message_t state) { @@ -1045,6 +1050,7 @@ static int ti_hecc_resume(struct platform_device *pdev) static struct platform_driver ti_hecc_driver = { .driver = { .name = DRV_NAME, + .of_match_table = ti_hecc_dt_ids, }, .probe = ti_hecc_probe, .remove = ti_hecc_remove, diff --git a/include/linux/can/platform/ti_hecc.h b/include/linux/can/platform/ti_hecc.h deleted file mode 100644 index a52f47ca6c8a..000000000000 --- a/include/linux/can/platform/ti_hecc.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef _CAN_PLATFORM_TI_HECC_H -#define _CAN_PLATFORM_TI_HECC_H - -/* - * TI HECC (High End CAN Controller) driver platform header - * - * Copyright (C) 2009 Texas Instruments Incorporated - http://www.ti.com/ - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation version 2. - * - * This program is distributed as is WITHOUT ANY WARRANTY of any - * kind, whether express or implied; without even the implied warranty - * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -/** - * struct hecc_platform_data - HECC Platform Data - * - * @scc_hecc_offset: mostly 0 - should really never change - * @scc_ram_offset: SCC RAM offset - * @hecc_ram_offset: HECC RAM offset - * @mbx_offset: Mailbox RAM offset - * @int_line: Interrupt line to use - 0 or 1 - * @version: version for future use - * @transceiver_switch: platform specific callback fn for transceiver control - * - * Platform data structure to get all platform specific settings. - * this structure also accounts the fact that the IP may have different - * RAM and mailbox offsets for different SOC's - */ -struct ti_hecc_platform_data { - u32 scc_hecc_offset; - u32 scc_ram_offset; - u32 hecc_ram_offset; - u32 mbx_offset; - u32 int_line; - u32 version; - void (*transceiver_switch) (int); -}; -#endif /* !_CAN_PLATFORM_TI_HECC_H */ -- cgit v1.2.3 From 8e8cda6d737d356054c9eeef642aec0e8ae7e6bc Mon Sep 17 00:00:00 2001 From: Mario Kicherer Date: Tue, 21 Feb 2017 12:19:47 +0100 Subject: can: initial support for network namespaces This patch adds initial support for network namespaces. The changes only enable support in the CAN raw, proc and af_can code. 
GW and BCM still have their checks that ensure that they are used only from the main namespace. The patch boils down to moving the global structures, i.e. the global filter list and their /proc stats, into a per-namespace structure and passing around the corresponding "struct net" in a lot of different places. Changes since v1: - rebased on current HEAD (2bfe01e) - fixed overlong line Signed-off-by: Mario Kicherer Signed-off-by: Marc Kleine-Budde --- include/linux/can/core.h | 7 ++- include/net/net_namespace.h | 4 ++ include/net/netns/can.h | 31 ++++++++++ net/can/af_can.c | 122 +++++++++++++++++++++---------------- net/can/af_can.h | 4 +- net/can/bcm.c | 13 ++-- net/can/gw.c | 4 +- net/can/proc.c | 144 +++++++++++++++++++++----------------------- net/can/raw.c | 92 ++++++++++++++++------------ 9 files changed, 242 insertions(+), 179 deletions(-) create mode 100644 include/net/netns/can.h (limited to 'include/linux') diff --git a/include/linux/can/core.h b/include/linux/can/core.h index df08a41d5be5..319a0da827b8 100644 --- a/include/linux/can/core.h +++ b/include/linux/can/core.h @@ -45,12 +45,13 @@ struct can_proto { extern int can_proto_register(const struct can_proto *cp); extern void can_proto_unregister(const struct can_proto *cp); -int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask, +int can_rx_register(struct net *net, struct net_device *dev, + canid_t can_id, canid_t mask, void (*func)(struct sk_buff *, void *), void *data, char *ident, struct sock *sk); -extern void can_rx_unregister(struct net_device *dev, canid_t can_id, - canid_t mask, +extern void can_rx_unregister(struct net *net, struct net_device *dev, + canid_t can_id, canid_t mask, void (*func)(struct sk_buff *, void *), void *data); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index af8fe8a909dc..fe80bb48ab1f 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -140,6 +141,9 @@ struct net { #endif #if IS_ENABLED(CONFIG_MPLS) struct netns_mpls mpls; +#endif +#if IS_ENABLED(CONFIG_CAN) + struct netns_can can; #endif struct sock *diag_nlsk; atomic_t fnhe_genid; diff --git a/include/net/netns/can.h b/include/net/netns/can.h new file mode 100644 index 000000000000..e8beba772f1a --- /dev/null +++ b/include/net/netns/can.h @@ -0,0 +1,31 @@ +/* + * can in net namespaces + */ + +#ifndef __NETNS_CAN_H__ +#define __NETNS_CAN_H__ + +#include + +struct dev_rcv_lists; + +struct netns_can { +#if IS_ENABLED(CONFIG_PROC_FS) + struct proc_dir_entry *proc_dir; + struct proc_dir_entry *pde_version; + struct proc_dir_entry *pde_stats; + struct proc_dir_entry *pde_reset_stats; + struct proc_dir_entry *pde_rcvlist_all; + struct proc_dir_entry *pde_rcvlist_fil; + struct proc_dir_entry *pde_rcvlist_inv; + struct proc_dir_entry *pde_rcvlist_sff; + struct proc_dir_entry *pde_rcvlist_eff; + struct proc_dir_entry *pde_rcvlist_err; +#endif + + /* receive filters subscribed for 'all' CAN devices */ + struct dev_rcv_lists *can_rx_alldev_list; + spinlock_t can_rcvlists_lock; +}; + +#endif /* __NETNS_CAN_H__ */ diff --git a/net/can/af_can.c b/net/can/af_can.c index 5488e4a6ccd0..abf7d854a94d 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -75,9 +75,7 @@ static int stats_timer __read_mostly = 1; module_param(stats_timer, int, S_IRUGO); MODULE_PARM_DESC(stats_timer, "enable timer for statistics (default:on)"); -/* receive filters subscribed for 'all' CAN devices */ -struct dev_rcv_lists 
can_rx_alldev_list; -static DEFINE_SPINLOCK(can_rcvlists_lock); +static int can_net_id; static struct kmem_cache *rcv_cache __read_mostly; @@ -145,9 +143,6 @@ static int can_create(struct net *net, struct socket *sock, int protocol, if (protocol < 0 || protocol >= CAN_NPROTO) return -EINVAL; - if (!net_eq(net, &init_net)) - return -EAFNOSUPPORT; - cp = can_get_proto(protocol); #ifdef CONFIG_MODULES @@ -331,10 +326,11 @@ EXPORT_SYMBOL(can_send); * af_can rx path */ -static struct dev_rcv_lists *find_dev_rcv_lists(struct net_device *dev) +static struct dev_rcv_lists *find_dev_rcv_lists(struct net *net, + struct net_device *dev) { if (!dev) - return &can_rx_alldev_list; + return net->can.can_rx_alldev_list; else return (struct dev_rcv_lists *)dev->ml_priv; } @@ -467,9 +463,9 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask, * -ENOMEM on missing cache mem to create subscription entry * -ENODEV unknown device */ -int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask, - void (*func)(struct sk_buff *, void *), void *data, - char *ident, struct sock *sk) +int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id, + canid_t mask, void (*func)(struct sk_buff *, void *), + void *data, char *ident, struct sock *sk) { struct receiver *r; struct hlist_head *rl; @@ -481,13 +477,16 @@ int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask, if (dev && dev->type != ARPHRD_CAN) return -ENODEV; + if (dev && !net_eq(net, dev_net(dev))) + return -ENODEV; + r = kmem_cache_alloc(rcv_cache, GFP_KERNEL); if (!r) return -ENOMEM; - spin_lock(&can_rcvlists_lock); + spin_lock(&net->can.can_rcvlists_lock); - d = find_dev_rcv_lists(dev); + d = find_dev_rcv_lists(net, dev); if (d) { rl = find_rcv_list(&can_id, &mask, d); @@ -510,7 +509,7 @@ int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask, err = -ENODEV; } - spin_unlock(&can_rcvlists_lock); + spin_unlock(&net->can.can_rcvlists_lock); return err; } @@ -540,8 +539,9 @@ static void can_rx_delete_receiver(struct rcu_head *rp) * Description: * Removes subscription entry depending on given (subscription) values. 
*/ -void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask, - void (*func)(struct sk_buff *, void *), void *data) +void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id, + canid_t mask, void (*func)(struct sk_buff *, void *), + void *data) { struct receiver *r = NULL; struct hlist_head *rl; @@ -550,9 +550,12 @@ void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask, if (dev && dev->type != ARPHRD_CAN) return; - spin_lock(&can_rcvlists_lock); + if (dev && !net_eq(net, dev_net(dev))) + return; - d = find_dev_rcv_lists(dev); + spin_lock(&net->can.can_rcvlists_lock); + + d = find_dev_rcv_lists(net, dev); if (!d) { pr_err("BUG: receive list not found for " "dev %s, id %03X, mask %03X\n", @@ -598,7 +601,7 @@ void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask, } out: - spin_unlock(&can_rcvlists_lock); + spin_unlock(&net->can.can_rcvlists_lock); /* schedule the receiver item for deletion */ if (r) { @@ -696,10 +699,10 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev) rcu_read_lock(); /* deliver the packet to sockets listening on all devices */ - matches = can_rcv_filter(&can_rx_alldev_list, skb); + matches = can_rcv_filter(dev_net(dev)->can.can_rx_alldev_list, skb); /* find receive list for this device */ - d = find_dev_rcv_lists(dev); + d = find_dev_rcv_lists(dev_net(dev), dev); if (d) matches += can_rcv_filter(d, skb); @@ -719,9 +722,6 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev, { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; - if (unlikely(!net_eq(dev_net(dev), &init_net))) - goto drop; - if (WARN_ONCE(dev->type != ARPHRD_CAN || skb->len != CAN_MTU || cfd->len > CAN_MAX_DLEN, @@ -743,9 +743,6 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; - if (unlikely(!net_eq(dev_net(dev), &init_net))) - goto drop; - if (WARN_ONCE(dev->type != ARPHRD_CAN || skb->len != CANFD_MTU || cfd->len > CANFD_MAX_DLEN, @@ -835,9 +832,6 @@ static int can_notifier(struct notifier_block *nb, unsigned long msg, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct dev_rcv_lists *d; - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; @@ -855,7 +849,7 @@ static int can_notifier(struct notifier_block *nb, unsigned long msg, break; case NETDEV_UNREGISTER: - spin_lock(&can_rcvlists_lock); + spin_lock(&dev_net(dev)->can.can_rcvlists_lock); d = dev->ml_priv; if (d) { @@ -869,7 +863,7 @@ static int can_notifier(struct notifier_block *nb, unsigned long msg, pr_err("can: notifier: receive list not found for dev " "%s\n", dev->name); - spin_unlock(&can_rcvlists_lock); + spin_unlock(&dev_net(dev)->can.can_rcvlists_lock); break; } @@ -877,6 +871,40 @@ static int can_notifier(struct notifier_block *nb, unsigned long msg, return NOTIFY_DONE; } +static int can_pernet_init(struct net *net) +{ + net->can.can_rcvlists_lock = + __SPIN_LOCK_UNLOCKED(net->can.can_rcvlists_lock); + net->can.can_rx_alldev_list = + kzalloc(sizeof(struct dev_rcv_lists), GFP_KERNEL); + + if (IS_ENABLED(CONFIG_PROC_FS)) + can_init_proc(net); + + return 0; +} + +static void can_pernet_exit(struct net *net) +{ + struct net_device *dev; + + if (IS_ENABLED(CONFIG_PROC_FS)) + can_remove_proc(net); + + /* remove created dev_rcv_lists from still registered CAN devices */ + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + if (dev->type == ARPHRD_CAN && dev->ml_priv) { + 
struct dev_rcv_lists *d = dev->ml_priv; + + BUG_ON(d->entries); + kfree(d); + dev->ml_priv = NULL; + } + } + rcu_read_unlock(); +} + /* * af_can module init/exit functions */ @@ -902,6 +930,13 @@ static struct notifier_block can_netdev_notifier __read_mostly = { .notifier_call = can_notifier, }; +static struct pernet_operations can_pernet_ops __read_mostly = { + .init = can_pernet_init, + .exit = can_pernet_exit, + .id = &can_net_id, + .size = 0, +}; + static __init int can_init(void) { /* check for correct padding to be able to use the structs similarly */ @@ -912,8 +947,6 @@ static __init int can_init(void) pr_info("can: controller area network core (" CAN_VERSION_STRING ")\n"); - memset(&can_rx_alldev_list, 0, sizeof(can_rx_alldev_list)); - rcv_cache = kmem_cache_create("can_receiver", sizeof(struct receiver), 0, 0, NULL); if (!rcv_cache) @@ -925,9 +958,10 @@ static __init int can_init(void) setup_timer(&can_stattimer, can_stat_update, 0); mod_timer(&can_stattimer, round_jiffies(jiffies + HZ)); } - can_init_proc(); } + register_pernet_subsys(&can_pernet_ops); + /* protocol register */ sock_register(&can_family_ops); register_netdevice_notifier(&can_netdev_notifier); @@ -939,13 +973,9 @@ static __init int can_init(void) static __exit void can_exit(void) { - struct net_device *dev; - if (IS_ENABLED(CONFIG_PROC_FS)) { if (stats_timer) del_timer_sync(&can_stattimer); - - can_remove_proc(); } /* protocol unregister */ @@ -954,19 +984,7 @@ static __exit void can_exit(void) unregister_netdevice_notifier(&can_netdev_notifier); sock_unregister(PF_CAN); - /* remove created dev_rcv_lists from still registered CAN devices */ - rcu_read_lock(); - for_each_netdev_rcu(&init_net, dev) { - if (dev->type == ARPHRD_CAN && dev->ml_priv) { - - struct dev_rcv_lists *d = dev->ml_priv; - - BUG_ON(d->entries); - kfree(d); - dev->ml_priv = NULL; - } - } - rcu_read_unlock(); + unregister_pernet_subsys(&can_pernet_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ diff --git a/net/can/af_can.h b/net/can/af_can.h index b86f5129e838..f273c9d9b129 100644 --- a/net/can/af_can.h +++ b/net/can/af_can.h @@ -114,8 +114,8 @@ struct s_pstats { extern struct dev_rcv_lists can_rx_alldev_list; /* function prototypes for the CAN networklayer procfs (proc.c) */ -void can_init_proc(void); -void can_remove_proc(void); +void can_init_proc(struct net *net); +void can_remove_proc(struct net *net); void can_stat_update(unsigned long data); /* structures and variables from af_can.c needed in proc.c for reading */ diff --git a/net/can/bcm.c b/net/can/bcm.c index 95d13b233c65..1976629a8463 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -764,8 +764,8 @@ static void bcm_remove_op(struct bcm_op *op) static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op) { if (op->rx_reg_dev == dev) { - can_rx_unregister(dev, op->can_id, REGMASK(op->can_id), - bcm_rx_handler, op); + can_rx_unregister(&init_net, dev, op->can_id, + REGMASK(op->can_id), bcm_rx_handler, op); /* mark as removed subscription */ op->rx_reg_dev = NULL; @@ -808,7 +808,7 @@ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh, } } } else - can_rx_unregister(NULL, op->can_id, + can_rx_unregister(&init_net, NULL, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op); @@ -1222,7 +1222,8 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, dev = dev_get_by_index(&init_net, ifindex); if (dev) { - err = can_rx_register(dev, op->can_id, + err = can_rx_register(&init_net, dev, + op->can_id, REGMASK(op->can_id), 
bcm_rx_handler, op, "bcm", sk); @@ -1232,7 +1233,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, } } else - err = can_rx_register(NULL, op->can_id, + err = can_rx_register(&init_net, NULL, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op, "bcm", sk); if (err) { @@ -1528,7 +1529,7 @@ static int bcm_release(struct socket *sock) } } } else - can_rx_unregister(NULL, op->can_id, + can_rx_unregister(&init_net, NULL, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op); diff --git a/net/can/gw.c b/net/can/gw.c index 7056a1a2bb70..3c117a33e15f 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -440,14 +440,14 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) static inline int cgw_register_filter(struct cgw_job *gwj) { - return can_rx_register(gwj->src.dev, gwj->ccgw.filter.can_id, + return can_rx_register(&init_net, gwj->src.dev, gwj->ccgw.filter.can_id, gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj, "gw", NULL); } static inline void cgw_unregister_filter(struct cgw_job *gwj) { - can_rx_unregister(gwj->src.dev, gwj->ccgw.filter.can_id, + can_rx_unregister(&init_net, gwj->src.dev, gwj->ccgw.filter.can_id, gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj); } diff --git a/net/can/proc.c b/net/can/proc.c index 85ef7bb0f176..9a8d54d57b22 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -62,17 +62,6 @@ #define CAN_PROC_RCVLIST_EFF "rcvlist_eff" #define CAN_PROC_RCVLIST_ERR "rcvlist_err" -static struct proc_dir_entry *can_dir; -static struct proc_dir_entry *pde_version; -static struct proc_dir_entry *pde_stats; -static struct proc_dir_entry *pde_reset_stats; -static struct proc_dir_entry *pde_rcvlist_all; -static struct proc_dir_entry *pde_rcvlist_fil; -static struct proc_dir_entry *pde_rcvlist_inv; -static struct proc_dir_entry *pde_rcvlist_sff; -static struct proc_dir_entry *pde_rcvlist_eff; -static struct proc_dir_entry *pde_rcvlist_err; - static int user_reset; static const char rx_list_name[][8] = { @@ -351,20 +340,21 @@ static inline void can_rcvlist_proc_show_one(struct seq_file *m, int idx, static int can_rcvlist_proc_show(struct seq_file *m, void *v) { /* double cast to prevent GCC warning */ - int idx = (int)(long)m->private; + int idx = (int)(long)PDE_DATA(m->file->f_inode); struct net_device *dev; struct dev_rcv_lists *d; + struct net *net = m->private; seq_printf(m, "\nreceive list '%s':\n", rx_list_name[idx]); rcu_read_lock(); /* receive list for 'all' CAN devices (dev == NULL) */ - d = &can_rx_alldev_list; + d = net->can.can_rx_alldev_list; can_rcvlist_proc_show_one(m, idx, NULL, d); /* receive list for registered CAN devices */ - for_each_netdev_rcu(&init_net, dev) { + for_each_netdev_rcu(net, dev) { if (dev->type == ARPHRD_CAN && dev->ml_priv) can_rcvlist_proc_show_one(m, idx, dev, dev->ml_priv); } @@ -377,7 +367,7 @@ static int can_rcvlist_proc_show(struct seq_file *m, void *v) static int can_rcvlist_proc_open(struct inode *inode, struct file *file) { - return single_open(file, can_rcvlist_proc_show, PDE_DATA(inode)); + return single_open_net(inode, file, can_rcvlist_proc_show); } static const struct file_operations can_rcvlist_proc_fops = { @@ -417,6 +407,7 @@ static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v) { struct net_device *dev; struct dev_rcv_lists *d; + struct net *net = m->private; /* RX_SFF */ seq_puts(m, "\nreceive list 'rx_sff':\n"); @@ -424,11 +415,11 @@ static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v) rcu_read_lock(); /* sff receive list for 'all' CAN devices (dev == NULL) */ - d = 
&can_rx_alldev_list; + d = net->can.can_rx_alldev_list; can_rcvlist_proc_show_array(m, NULL, d->rx_sff, ARRAY_SIZE(d->rx_sff)); /* sff receive list for registered CAN devices */ - for_each_netdev_rcu(&init_net, dev) { + for_each_netdev_rcu(net, dev) { if (dev->type == ARPHRD_CAN && dev->ml_priv) { d = dev->ml_priv; can_rcvlist_proc_show_array(m, dev, d->rx_sff, @@ -444,7 +435,7 @@ static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v) static int can_rcvlist_sff_proc_open(struct inode *inode, struct file *file) { - return single_open(file, can_rcvlist_sff_proc_show, NULL); + return single_open_net(inode, file, can_rcvlist_sff_proc_show); } static const struct file_operations can_rcvlist_sff_proc_fops = { @@ -460,6 +451,7 @@ static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v) { struct net_device *dev; struct dev_rcv_lists *d; + struct net *net = m->private; /* RX_EFF */ seq_puts(m, "\nreceive list 'rx_eff':\n"); @@ -467,11 +459,11 @@ static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v) rcu_read_lock(); /* eff receive list for 'all' CAN devices (dev == NULL) */ - d = &can_rx_alldev_list; + d = net->can.can_rx_alldev_list; can_rcvlist_proc_show_array(m, NULL, d->rx_eff, ARRAY_SIZE(d->rx_eff)); /* eff receive list for registered CAN devices */ - for_each_netdev_rcu(&init_net, dev) { + for_each_netdev_rcu(net, dev) { if (dev->type == ARPHRD_CAN && dev->ml_priv) { d = dev->ml_priv; can_rcvlist_proc_show_array(m, dev, d->rx_eff, @@ -487,7 +479,7 @@ static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v) static int can_rcvlist_eff_proc_open(struct inode *inode, struct file *file) { - return single_open(file, can_rcvlist_eff_proc_show, NULL); + return single_open_net(inode, file, can_rcvlist_eff_proc_show); } static const struct file_operations can_rcvlist_eff_proc_fops = { @@ -498,82 +490,86 @@ static const struct file_operations can_rcvlist_eff_proc_fops = { .release = single_release, }; -/* - * proc utility functions - */ - -static void can_remove_proc_readentry(const char *name) -{ - if (can_dir) - remove_proc_entry(name, can_dir); -} - /* * can_init_proc - create main CAN proc directory and procfs entries */ -void can_init_proc(void) +void can_init_proc(struct net *net) { /* create /proc/net/can directory */ - can_dir = proc_mkdir("can", init_net.proc_net); + net->can.proc_dir = proc_net_mkdir(net, "can", net->proc_net); - if (!can_dir) { - pr_info("can: failed to create /proc/net/can.\n"); + if (!net->can.proc_dir) { + printk(KERN_INFO "can: failed to create /proc/net/can . 
" + "CONFIG_PROC_FS missing?\n"); return; } /* own procfs entries from the AF_CAN core */ - pde_version = proc_create(CAN_PROC_VERSION, 0644, can_dir, - &can_version_proc_fops); - pde_stats = proc_create(CAN_PROC_STATS, 0644, can_dir, - &can_stats_proc_fops); - pde_reset_stats = proc_create(CAN_PROC_RESET_STATS, 0644, can_dir, - &can_reset_stats_proc_fops); - pde_rcvlist_err = proc_create_data(CAN_PROC_RCVLIST_ERR, 0644, can_dir, - &can_rcvlist_proc_fops, (void *)RX_ERR); - pde_rcvlist_all = proc_create_data(CAN_PROC_RCVLIST_ALL, 0644, can_dir, - &can_rcvlist_proc_fops, (void *)RX_ALL); - pde_rcvlist_fil = proc_create_data(CAN_PROC_RCVLIST_FIL, 0644, can_dir, - &can_rcvlist_proc_fops, (void *)RX_FIL); - pde_rcvlist_inv = proc_create_data(CAN_PROC_RCVLIST_INV, 0644, can_dir, - &can_rcvlist_proc_fops, (void *)RX_INV); - pde_rcvlist_eff = proc_create(CAN_PROC_RCVLIST_EFF, 0644, can_dir, - &can_rcvlist_eff_proc_fops); - pde_rcvlist_sff = proc_create(CAN_PROC_RCVLIST_SFF, 0644, can_dir, - &can_rcvlist_sff_proc_fops); + net->can.pde_version = proc_create(CAN_PROC_VERSION, 0644, + net->can.proc_dir, + &can_version_proc_fops); + net->can.pde_stats = proc_create(CAN_PROC_STATS, 0644, + net->can.proc_dir, + &can_stats_proc_fops); + net->can.pde_reset_stats = proc_create(CAN_PROC_RESET_STATS, 0644, + net->can.proc_dir, + &can_reset_stats_proc_fops); + net->can.pde_rcvlist_err = proc_create_data(CAN_PROC_RCVLIST_ERR, 0644, + net->can.proc_dir, + &can_rcvlist_proc_fops, + (void *)RX_ERR); + net->can.pde_rcvlist_all = proc_create_data(CAN_PROC_RCVLIST_ALL, 0644, + net->can.proc_dir, + &can_rcvlist_proc_fops, + (void *)RX_ALL); + net->can.pde_rcvlist_fil = proc_create_data(CAN_PROC_RCVLIST_FIL, 0644, + net->can.proc_dir, + &can_rcvlist_proc_fops, + (void *)RX_FIL); + net->can.pde_rcvlist_inv = proc_create_data(CAN_PROC_RCVLIST_INV, 0644, + net->can.proc_dir, + &can_rcvlist_proc_fops, + (void *)RX_INV); + net->can.pde_rcvlist_eff = proc_create(CAN_PROC_RCVLIST_EFF, 0644, + net->can.proc_dir, + &can_rcvlist_eff_proc_fops); + net->can.pde_rcvlist_sff = proc_create(CAN_PROC_RCVLIST_SFF, 0644, + net->can.proc_dir, + &can_rcvlist_sff_proc_fops); } /* * can_remove_proc - remove procfs entries and main CAN proc directory */ -void can_remove_proc(void) +void can_remove_proc(struct net *net) { - if (pde_version) - can_remove_proc_readentry(CAN_PROC_VERSION); + if (net->can.pde_version) + remove_proc_entry(CAN_PROC_VERSION, net->can.proc_dir); - if (pde_stats) - can_remove_proc_readentry(CAN_PROC_STATS); + if (net->can.pde_stats) + remove_proc_entry(CAN_PROC_STATS, net->can.proc_dir); - if (pde_reset_stats) - can_remove_proc_readentry(CAN_PROC_RESET_STATS); + if (net->can.pde_reset_stats) + remove_proc_entry(CAN_PROC_RESET_STATS, net->can.proc_dir); - if (pde_rcvlist_err) - can_remove_proc_readentry(CAN_PROC_RCVLIST_ERR); + if (net->can.pde_rcvlist_err) + remove_proc_entry(CAN_PROC_RCVLIST_ERR, net->can.proc_dir); - if (pde_rcvlist_all) - can_remove_proc_readentry(CAN_PROC_RCVLIST_ALL); + if (net->can.pde_rcvlist_all) + remove_proc_entry(CAN_PROC_RCVLIST_ALL, net->can.proc_dir); - if (pde_rcvlist_fil) - can_remove_proc_readentry(CAN_PROC_RCVLIST_FIL); + if (net->can.pde_rcvlist_fil) + remove_proc_entry(CAN_PROC_RCVLIST_FIL, net->can.proc_dir); - if (pde_rcvlist_inv) - can_remove_proc_readentry(CAN_PROC_RCVLIST_INV); + if (net->can.pde_rcvlist_inv) + remove_proc_entry(CAN_PROC_RCVLIST_INV, net->can.proc_dir); - if (pde_rcvlist_eff) - can_remove_proc_readentry(CAN_PROC_RCVLIST_EFF); + if (net->can.pde_rcvlist_eff) + 
remove_proc_entry(CAN_PROC_RCVLIST_EFF, net->can.proc_dir); - if (pde_rcvlist_sff) - can_remove_proc_readentry(CAN_PROC_RCVLIST_SFF); + if (net->can.pde_rcvlist_sff) + remove_proc_entry(CAN_PROC_RCVLIST_SFF, net->can.proc_dir); - if (can_dir) - remove_proc_entry("can", init_net.proc_net); + if (net->can.proc_dir) + remove_proc_entry("can", net->proc_net); } diff --git a/net/can/raw.c b/net/can/raw.c index 6dc546a06673..864c80dbdb72 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -181,20 +181,21 @@ static void raw_rcv(struct sk_buff *oskb, void *data) kfree_skb(skb); } -static int raw_enable_filters(struct net_device *dev, struct sock *sk, - struct can_filter *filter, int count) +static int raw_enable_filters(struct net *net, struct net_device *dev, + struct sock *sk, struct can_filter *filter, + int count) { int err = 0; int i; for (i = 0; i < count; i++) { - err = can_rx_register(dev, filter[i].can_id, + err = can_rx_register(net, dev, filter[i].can_id, filter[i].can_mask, raw_rcv, sk, "raw", sk); if (err) { /* clean up successfully registered filters */ while (--i >= 0) - can_rx_unregister(dev, filter[i].can_id, + can_rx_unregister(net, dev, filter[i].can_id, filter[i].can_mask, raw_rcv, sk); break; @@ -204,57 +205,62 @@ static int raw_enable_filters(struct net_device *dev, struct sock *sk, return err; } -static int raw_enable_errfilter(struct net_device *dev, struct sock *sk, - can_err_mask_t err_mask) +static int raw_enable_errfilter(struct net *net, struct net_device *dev, + struct sock *sk, can_err_mask_t err_mask) { int err = 0; if (err_mask) - err = can_rx_register(dev, 0, err_mask | CAN_ERR_FLAG, + err = can_rx_register(net, dev, 0, err_mask | CAN_ERR_FLAG, raw_rcv, sk, "raw", sk); return err; } -static void raw_disable_filters(struct net_device *dev, struct sock *sk, - struct can_filter *filter, int count) +static void raw_disable_filters(struct net *net, struct net_device *dev, + struct sock *sk, struct can_filter *filter, + int count) { int i; for (i = 0; i < count; i++) - can_rx_unregister(dev, filter[i].can_id, filter[i].can_mask, - raw_rcv, sk); + can_rx_unregister(net, dev, filter[i].can_id, + filter[i].can_mask, raw_rcv, sk); } -static inline void raw_disable_errfilter(struct net_device *dev, +static inline void raw_disable_errfilter(struct net *net, + struct net_device *dev, struct sock *sk, can_err_mask_t err_mask) { if (err_mask) - can_rx_unregister(dev, 0, err_mask | CAN_ERR_FLAG, + can_rx_unregister(net, dev, 0, err_mask | CAN_ERR_FLAG, raw_rcv, sk); } -static inline void raw_disable_allfilters(struct net_device *dev, +static inline void raw_disable_allfilters(struct net *net, + struct net_device *dev, struct sock *sk) { struct raw_sock *ro = raw_sk(sk); - raw_disable_filters(dev, sk, ro->filter, ro->count); - raw_disable_errfilter(dev, sk, ro->err_mask); + raw_disable_filters(net, dev, sk, ro->filter, ro->count); + raw_disable_errfilter(net, dev, sk, ro->err_mask); } -static int raw_enable_allfilters(struct net_device *dev, struct sock *sk) +static int raw_enable_allfilters(struct net *net, struct net_device *dev, + struct sock *sk) { struct raw_sock *ro = raw_sk(sk); int err; - err = raw_enable_filters(dev, sk, ro->filter, ro->count); + err = raw_enable_filters(net, dev, sk, ro->filter, ro->count); if (!err) { - err = raw_enable_errfilter(dev, sk, ro->err_mask); + err = raw_enable_errfilter(net, dev, sk, ro->err_mask); if (err) - raw_disable_filters(dev, sk, ro->filter, ro->count); + raw_disable_filters(net, dev, sk, ro->filter, + ro->count); } return err; @@ 
-267,7 +273,7 @@ static int raw_notifier(struct notifier_block *nb, struct raw_sock *ro = container_of(nb, struct raw_sock, notifier); struct sock *sk = &ro->sk; - if (!net_eq(dev_net(dev), &init_net)) + if (!net_eq(dev_net(dev), sock_net(sk))) return NOTIFY_DONE; if (dev->type != ARPHRD_CAN) @@ -282,7 +288,7 @@ static int raw_notifier(struct notifier_block *nb, lock_sock(sk); /* remove current filters & unregister */ if (ro->bound) - raw_disable_allfilters(dev, sk); + raw_disable_allfilters(dev_net(dev), dev, sk); if (ro->count > 1) kfree(ro->filter); @@ -358,13 +364,13 @@ static int raw_release(struct socket *sock) if (ro->ifindex) { struct net_device *dev; - dev = dev_get_by_index(&init_net, ro->ifindex); + dev = dev_get_by_index(sock_net(sk), ro->ifindex); if (dev) { - raw_disable_allfilters(dev, sk); + raw_disable_allfilters(dev_net(dev), dev, sk); dev_put(dev); } } else - raw_disable_allfilters(NULL, sk); + raw_disable_allfilters(sock_net(sk), NULL, sk); } if (ro->count > 1) @@ -404,7 +410,7 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) if (addr->can_ifindex) { struct net_device *dev; - dev = dev_get_by_index(&init_net, addr->can_ifindex); + dev = dev_get_by_index(sock_net(sk), addr->can_ifindex); if (!dev) { err = -ENODEV; goto out; @@ -420,13 +426,13 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) ifindex = dev->ifindex; /* filters set by default/setsockopt */ - err = raw_enable_allfilters(dev, sk); + err = raw_enable_allfilters(sock_net(sk), dev, sk); dev_put(dev); } else { ifindex = 0; /* filters set by default/setsockopt */ - err = raw_enable_allfilters(NULL, sk); + err = raw_enable_allfilters(sock_net(sk), NULL, sk); } if (!err) { @@ -435,13 +441,15 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) if (ro->ifindex) { struct net_device *dev; - dev = dev_get_by_index(&init_net, ro->ifindex); + dev = dev_get_by_index(sock_net(sk), + ro->ifindex); if (dev) { - raw_disable_allfilters(dev, sk); + raw_disable_allfilters(dev_net(dev), + dev, sk); dev_put(dev); } } else - raw_disable_allfilters(NULL, sk); + raw_disable_allfilters(sock_net(sk), NULL, sk); } ro->ifindex = ifindex; ro->bound = 1; @@ -517,15 +525,16 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, lock_sock(sk); if (ro->bound && ro->ifindex) - dev = dev_get_by_index(&init_net, ro->ifindex); + dev = dev_get_by_index(sock_net(sk), ro->ifindex); if (ro->bound) { /* (try to) register the new filters */ if (count == 1) - err = raw_enable_filters(dev, sk, &sfilter, 1); + err = raw_enable_filters(sock_net(sk), dev, sk, + &sfilter, 1); else - err = raw_enable_filters(dev, sk, filter, - count); + err = raw_enable_filters(sock_net(sk), dev, sk, + filter, count); if (err) { if (count > 1) kfree(filter); @@ -533,7 +542,8 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, } /* remove old filter registrations */ - raw_disable_filters(dev, sk, ro->filter, ro->count); + raw_disable_filters(sock_net(sk), dev, sk, ro->filter, + ro->count); } /* remove old filter space */ @@ -569,18 +579,20 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, lock_sock(sk); if (ro->bound && ro->ifindex) - dev = dev_get_by_index(&init_net, ro->ifindex); + dev = dev_get_by_index(sock_net(sk), ro->ifindex); /* remove current error mask */ if (ro->bound) { /* (try to) register the new err_mask */ - err = raw_enable_errfilter(dev, sk, err_mask); + err = raw_enable_errfilter(sock_net(sk), dev, sk, + err_mask); 
if (err) goto out_err; /* remove old err_mask registration */ - raw_disable_errfilter(dev, sk, ro->err_mask); + raw_disable_errfilter(sock_net(sk), dev, sk, + ro->err_mask); } /* link new err_mask to the socket */ @@ -741,7 +753,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) return -EINVAL; } - dev = dev_get_by_index(&init_net, ifindex); + dev = dev_get_by_index(sock_net(sk), ifindex); if (!dev) return -ENXIO; -- cgit v1.2.3 From 62e24c5775ecb387a3eb33701378ccfa6dbc98ee Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Fri, 5 Feb 2016 13:41:39 +0100 Subject: reset: add exported __reset_control_get, return NULL if optional Rename the internal __reset_control_get/put functions to __reset_control_get/put_internal and add an exported __reset_control_get equivalent to __of_reset_control_get that takes a struct device parameter. This avoids the confusing call to __of_reset_control_get in the non-DT case and fixes the devm_reset_control_get_optional function to return NULL if RESET_CONTROLLER is enabled but dev->of_node == NULL. Fixes: bb475230b8e5 ("reset: make optional functions really optional") Reported-by: Andy Shevchenko Tested-by: Andy Shevchenko Cc: Ramiro Oliveira Signed-off-by: Philipp Zabel --- drivers/reset/core.c | 22 ++++++++++++++++------ include/linux/reset.h | 22 ++++++++++++++-------- 2 files changed, 30 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/reset/core.c b/drivers/reset/core.c index f1e5e65388bb..cd739d2fa160 100644 --- a/drivers/reset/core.c +++ b/drivers/reset/core.c @@ -275,7 +275,7 @@ int reset_control_status(struct reset_control *rstc) } EXPORT_SYMBOL_GPL(reset_control_status); -static struct reset_control *__reset_control_get( +static struct reset_control *__reset_control_get_internal( struct reset_controller_dev *rcdev, unsigned int index, bool shared) { @@ -308,7 +308,7 @@ static struct reset_control *__reset_control_get( return rstc; } -static void __reset_control_put(struct reset_control *rstc) +static void __reset_control_put_internal(struct reset_control *rstc) { lockdep_assert_held(&reset_list_mutex); @@ -377,7 +377,7 @@ struct reset_control *__of_reset_control_get(struct device_node *node, } /* reset_list_mutex also protects the rcdev's reset_control list */ - rstc = __reset_control_get(rcdev, rstc_id, shared); + rstc = __reset_control_get_internal(rcdev, rstc_id, shared); mutex_unlock(&reset_list_mutex); @@ -385,6 +385,17 @@ struct reset_control *__of_reset_control_get(struct device_node *node, } EXPORT_SYMBOL_GPL(__of_reset_control_get); +struct reset_control *__reset_control_get(struct device *dev, const char *id, + int index, bool shared, bool optional) +{ + if (dev->of_node) + return __of_reset_control_get(dev->of_node, id, index, shared, + optional); + + return optional ? NULL : ERR_PTR(-EINVAL); +} +EXPORT_SYMBOL_GPL(__reset_control_get); + /** * reset_control_put - free the reset controller * @rstc: reset controller @@ -396,7 +407,7 @@ void reset_control_put(struct reset_control *rstc) return; mutex_lock(&reset_list_mutex); - __reset_control_put(rstc); + __reset_control_put_internal(rstc); mutex_unlock(&reset_list_mutex); } EXPORT_SYMBOL_GPL(reset_control_put); @@ -417,8 +428,7 @@ struct reset_control *__devm_reset_control_get(struct device *dev, if (!ptr) return ERR_PTR(-ENOMEM); - rstc = __of_reset_control_get(dev ? 
dev->of_node : NULL, - id, index, shared, optional); + rstc = __reset_control_get(dev, id, index, shared, optional); if (!IS_ERR(rstc)) { *ptr = rstc; devres_add(dev, ptr); diff --git a/include/linux/reset.h b/include/linux/reset.h index 96fb139bdd08..13d8681210d5 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -15,6 +15,9 @@ int reset_control_status(struct reset_control *rstc); struct reset_control *__of_reset_control_get(struct device_node *node, const char *id, int index, bool shared, bool optional); +struct reset_control *__reset_control_get(struct device *dev, const char *id, + int index, bool shared, + bool optional); void reset_control_put(struct reset_control *rstc); struct reset_control *__devm_reset_control_get(struct device *dev, const char *id, int index, bool shared, @@ -72,6 +75,13 @@ static inline struct reset_control *__of_reset_control_get( return optional ? NULL : ERR_PTR(-ENOTSUPP); } +static inline struct reset_control *__reset_control_get( + struct device *dev, const char *id, + int index, bool shared, bool optional) +{ + return optional ? NULL : ERR_PTR(-ENOTSUPP); +} + static inline struct reset_control *__devm_reset_control_get( struct device *dev, const char *id, int index, bool shared, bool optional) @@ -102,8 +112,7 @@ __must_check reset_control_get_exclusive(struct device *dev, const char *id) #ifndef CONFIG_RESET_CONTROLLER WARN_ON(1); #endif - return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, false, - false); + return __reset_control_get(dev, id, 0, false, false); } /** @@ -131,22 +140,19 @@ __must_check reset_control_get_exclusive(struct device *dev, const char *id) static inline struct reset_control *reset_control_get_shared( struct device *dev, const char *id) { - return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, true, - false); + return __reset_control_get(dev, id, 0, true, false); } static inline struct reset_control *reset_control_get_optional_exclusive( struct device *dev, const char *id) { - return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, false, - true); + return __reset_control_get(dev, id, 0, false, true); } static inline struct reset_control *reset_control_get_optional_shared( struct device *dev, const char *id) { - return __of_reset_control_get(dev ? dev->of_node : NULL, id, 0, true, - true); + return __reset_control_get(dev, id, 0, true, true); } /** -- cgit v1.2.3 From b1a951fe469eb51646bf2e6d2c5f4a19fe1d4627 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 5 Feb 2017 21:47:22 +0200 Subject: net/utils: generic inet_pton_with_scope helper Several locations in the stack need to handle ipv4/ipv6 (with scope) and port strings conversion to sockaddr. Add a helper that takes either AF_INET, AF_INET6 or AF_UNSPEC (for wildcard) to centralize this handling. Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Acked-by: David S. Miller Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- include/linux/inet.h | 6 +++ net/core/utils.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) (limited to 'include/linux') diff --git a/include/linux/inet.h b/include/linux/inet.h index 4cca05c9678e..636ebe87e6f8 100644 --- a/include/linux/inet.h +++ b/include/linux/inet.h @@ -43,6 +43,8 @@ #define _LINUX_INET_H #include +#include +#include /* * These mimic similar macros defined in user-space for inet_ntop(3). 
@@ -54,4 +56,8 @@ extern __be32 in_aton(const char *str); extern int in4_pton(const char *src, int srclen, u8 *dst, int delim, const char **end); extern int in6_pton(const char *src, int srclen, u8 *dst, int delim, const char **end); + +extern int inet_pton_with_scope(struct net *net, unsigned short af, + const char *src, const char *port, struct sockaddr_storage *addr); + #endif /* _LINUX_INET_H */ diff --git a/net/core/utils.c b/net/core/utils.c index 6592d7bbed39..f96cf527bb8f 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -26,9 +26,11 @@ #include #include #include +#include #include #include +#include #include #include @@ -300,6 +302,107 @@ out: } EXPORT_SYMBOL(in6_pton); +static int inet4_pton(const char *src, u16 port_num, + struct sockaddr_storage *addr) +{ + struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; + int srclen = strlen(src); + + if (srclen > INET_ADDRSTRLEN) + return -EINVAL; + + if (in4_pton(src, srclen, (u8 *)&addr4->sin_addr.s_addr, + '\n', NULL) == 0) + return -EINVAL; + + addr4->sin_family = AF_INET; + addr4->sin_port = htons(port_num); + + return 0; +} + +static int inet6_pton(struct net *net, const char *src, u16 port_num, + struct sockaddr_storage *addr) +{ + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; + const char *scope_delim; + int srclen = strlen(src); + + if (srclen > INET6_ADDRSTRLEN) + return -EINVAL; + + if (in6_pton(src, srclen, (u8 *)&addr6->sin6_addr.s6_addr, + '%', &scope_delim) == 0) + return -EINVAL; + + if (ipv6_addr_type(&addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL && + src + srclen != scope_delim && *scope_delim == '%') { + struct net_device *dev; + char scope_id[16]; + size_t scope_len = min_t(size_t, sizeof(scope_id), + src + srclen - scope_delim - 1); + + memcpy(scope_id, scope_delim + 1, scope_len); + scope_id[scope_len] = '\0'; + + dev = dev_get_by_name(net, scope_id); + if (dev) { + addr6->sin6_scope_id = dev->ifindex; + dev_put(dev); + } else if (kstrtouint(scope_id, 0, &addr6->sin6_scope_id)) { + return -EINVAL; + } + } + + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(port_num); + + return 0; +} + +/** + * inet_pton_with_scope - convert an IPv4/IPv6 and port to socket address + * @net: net namespace (used for scope handling) + * @af: address family, AF_INET, AF_INET6 or AF_UNSPEC for either + * @src: the start of the address string + * @port: the start of the port string (or NULL for none) + * @addr: output socket address + * + * Return zero on success, return errno when any error occurs. 
+ */ +int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af, + const char *src, const char *port, struct sockaddr_storage *addr) +{ + u16 port_num; + int ret = -EINVAL; + + if (port) { + if (kstrtou16(port, 0, &port_num)) + return -EINVAL; + } else { + port_num = 0; + } + + switch (af) { + case AF_INET: + ret = inet4_pton(src, port_num, addr); + break; + case AF_INET6: + ret = inet6_pton(net, src, port_num, addr); + break; + case AF_UNSPEC: + ret = inet4_pton(src, port_num, addr); + if (ret) + ret = inet6_pton(net, src, port_num, addr); + break; + default: + pr_err("unexpected address family %d\n", af); + }; + + return ret; +} +EXPORT_SYMBOL(inet_pton_with_scope); + void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, bool pseudohdr) { -- cgit v1.2.3 From 0f222ccce359d21f927d07df2069e7029497b790 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 23 Mar 2017 20:41:22 -0700 Subject: nvme_fc: Sync FC-NVME header with standard Update FC-NVME definitions to match FC-NVME r1.14 (16-020vB) plus change voted in by 2/22 FC-NVME Adhoc (see HOSTID below). Includes the following: - Addition of "status_code" field to ERSP IU - Addition of FC-NVME LS RJT reason_codes and reason_explanations - CreateAssociation payload, HostID field shortened to 16 bytes Signed-off-by: James Smart Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- include/linux/nvme-fc.h | 68 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h index 4b45226bd604..e997c4a49a88 100644 --- a/include/linux/nvme-fc.h +++ b/include/linux/nvme-fc.h @@ -16,8 +16,7 @@ */ /* - * This file contains definitions relative to FC-NVME r1.11 and a few - * newer items + * This file contains definitions relative to FC-NVME r1.14 (16-020vB). 
*/ #ifndef _NVME_FC_H @@ -47,8 +46,15 @@ struct nvme_fc_cmd_iu { #define NVME_FC_SIZEOF_ZEROS_RSP 12 +enum { + FCNVME_SC_SUCCESS = 0, + FCNVME_SC_INVALID_FIELD = 1, + FCNVME_SC_INVALID_CONNID = 2, +}; + struct nvme_fc_ersp_iu { - __u8 rsvd0[2]; + __u8 status_code; + __u8 rsvd1; __be16 iu_len; __be32 rsn; __be32 xfrd_len; @@ -58,7 +64,7 @@ struct nvme_fc_ersp_iu { }; -/* FC-NVME r1.03/16-119v0 NVME Link Services */ +/* FC-NVME Link Services */ enum { FCNVME_LS_RSVD = 0, FCNVME_LS_RJT = 1, @@ -68,7 +74,7 @@ enum { FCNVME_LS_DISCONNECT = 5, }; -/* FC-NVME r1.03/16-119v0 NVME Link Service Descriptors */ +/* FC-NVME Link Service Descriptors */ enum { FCNVME_LSDESC_RSVD = 0x0, FCNVME_LSDESC_RQST = 0x1, @@ -92,7 +98,6 @@ static inline __be32 fcnvme_lsdesc_len(size_t sz) return cpu_to_be32(sz - (2 * sizeof(u32))); } - struct fcnvme_ls_rqst_w0 { u8 ls_cmd; /* FCNVME_LS_xxx */ u8 zeros[3]; @@ -106,8 +111,53 @@ struct fcnvme_lsdesc_rqst { __be32 rsvd12; }; +/* FC-NVME LS RJT reason_code values */ +enum fcnvme_ls_rjt_reason { + FCNVME_RJT_RC_NONE = 0, + /* no reason - not to be sent */ + + FCNVME_RJT_RC_INVAL = 0x01, + /* invalid NVMe_LS command code */ + + FCNVME_RJT_RC_LOGIC = 0x03, + /* logical error */ + + FCNVME_RJT_RC_UNAB = 0x09, + /* unable to perform command request */ + + FCNVME_RJT_RC_UNSUP = 0x0b, + /* command not supported */ + + FCNVME_RJT_RC_INPROG = 0x0e, + /* command already in progress */ + FCNVME_RJT_RC_INV_ASSOC = 0x40, + /* Invalid Association ID*/ + FCNVME_RJT_RC_INV_CONN = 0x41, + /* Invalid Connection ID*/ + + FCNVME_RJT_RC_VENDOR = 0xff, + /* vendor specific error */ +}; + +/* FC-NVME LS RJT reason_explanation values */ +enum fcnvme_ls_rjt_explan { + FCNVME_RJT_EXP_NONE = 0x00, + /* No additional explanation */ + + FCNVME_RJT_EXP_OXID_RXID = 0x17, + /* invalid OX_ID-RX_ID combination */ + + FCNVME_RJT_EXP_INSUF_RES = 0x29, + /* insufficient resources */ + + FCNVME_RJT_EXP_UNAB_DATA = 0x2a, + /* unable to supply requested data */ + + FCNVME_RJT_EXP_INV_LEN = 0x2d, + /* Invalid payload length */ +}; /* FCNVME_LSDESC_RJT */ struct fcnvme_lsdesc_rjt { @@ -119,15 +169,15 @@ struct fcnvme_lsdesc_rjt { * Reject reason and explanaction codes are generic * to ELs's from LS-3. */ - u8 reason_code; - u8 reason_explanation; + u8 reason_code; /* fcnvme_ls_rjt_reason */ + u8 reason_explanation; /* fcnvme_ls_rjt_explan */ u8 vendor; __be32 rsvd12; }; -#define FCNVME_ASSOC_HOSTID_LEN 64 +#define FCNVME_ASSOC_HOSTID_LEN 16 #define FCNVME_ASSOC_HOSTNQN_LEN 256 #define FCNVME_ASSOC_SUBNQN_LEN 256 -- cgit v1.2.3 From 62eeacb0e04f2aff7099a7765f386bb7ba53d5e2 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 23 Mar 2017 20:41:27 -0700 Subject: nvme_fc: Clean up host fcpio done status handling As Dan Carpenter pointed out: mixing 16-bit nvme status with 32-bit error status from driver. Corrected comment on fcp request struct status field, and converted done routine to explicitly set nvme status codes for nvme status. 
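For illustration only (not part of the change itself), the completion-path
rule introduced here can be condensed into a small helper. The function name
is hypothetical, but every status value used appears in the patch below:

    #include <linux/nvme.h>           /* NVME_SC_* status codes */
    #include <linux/nvme-fc-driver.h> /* struct nvmefc_fcp_req */

    /* Hypothetical summary of the new mapping: a nonzero LLDD errno in
     * freq->status is never propagated as-is; it is reported as an NVMe
     * transport-level error instead. */
    static u16 example_fcp_status(struct nvmefc_fcp_req *freq, bool aborted)
    {
            if (aborted)
                    return NVME_SC_ABORT_REQ | NVME_SC_DNR;
            if (freq->status)       /* negative errno from the LLDD */
                    return NVME_SC_FC_TRANSPORT_ERROR;
            return NVME_SC_SUCCESS;
    }
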
Signed-off-by: James Smart Reported-by: Dan Carpenter Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 14 +++++++------- include/linux/nvme-fc-driver.h | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index a02eeb69f85c..7a1d7ea5366e 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1147,7 +1147,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) struct nvme_fc_ctrl *ctrl = op->ctrl; struct nvme_fc_queue *queue = op->queue; struct nvme_completion *cqe = &op->rsp_iu.cqe; - u16 status; + u16 status = NVME_SC_SUCCESS; /* * WARNING: @@ -1183,8 +1183,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) if (atomic_read(&op->state) == FCPOP_STATE_ABORTED) status = NVME_SC_ABORT_REQ | NVME_SC_DNR; - else - status = freq->status; + else if (freq->status) + status = NVME_SC_FC_TRANSPORT_ERROR; /* * For the linux implementation, if we have an unsuccesful @@ -1212,7 +1212,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) */ if (freq->transferred_length != be32_to_cpu(op->cmd_iu.data_len)) { - status = -EIO; + status = NVME_SC_FC_TRANSPORT_ERROR; goto done; } op->nreq.result.u64 = 0; @@ -1229,7 +1229,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) freq->transferred_length || op->rsp_iu.status_code || op->rqno != le16_to_cpu(cqe->command_id))) { - status = -EIO; + status = NVME_SC_FC_TRANSPORT_ERROR; goto done; } op->nreq.result = cqe->result; @@ -1237,7 +1237,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) break; default: - status = -EIO; + status = NVME_SC_FC_TRANSPORT_ERROR; goto done; } @@ -1763,7 +1763,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, op->fcp_req.io_dir = io_dir; op->fcp_req.transferred_length = 0; op->fcp_req.rcv_rsplen = 0; - op->fcp_req.status = 0; + op->fcp_req.status = NVME_SC_SUCCESS; op->fcp_req.sqid = cpu_to_le16(queue->qnum); /* diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index f21471f7ee40..16eb264980c2 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -137,9 +137,9 @@ enum nvmefc_fcp_datadir { * transferred. Should equal payload_length on success. * @rcv_rsplen: length, in bytes, of the FCP RSP IU received. * @status: Completion status of the FCP operation. must be 0 upon success, - * NVME_SC_FC_xxx value upon failure. Note: this is NOT a - * reflection of the NVME CQE completion status. Only the status - * of the FCP operation at the NVME-FC level. + * negative errno value upon failure (ex: -EIO). Note: this is + * NOT a reflection of the NVME CQE completion status. Only the + * status of the FCP operation at the NVME-FC level. */ struct nvmefc_fcp_req { void *cmdaddr; -- cgit v1.2.3 From a5ea7a0fcbd7376b2c9fcb15fe59fec298c9ce9f Mon Sep 17 00:00:00 2001 From: Dave Gerlach Date: Tue, 4 Apr 2017 08:51:29 -0700 Subject: PM / Domains: Add generic data pointer to genpd data struct Add a void *data pointer to struct generic_pm_domain_data. Because this exists for each device associated with a genpd it will allow us to assign per-device data if needed on a platform for control of that specific device. 
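As a usage sketch (all names below are illustrative and not part of this
patch), a platform's attach_dev callback could hang per-device state off the
new pointer:

    #include <linux/device.h>
    #include <linux/pm_domain.h>

    struct my_dev_state {                   /* illustrative payload */
            u32 saved_context[4];
    };

    static int my_pd_attach_dev(struct generic_pm_domain *pd,
                                struct device *dev)
    {
            struct generic_pm_domain_data *gpd_data = dev_gpd_data(dev);
            struct my_dev_state *state;

            state = devm_kzalloc(dev, sizeof(*state), GFP_KERNEL);
            if (!state)
                    return -ENOMEM;

            gpd_data->data = state; /* the field added by this patch */
            return 0;
    }
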
Acked-by: Ulf Hansson Acked-by: Kevin Hilman Signed-off-by: Dave Gerlach Signed-off-by: Santosh Shilimkar --- include/linux/pm_domain.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 5339ed5bd6f9..b213d22daefd 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -117,6 +117,7 @@ struct generic_pm_domain_data { struct pm_domain_data base; struct gpd_timing_data td; struct notifier_block nb; + void *data; }; #ifdef CONFIG_PM_GENERIC_DOMAINS -- cgit v1.2.3 From bf616d21f41174389c6d720ae21bf40f154474c8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 4 Apr 2017 16:54:21 +0100 Subject: Annotate module params that specify hardware parameters (eg. ioport) Provided an annotation for module parameters that specify hardware parameters (such as io ports, iomem addresses, irqs, dma channels, fixed dma buffers and other types). This will enable such parameters to be locked down in the core parameter parser for secure boot support. I've also included annotations as to what sort of hardware configuration each module is dealing with for future use. Some of these are straightforward (ioport, iomem, irq, dma), but there are also: (1) drivers that switch the semantics of a parameter between ioport and iomem depending on a second parameter, (2) drivers that appear to reserve a CPU memory buffer at a fixed address, (3) other parameters, such as bus types and irq selection bitmasks. For the moment, the hardware configuration type isn't actually stored, though its validity is checked. Signed-off-by: David Howells --- include/linux/moduleparam.h | 65 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 52666d90ca94..6be1949ebcdf 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -60,9 +60,11 @@ struct kernel_param_ops { * Flags available for kernel_param * * UNSAFE - the parameter is dangerous and setting it will taint the kernel + * HWPARAM - Hardware param not permitted in lockdown mode */ enum { - KERNEL_PARAM_FL_UNSAFE = (1 << 0) + KERNEL_PARAM_FL_UNSAFE = (1 << 0), + KERNEL_PARAM_FL_HWPARAM = (1 << 1), }; struct kernel_param { @@ -451,6 +453,67 @@ extern int param_set_bint(const char *val, const struct kernel_param *kp); perm, -1, 0); \ __MODULE_PARM_TYPE(name, "array of " #type) +enum hwparam_type { + hwparam_ioport, /* Module parameter configures an I/O port */ + hwparam_iomem, /* Module parameter configures an I/O mem address */ + hwparam_ioport_or_iomem, /* Module parameter could be either, depending on other option */ + hwparam_irq, /* Module parameter configures an I/O port */ + hwparam_dma, /* Module parameter configures a DMA channel */ + hwparam_dma_addr, /* Module parameter configures a DMA buffer address */ + hwparam_other, /* Module parameter configures some other value */ +}; + +/** + * module_param_hw_named - A parameter representing a hw parameters + * @name: a valid C identifier which is the parameter name. + * @value: the actual lvalue to alter. + * @type: the type of the parameter + * @hwtype: what the value represents (enum hwparam_type) + * @perm: visibility in sysfs. + * + * Usually it's a good idea to have variable names and user-exposed names the + * same, but that's harder if the variable must be non-static or is inside a + * structure. This allows exposure under a different name. 
+ */ +#define module_param_hw_named(name, value, type, hwtype, perm) \ + param_check_##type(name, &(value)); \ + __module_param_call(MODULE_PARAM_PREFIX, name, \ + ¶m_ops_##type, &value, \ + perm, -1, \ + KERNEL_PARAM_FL_HWPARAM | (hwparam_##hwtype & 0)); \ + __MODULE_PARM_TYPE(name, #type) + +#define module_param_hw(name, type, hwtype, perm) \ + module_param_hw_named(name, name, type, hwtype, perm) + +/** + * module_param_hw_array - A parameter representing an array of hw parameters + * @name: the name of the array variable + * @type: the type, as per module_param() + * @hwtype: what the value represents (enum hwparam_type) + * @nump: optional pointer filled in with the number written + * @perm: visibility in sysfs + * + * Input and output are as comma-separated values. Commas inside values + * don't work properly (eg. an array of charp). + * + * ARRAY_SIZE(@name) is used to determine the number of elements in the + * array, so the definition must be visible. + */ +#define module_param_hw_array(name, type, hwtype, nump, perm) \ + param_check_##type(name, &(name)[0]); \ + static const struct kparam_array __param_arr_##name \ + = { .max = ARRAY_SIZE(name), .num = nump, \ + .ops = ¶m_ops_##type, \ + .elemsize = sizeof(name[0]), .elem = name }; \ + __module_param_call(MODULE_PARAM_PREFIX, name, \ + ¶m_array_ops, \ + .arr = &__param_arr_##name, \ + perm, -1, \ + KERNEL_PARAM_FL_HWPARAM | (hwparam_##hwtype & 0)); \ + __MODULE_PARM_TYPE(name, "array of " #type) + + extern const struct kernel_param_ops param_array_ops; extern const struct kernel_param_ops param_ops_string; -- cgit v1.2.3 From 65c2e69b3ccaa359032cfc35c4dbb8d235f63e5b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 23 Mar 2017 19:00:44 +0000 Subject: include: pe.h: allow for use in assembly Some of the definitions in include/linux/pe.h would be useful for the EFI stub headers, where values are currently open-coded. Unfortunately they cannot be used as some structures are also defined in pe.h without !__ASSEMBLY__ guards. This patch moves the structure definitions into an #ifdef __ASSEMBLY__ block, so that the common value definitions can be used from assembly. Signed-off-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- include/linux/pe.h | 174 +++++++++++++++++++++++++++-------------------------- 1 file changed, 89 insertions(+), 85 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pe.h b/include/linux/pe.h index e170b95e763b..a8a594117df3 100644 --- a/include/linux/pe.h +++ b/include/linux/pe.h @@ -23,34 +23,6 @@ #define MZ_MAGIC 0x5a4d /* "MZ" */ -struct mz_hdr { - uint16_t magic; /* MZ_MAGIC */ - uint16_t lbsize; /* size of last used block */ - uint16_t blocks; /* pages in file, 0x3 */ - uint16_t relocs; /* relocations */ - uint16_t hdrsize; /* header size in "paragraphs" */ - uint16_t min_extra_pps; /* .bss */ - uint16_t max_extra_pps; /* runtime limit for the arena size */ - uint16_t ss; /* relative stack segment */ - uint16_t sp; /* initial %sp register */ - uint16_t checksum; /* word checksum */ - uint16_t ip; /* initial %ip register */ - uint16_t cs; /* initial %cs relative to load segment */ - uint16_t reloc_table_offset; /* offset of the first relocation */ - uint16_t overlay_num; /* overlay number. set to 0. 
*/ - uint16_t reserved0[4]; /* reserved */ - uint16_t oem_id; /* oem identifier */ - uint16_t oem_info; /* oem specific */ - uint16_t reserved1[10]; /* reserved */ - uint32_t peaddr; /* address of pe header */ - char message[64]; /* message to print */ -}; - -struct mz_reloc { - uint16_t offset; - uint16_t segment; -}; - #define PE_MAGIC 0x00004550 /* "PE\0\0" */ #define PE_OPT_MAGIC_PE32 0x010b #define PE_OPT_MAGIC_PE32_ROM 0x0107 @@ -98,17 +70,6 @@ struct mz_reloc { #define IMAGE_FILE_UP_SYSTEM_ONLY 0x4000 #define IMAGE_FILE_BYTES_REVERSED_HI 0x8000 -struct pe_hdr { - uint32_t magic; /* PE magic */ - uint16_t machine; /* machine type */ - uint16_t sections; /* number of sections */ - uint32_t timestamp; /* time_t */ - uint32_t symbol_table; /* symbol table offset */ - uint32_t symbols; /* number of symbols */ - uint16_t opt_hdr_size; /* size of optional header */ - uint16_t flags; /* flags */ -}; - #define IMAGE_FILE_OPT_ROM_MAGIC 0x107 #define IMAGE_FILE_OPT_PE32_MAGIC 0x10b #define IMAGE_FILE_OPT_PE32_PLUS_MAGIC 0x20b @@ -134,6 +95,93 @@ struct pe_hdr { #define IMAGE_DLLCHARACTERISTICS_WDM_DRIVER 0x2000 #define IMAGE_DLLCHARACTERISTICS_TERMINAL_SERVER_AWARE 0x8000 +/* they actually defined 0x00000000 as well, but I think we'll skip that one. */ +#define IMAGE_SCN_RESERVED_0 0x00000001 +#define IMAGE_SCN_RESERVED_1 0x00000002 +#define IMAGE_SCN_RESERVED_2 0x00000004 +#define IMAGE_SCN_TYPE_NO_PAD 0x00000008 /* don't pad - obsolete */ +#define IMAGE_SCN_RESERVED_3 0x00000010 +#define IMAGE_SCN_CNT_CODE 0x00000020 /* .text */ +#define IMAGE_SCN_CNT_INITIALIZED_DATA 0x00000040 /* .data */ +#define IMAGE_SCN_CNT_UNINITIALIZED_DATA 0x00000080 /* .bss */ +#define IMAGE_SCN_LNK_OTHER 0x00000100 /* reserved */ +#define IMAGE_SCN_LNK_INFO 0x00000200 /* .drectve comments */ +#define IMAGE_SCN_RESERVED_4 0x00000400 +#define IMAGE_SCN_LNK_REMOVE 0x00000800 /* .o only - scn to be rm'd*/ +#define IMAGE_SCN_LNK_COMDAT 0x00001000 /* .o only - COMDAT data */ +#define IMAGE_SCN_RESERVED_5 0x00002000 /* spec omits this */ +#define IMAGE_SCN_RESERVED_6 0x00004000 /* spec omits this */ +#define IMAGE_SCN_GPREL 0x00008000 /* global pointer referenced data */ +/* spec lists 0x20000 twice, I suspect they meant 0x10000 for one of them */ +#define IMAGE_SCN_MEM_PURGEABLE 0x00010000 /* reserved for "future" use */ +#define IMAGE_SCN_16BIT 0x00020000 /* reserved for "future" use */ +#define IMAGE_SCN_LOCKED 0x00040000 /* reserved for "future" use */ +#define IMAGE_SCN_PRELOAD 0x00080000 /* reserved for "future" use */ +/* and here they just stuck a 1-byte integer in the middle of a bitfield */ +#define IMAGE_SCN_ALIGN_1BYTES 0x00100000 /* it does what it says on the box */ +#define IMAGE_SCN_ALIGN_2BYTES 0x00200000 +#define IMAGE_SCN_ALIGN_4BYTES 0x00300000 +#define IMAGE_SCN_ALIGN_8BYTES 0x00400000 +#define IMAGE_SCN_ALIGN_16BYTES 0x00500000 +#define IMAGE_SCN_ALIGN_32BYTES 0x00600000 +#define IMAGE_SCN_ALIGN_64BYTES 0x00700000 +#define IMAGE_SCN_ALIGN_128BYTES 0x00800000 +#define IMAGE_SCN_ALIGN_256BYTES 0x00900000 +#define IMAGE_SCN_ALIGN_512BYTES 0x00a00000 +#define IMAGE_SCN_ALIGN_1024BYTES 0x00b00000 +#define IMAGE_SCN_ALIGN_2048BYTES 0x00c00000 +#define IMAGE_SCN_ALIGN_4096BYTES 0x00d00000 +#define IMAGE_SCN_ALIGN_8192BYTES 0x00e00000 +#define IMAGE_SCN_LNK_NRELOC_OVFL 0x01000000 /* extended relocations */ +#define IMAGE_SCN_MEM_DISCARDABLE 0x02000000 /* scn can be discarded */ +#define IMAGE_SCN_MEM_NOT_CACHED 0x04000000 /* cannot be cached */ +#define IMAGE_SCN_MEM_NOT_PAGED 0x08000000 /* not pageable */ 
+#define IMAGE_SCN_MEM_SHARED 0x10000000 /* can be shared */ +#define IMAGE_SCN_MEM_EXECUTE 0x20000000 /* can be executed as code */ +#define IMAGE_SCN_MEM_READ 0x40000000 /* readable */ +#define IMAGE_SCN_MEM_WRITE 0x80000000 /* writeable */ + +#ifndef __ASSEMBLY__ + +struct mz_hdr { + uint16_t magic; /* MZ_MAGIC */ + uint16_t lbsize; /* size of last used block */ + uint16_t blocks; /* pages in file, 0x3 */ + uint16_t relocs; /* relocations */ + uint16_t hdrsize; /* header size in "paragraphs" */ + uint16_t min_extra_pps; /* .bss */ + uint16_t max_extra_pps; /* runtime limit for the arena size */ + uint16_t ss; /* relative stack segment */ + uint16_t sp; /* initial %sp register */ + uint16_t checksum; /* word checksum */ + uint16_t ip; /* initial %ip register */ + uint16_t cs; /* initial %cs relative to load segment */ + uint16_t reloc_table_offset; /* offset of the first relocation */ + uint16_t overlay_num; /* overlay number. set to 0. */ + uint16_t reserved0[4]; /* reserved */ + uint16_t oem_id; /* oem identifier */ + uint16_t oem_info; /* oem specific */ + uint16_t reserved1[10]; /* reserved */ + uint32_t peaddr; /* address of pe header */ + char message[64]; /* message to print */ +}; + +struct mz_reloc { + uint16_t offset; + uint16_t segment; +}; + +struct pe_hdr { + uint32_t magic; /* PE magic */ + uint16_t machine; /* machine type */ + uint16_t sections; /* number of sections */ + uint32_t timestamp; /* time_t */ + uint32_t symbol_table; /* symbol table offset */ + uint32_t symbols; /* number of symbols */ + uint16_t opt_hdr_size; /* size of optional header */ + uint16_t flags; /* flags */ +}; + /* the fact that pe32 isn't padded where pe32+ is 64-bit means union won't * work right. vomit. */ struct pe32_opt_hdr { @@ -243,52 +291,6 @@ struct section_header { uint32_t flags; }; -/* they actually defined 0x00000000 as well, but I think we'll skip that one. 
*/ -#define IMAGE_SCN_RESERVED_0 0x00000001 -#define IMAGE_SCN_RESERVED_1 0x00000002 -#define IMAGE_SCN_RESERVED_2 0x00000004 -#define IMAGE_SCN_TYPE_NO_PAD 0x00000008 /* don't pad - obsolete */ -#define IMAGE_SCN_RESERVED_3 0x00000010 -#define IMAGE_SCN_CNT_CODE 0x00000020 /* .text */ -#define IMAGE_SCN_CNT_INITIALIZED_DATA 0x00000040 /* .data */ -#define IMAGE_SCN_CNT_UNINITIALIZED_DATA 0x00000080 /* .bss */ -#define IMAGE_SCN_LNK_OTHER 0x00000100 /* reserved */ -#define IMAGE_SCN_LNK_INFO 0x00000200 /* .drectve comments */ -#define IMAGE_SCN_RESERVED_4 0x00000400 -#define IMAGE_SCN_LNK_REMOVE 0x00000800 /* .o only - scn to be rm'd*/ -#define IMAGE_SCN_LNK_COMDAT 0x00001000 /* .o only - COMDAT data */ -#define IMAGE_SCN_RESERVED_5 0x00002000 /* spec omits this */ -#define IMAGE_SCN_RESERVED_6 0x00004000 /* spec omits this */ -#define IMAGE_SCN_GPREL 0x00008000 /* global pointer referenced data */ -/* spec lists 0x20000 twice, I suspect they meant 0x10000 for one of them */ -#define IMAGE_SCN_MEM_PURGEABLE 0x00010000 /* reserved for "future" use */ -#define IMAGE_SCN_16BIT 0x00020000 /* reserved for "future" use */ -#define IMAGE_SCN_LOCKED 0x00040000 /* reserved for "future" use */ -#define IMAGE_SCN_PRELOAD 0x00080000 /* reserved for "future" use */ -/* and here they just stuck a 1-byte integer in the middle of a bitfield */ -#define IMAGE_SCN_ALIGN_1BYTES 0x00100000 /* it does what it says on the box */ -#define IMAGE_SCN_ALIGN_2BYTES 0x00200000 -#define IMAGE_SCN_ALIGN_4BYTES 0x00300000 -#define IMAGE_SCN_ALIGN_8BYTES 0x00400000 -#define IMAGE_SCN_ALIGN_16BYTES 0x00500000 -#define IMAGE_SCN_ALIGN_32BYTES 0x00600000 -#define IMAGE_SCN_ALIGN_64BYTES 0x00700000 -#define IMAGE_SCN_ALIGN_128BYTES 0x00800000 -#define IMAGE_SCN_ALIGN_256BYTES 0x00900000 -#define IMAGE_SCN_ALIGN_512BYTES 0x00a00000 -#define IMAGE_SCN_ALIGN_1024BYTES 0x00b00000 -#define IMAGE_SCN_ALIGN_2048BYTES 0x00c00000 -#define IMAGE_SCN_ALIGN_4096BYTES 0x00d00000 -#define IMAGE_SCN_ALIGN_8192BYTES 0x00e00000 -#define IMAGE_SCN_LNK_NRELOC_OVFL 0x01000000 /* extended relocations */ -#define IMAGE_SCN_MEM_DISCARDABLE 0x02000000 /* scn can be discarded */ -#define IMAGE_SCN_MEM_NOT_CACHED 0x04000000 /* cannot be cached */ -#define IMAGE_SCN_MEM_NOT_PAGED 0x08000000 /* not pageable */ -#define IMAGE_SCN_MEM_SHARED 0x10000000 /* can be shared */ -#define IMAGE_SCN_MEM_EXECUTE 0x20000000 /* can be executed as code */ -#define IMAGE_SCN_MEM_READ 0x40000000 /* readable */ -#define IMAGE_SCN_MEM_WRITE 0x80000000 /* writeable */ - enum x64_coff_reloc_type { IMAGE_REL_AMD64_ABSOLUTE = 0, IMAGE_REL_AMD64_ADDR64, @@ -445,4 +447,6 @@ struct win_certificate { uint16_t cert_type; }; +#endif /* !__ASSEMBLY__ */ + #endif /* __LINUX_PE_H */ -- cgit v1.2.3 From 6f5541ba0eed842445a99b411d0f34103bcbbea1 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 23 Mar 2017 19:00:45 +0000 Subject: include: pe.h: add some missing definitions Add the missing IMAGE_FILE_MACHINE_ARM64 and IMAGE_DEBUG_TYPE_CODEVIEW definitions. We'll need them for the arm64 EFI stub... 
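A hypothetical use of the new constant (not part of this patch; note that
PE/COFF fields are little-endian on disk, so a real loader may need
byte-order conversion):

    #include <linux/pe.h>
    #include <linux/types.h>

    /* Illustrative check: recognize an arm64 PE/COFF image using the
     * machine type defined here. */
    static bool pe_is_arm64(const struct pe_hdr *pe)
    {
            return pe->magic == PE_MAGIC &&
                   pe->machine == IMAGE_FILE_MACHINE_ARM64;
    }
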
Signed-off-by: Mark Rutland [ardb: add IMAGE_DEBUG_TYPE_CODEVIEW as well] Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- include/linux/pe.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pe.h b/include/linux/pe.h index a8a594117df3..143ce75be5f0 100644 --- a/include/linux/pe.h +++ b/include/linux/pe.h @@ -34,6 +34,7 @@ #define IMAGE_FILE_MACHINE_AMD64 0x8664 #define IMAGE_FILE_MACHINE_ARM 0x01c0 #define IMAGE_FILE_MACHINE_ARMV7 0x01c4 +#define IMAGE_FILE_MACHINE_ARM64 0xaa64 #define IMAGE_FILE_MACHINE_EBC 0x0ebc #define IMAGE_FILE_MACHINE_I386 0x014c #define IMAGE_FILE_MACHINE_IA64 0x0200 @@ -141,6 +142,8 @@ #define IMAGE_SCN_MEM_READ 0x40000000 /* readable */ #define IMAGE_SCN_MEM_WRITE 0x80000000 /* writeable */ +#define IMAGE_DEBUG_TYPE_CODEVIEW 2 + #ifndef __ASSEMBLY__ struct mz_hdr { -- cgit v1.2.3 From 2b6aa412ff23a02ac777ad307249c60a839cfd25 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Wed, 31 Aug 2016 16:05:43 -0700 Subject: KEYS: Use structure to capture key restriction function and data Replace struct key's restrict_link function pointer with a pointer to the new struct key_restriction. The structure contains pointers to the restriction function as well as relevant data for evaluating the restriction. The garbage collector checks restrict_link->keytype when key types are unregistered. Restrictions involving a removed key type are converted to use restrict_link_reject so that restrictions cannot be removed by unregistering key types. Signed-off-by: Mat Martineau --- Documentation/security/keys.txt | 21 +++++++------ certs/system_keyring.c | 21 ++++++++++++- include/linux/key.h | 8 ++--- security/integrity/digsig.c | 9 +++++- security/integrity/ima/ima_mok.c | 11 ++++++- security/keys/gc.c | 11 +++++++ security/keys/internal.h | 2 ++ security/keys/key.c | 23 ++++++++------ security/keys/keyring.c | 68 +++++++++++++++++++++++++++++++++++++--- 9 files changed, 144 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt index bb575ab80207..e35de987fc48 100644 --- a/Documentation/security/keys.txt +++ b/Documentation/security/keys.txt @@ -1032,7 +1032,7 @@ payload contents" for more information. struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, const struct cred *cred, key_perm_t perm, - key_restrict_link_func_t restrict_link, + struct key_restriction *restrict_link, unsigned long flags, struct key *dest); @@ -1044,14 +1044,17 @@ payload contents" for more information. KEY_ALLOC_NOT_IN_QUOTA in flags if the keyring shouldn't be accounted towards the user's quota). Error ENOMEM can also be returned. - If restrict_link not NULL, it should point to a function that will be - called each time an attempt is made to link a key into the new keyring. - This function is called to check whether a key may be added into the keying - or not. Callers of key_create_or_update() within the kernel can pass - KEY_ALLOC_BYPASS_RESTRICTION to suppress the check. An example of using - this is to manage rings of cryptographic keys that are set up when the - kernel boots where userspace is also permitted to add keys - provided they - can be verified by a key the kernel already has. + If restrict_link is not NULL, it should point to a structure that contains + the function that will be called each time an attempt is made to link a + key into the new keyring. The structure may also contain a key pointer + and an associated key type. 
The function is called to check whether a key + may be added into the keyring or not. The key type is used by the garbage + collector to clean up function or data pointers in this structure if the + given key type is unregistered. Callers of key_create_or_update() within + the kernel can pass KEY_ALLOC_BYPASS_RESTRICTION to suppress the check. + An example of using this is to manage rings of cryptographic keys that are + set up when the kernel boots where userspace is also permitted to add keys + - provided they can be verified by a key the kernel already has. When called, the restriction function will be passed the keyring being added to, the key type, the payload of the key being added, and data to be diff --git a/certs/system_keyring.c b/certs/system_keyring.c index e39cce68dcfa..6251d1b27f0c 100644 --- a/certs/system_keyring.c +++ b/certs/system_keyring.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -68,6 +69,24 @@ int restrict_link_by_builtin_and_secondary_trusted( return restrict_link_by_signature(dest_keyring, type, payload, secondary_trusted_keys); } + +/** + * Allocate a struct key_restriction for the "builtin and secondary trust" + * keyring. Only for use in system_trusted_keyring_init(). + */ +static __init struct key_restriction *get_builtin_and_secondary_restriction(void) +{ + struct key_restriction *restriction; + + restriction = kzalloc(sizeof(struct key_restriction), GFP_KERNEL); + + if (!restriction) + panic("Can't allocate secondary trusted keyring restriction\n"); + + restriction->check = restrict_link_by_builtin_and_secondary_trusted; + + return restriction; +} #endif /* @@ -95,7 +114,7 @@ static __init int system_trusted_keyring_init(void) KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH | KEY_USR_WRITE), KEY_ALLOC_NOT_IN_QUOTA, - restrict_link_by_builtin_and_secondary_trusted, + get_builtin_and_secondary_restriction(), NULL); if (IS_ERR(secondary_trusted_keys)) panic("Can't allocate secondary trusted keyring\n"); diff --git a/include/linux/key.h b/include/linux/key.h index a06649f3223d..d2916363689c 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -217,7 +217,7 @@ struct key { }; /* This is set on a keyring to restrict the addition of a link to a key - * to it. If this method isn't provided then it is assumed that the + * to it. If this structure isn't provided then it is assumed that the * keyring is open to any addition. It is ignored for non-keyring * keys. * @@ -226,7 +226,7 @@ struct key { * overrides this, allowing the kernel to add extra keys without * restriction. 
*/ - key_restrict_link_func_t restrict_link; + struct key_restriction *restrict_link; }; extern struct key *key_alloc(struct key_type *type, @@ -235,7 +235,7 @@ extern struct key *key_alloc(struct key_type *type, const struct cred *cred, key_perm_t perm, unsigned long flags, - key_restrict_link_func_t restrict_link); + struct key_restriction *restrict_link); #define KEY_ALLOC_IN_QUOTA 0x0000 /* add to quota, reject if would overrun */ @@ -311,7 +311,7 @@ extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid const struct cred *cred, key_perm_t perm, unsigned long flags, - key_restrict_link_func_t restrict_link, + struct key_restriction *restrict_link, struct key *dest); extern int restrict_link_reject(struct key *keyring, diff --git a/security/integrity/digsig.c b/security/integrity/digsig.c index 106e855e2d9d..06554c448dce 100644 --- a/security/integrity/digsig.c +++ b/security/integrity/digsig.c @@ -81,18 +81,25 @@ int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen, int __init integrity_init_keyring(const unsigned int id) { const struct cred *cred = current_cred(); + struct key_restriction *restriction; int err = 0; if (!init_keyring) return 0; + restriction = kzalloc(sizeof(struct key_restriction), GFP_KERNEL); + if (!restriction) + return -ENOMEM; + + restriction->check = restrict_link_to_ima; + keyring[id] = keyring_alloc(keyring_name[id], KUIDT_INIT(0), KGIDT_INIT(0), cred, ((KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_WRITE | KEY_USR_SEARCH), KEY_ALLOC_NOT_IN_QUOTA, - restrict_link_to_ima, NULL); + restriction, NULL); if (IS_ERR(keyring[id])) { err = PTR_ERR(keyring[id]); pr_info("Can't allocate %s keyring (%d)\n", diff --git a/security/integrity/ima/ima_mok.c b/security/integrity/ima/ima_mok.c index 74a279957464..073ddc9bce5b 100644 --- a/security/integrity/ima/ima_mok.c +++ b/security/integrity/ima/ima_mok.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -27,15 +28,23 @@ struct key *ima_blacklist_keyring; */ __init int ima_mok_init(void) { + struct key_restriction *restriction; + pr_notice("Allocating IMA blacklist keyring.\n"); + restriction = kzalloc(sizeof(struct key_restriction), GFP_KERNEL); + if (!restriction) + panic("Can't allocate IMA blacklist restriction."); + + restriction->check = restrict_link_by_builtin_trusted; + ima_blacklist_keyring = keyring_alloc(".ima_blacklist", KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_WRITE | KEY_USR_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, - restrict_link_by_builtin_trusted, NULL); + restriction, NULL); if (IS_ERR(ima_blacklist_keyring)) panic("Can't allocate IMA blacklist keyring."); diff --git a/security/keys/gc.c b/security/keys/gc.c index 44789256c88c..15b9ddf510e4 100644 --- a/security/keys/gc.c +++ b/security/keys/gc.c @@ -229,6 +229,9 @@ continue_scanning: set_bit(KEY_FLAG_DEAD, &key->flags); key->perm = 0; goto skip_dead_key; + } else if (key->type == &key_type_keyring && + key->restrict_link) { + goto found_restricted_keyring; } } @@ -334,6 +337,14 @@ found_unreferenced_key: gc_state |= KEY_GC_REAP_AGAIN; goto maybe_resched; + /* We found a restricted keyring and need to update the restriction if + * it is associated with the dead key type. 
+ */ +found_restricted_keyring: + spin_unlock(&key_serial_lock); + keyring_restriction_gc(key, key_gc_dead_keytype); + goto maybe_resched; + /* We found a keyring and we need to check the payload for links to * dead or expired keys. We don't flag another reap immediately as we * have to wait for the old payload to be destroyed by RCU before we diff --git a/security/keys/internal.h b/security/keys/internal.h index 6bee06ae026d..24762ae9a198 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -168,6 +168,8 @@ extern void key_change_session_keyring(struct callback_head *twork); extern struct work_struct key_gc_work; extern unsigned key_gc_delay; extern void keyring_gc(struct key *keyring, time_t limit); +extern void keyring_restriction_gc(struct key *keyring, + struct key_type *dead_type); extern void key_schedule_gc(time_t gc_at); extern void key_schedule_gc_links(void); extern void key_gc_keytype(struct key_type *ktype); diff --git a/security/keys/key.c b/security/keys/key.c index 27fc1bb40034..2ea5967121de 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -201,12 +201,15 @@ serial_exists: * @cred: The credentials specifying UID namespace. * @perm: The permissions mask of the new key. * @flags: Flags specifying quota properties. - * @restrict_link: Optional link restriction method for new keyrings. + * @restrict_link: Optional link restriction for new keyrings. * * Allocate a key of the specified type with the attributes given. The key is * returned in an uninstantiated state and the caller needs to instantiate the * key before returning. * + * The restrict_link structure (if not NULL) will be freed when the + * keyring is destroyed, so it must be dynamically allocated. + * * The user's key count quota is updated to reflect the creation of the key and * the user's key data quota has the default for the key type reserved. The * instantiation function should amend this as necessary. 
If insufficient @@ -225,7 +228,7 @@ serial_exists: struct key *key_alloc(struct key_type *type, const char *desc, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, - key_restrict_link_func_t restrict_link) + struct key_restriction *restrict_link) { struct key_user *user = NULL; struct key *key; @@ -497,9 +500,11 @@ int key_instantiate_and_link(struct key *key, } if (keyring) { - if (keyring->restrict_link) { - ret = keyring->restrict_link(keyring, key->type, - &prep.payload, NULL); + if (keyring->restrict_link && keyring->restrict_link->check) { + struct key_restriction *keyres = keyring->restrict_link; + + ret = keyres->check(keyring, key->type, &prep.payload, + keyres->key); if (ret < 0) goto error; } @@ -804,7 +809,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, struct key *keyring, *key = NULL; key_ref_t key_ref; int ret; - key_restrict_link_func_t restrict_link = NULL; + struct key_restriction *restrict_link = NULL; /* look up the key type to see if it's one of the registered kernel * types */ @@ -850,9 +855,9 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, } index_key.desc_len = strlen(index_key.description); - if (restrict_link) { - ret = restrict_link(keyring, index_key.type, &prep.payload, - NULL); + if (restrict_link && restrict_link->check) { + ret = restrict_link->check(keyring, index_key.type, + &prep.payload, restrict_link->key); if (ret < 0) { key_ref = ERR_PTR(ret); goto error_free_prep; diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 2ccc66ec35d7..838334fec6ce 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -394,6 +394,13 @@ static void keyring_destroy(struct key *keyring) write_unlock(&keyring_name_lock); } + if (keyring->restrict_link) { + struct key_restriction *keyres = keyring->restrict_link; + + key_put(keyres->key); + kfree(keyres); + } + assoc_array_destroy(&keyring->keys, &keyring_assoc_array_ops); } @@ -492,7 +499,7 @@ static long keyring_read(const struct key *keyring, struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid, const struct cred *cred, key_perm_t perm, unsigned long flags, - key_restrict_link_func_t restrict_link, + struct key_restriction *restrict_link, struct key *dest) { struct key *keyring; @@ -523,8 +530,8 @@ EXPORT_SYMBOL(keyring_alloc); * passing KEY_ALLOC_BYPASS_RESTRICTION to key_instantiate_and_link() when * adding a key to a keyring. * - * This is meant to be passed as the restrict_link parameter to - * keyring_alloc(). + * This is meant to be stored in a key_restriction structure which is passed + * in the restrict_link parameter to keyring_alloc(). */ int restrict_link_reject(struct key *keyring, const struct key_type *type, @@ -1220,9 +1227,10 @@ void __key_link_end(struct key *keyring, */ static int __key_link_check_restriction(struct key *keyring, struct key *key) { - if (!keyring->restrict_link) + if (!keyring->restrict_link || !keyring->restrict_link->check) return 0; - return keyring->restrict_link(keyring, key->type, &key->payload, NULL); + return keyring->restrict_link->check(keyring, key->type, &key->payload, + keyring->restrict_link->key); } /** @@ -1426,3 +1434,53 @@ do_gc: up_write(&keyring->sem); kleave(" [gc]"); } + +/* + * Garbage collect restriction pointers from a keyring. + * + * Keyring restrictions are associated with a key type, and must be cleaned + * up if the key type is unregistered. 
The restriction is altered to always + * reject additional keys so a keyring cannot be opened up by unregistering + * a key type. + * + * Not called with any keyring locks held. The keyring's key struct will not + * be deallocated under us as only our caller may deallocate it. + * + * The caller is required to hold key_types_sem and dead_type->sem. This is + * fulfilled by key_gc_keytype() holding the locks on behalf of + * key_garbage_collector(), which it invokes on a workqueue. + */ +void keyring_restriction_gc(struct key *keyring, struct key_type *dead_type) +{ + struct key_restriction *keyres; + + kenter("%x{%s}", keyring->serial, keyring->description ?: ""); + + /* + * keyring->restrict_link is only assigned at key allocation time + * or with the key type locked, so the only values that could be + * concurrently assigned to keyring->restrict_link are for key + * types other than dead_type. Given this, it's ok to check + * the key type before acquiring keyring->sem. + */ + if (!dead_type || !keyring->restrict_link || + keyring->restrict_link->keytype != dead_type) { + kleave(" [no restriction gc]"); + return; + } + + /* Lock the keyring to ensure that a link is not in progress */ + down_write(&keyring->sem); + + keyres = keyring->restrict_link; + + keyres->check = restrict_link_reject; + + key_put(keyres->key); + keyres->key = NULL; + keyres->keytype = NULL; + + up_write(&keyring->sem); + + kleave(" [restriction gc]"); +} -- cgit v1.2.3 From efba797b977c99bc6e0c301299272c80fb8b287f Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Fri, 6 May 2016 15:38:17 -0700 Subject: KEYS: Add an optional lookup_restriction hook to key_type The restrict_link functions used to validate keys as they are linked to a keyring can be associated with specific key types. Each key type may be loaded (or not) at runtime, so lookup of restrict_link functions needs to be part of the key type implementation to ensure that the requested keys can be examined. Signed-off-by: Mat Martineau --- Documentation/security/keys.txt | 9 +++++++++ include/linux/key-type.h | 8 ++++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt index e35de987fc48..5fe04a7cc03d 100644 --- a/Documentation/security/keys.txt +++ b/Documentation/security/keys.txt @@ -1445,6 +1445,15 @@ The structure has a number of fields, some of which are mandatory: The authorisation key. + (*) struct key_restriction *(*lookup_restriction)(const char *params); + + This optional method is used to enable userspace configuration of keyring + restrictions. The restriction parameter string (not including the key type + name) is passed in, and this method returns a pointer to a key_restriction + structure containing the relevant functions and data to evaluate each + attempted key link operation. If there is no match, -EINVAL is returned. 
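+
+     As a rough sketch (not part of this patch), a key type could wire the
+     hook up as follows; the "reject_all" parameter string and the function
+     name are invented for illustration:
+
+	static struct key_restriction *example_lookup_restriction(
+		const char *params)
+	{
+		struct key_restriction *keyres;
+
+		/* invented parameter string, for illustration only */
+		if (!params || strcmp(params, "reject_all") != 0)
+			return ERR_PTR(-EINVAL);
+
+		keyres = kzalloc(sizeof(*keyres), GFP_KERNEL);
+		if (!keyres)
+			return ERR_PTR(-ENOMEM);
+
+		/* restrict_link_reject refuses every link attempt */
+		keyres->check = restrict_link_reject;
+		return keyres;
+	}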
+ + ============================ REQUEST-KEY CALLBACK SERVICE ============================ diff --git a/include/linux/key-type.h b/include/linux/key-type.h index eaee981c5558..8496cf64575c 100644 --- a/include/linux/key-type.h +++ b/include/linux/key-type.h @@ -147,6 +147,14 @@ struct key_type { */ request_key_actor_t request_key; + /* Look up a keyring access restriction (optional) + * + * - NULL is a valid return value (meaning the requested restriction + * is known but will never block addition of a key) + * - should return -EINVAL if the restriction is unknown + */ + struct key_restriction *(*lookup_restriction)(const char *params); + /* internal fields */ struct list_head link; /* link in types list */ struct lock_class_key lock_class; /* key->sem lock class */ -- cgit v1.2.3 From 6563c91fd645556c7801748f15bc727c77fcd311 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Wed, 1 Mar 2017 16:44:09 -0800 Subject: KEYS: Add KEYCTL_RESTRICT_KEYRING Keyrings recently gained restrict_link capabilities that allow individual keys to be validated prior to linking. This functionality was only available using internal kernel APIs. With the KEYCTL_RESTRICT_KEYRING command existing keyrings can be configured to check the content of keys before they are linked, and then allow or disallow linkage of that key to the keyring. To restrict a keyring, call: keyctl(KEYCTL_RESTRICT_KEYRING, key_serial_t keyring, const char *type, const char *restriction) where 'type' is the name of a registered key type and 'restriction' is a string describing how key linkage is to be restricted. The restriction option syntax is specific to each key type. Signed-off-by: Mat Martineau --- Documentation/security/keys.txt | 25 ++++++++++ include/linux/key.h | 6 ++- include/uapi/linux/keyctl.h | 1 + security/keys/compat.c | 4 ++ security/keys/internal.h | 3 ++ security/keys/keyctl.c | 58 ++++++++++++++++++++++ security/keys/keyring.c | 105 ++++++++++++++++++++++++++++++++++++++++ 7 files changed, 201 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt index 5fe04a7cc03d..5f554aab8751 100644 --- a/Documentation/security/keys.txt +++ b/Documentation/security/keys.txt @@ -857,6 +857,31 @@ The keyctl syscall functions are: supported, error ENOKEY if the key could not be found, or error EACCES if the key is not readable by the caller. + (*) Restrict keyring linkage + + long keyctl(KEYCTL_RESTRICT_KEYRING, key_serial_t keyring, + const char *type, const char *restriction); + + An existing keyring can restrict linkage of additional keys by evaluating + the contents of the key according to a restriction scheme. + + "keyring" is the key ID for an existing keyring to apply a restriction + to. It may be empty or may already have keys linked. Existing linked keys + will remain in the keyring even if the new restriction would reject them. + + "type" is a registered key type. + + "restriction" is a string describing how key linkage is to be restricted. + The format varies depending on the key type, and the string is passed to + the lookup_restriction() function for the requested type. It may specify + a method and relevant data for the restriction such as signature + verification or constraints on key payload. If the requested key type is + later unregistered, no keys may be added to the keyring after the key type + is removed. 
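+
+     As an illustration only (the restriction string below is hypothetical;
+     valid strings depend entirely on the key type's lookup_restriction()
+     implementation):
+
+	keyctl(KEYCTL_RESTRICT_KEYRING, keyring,
+	       "asymmetric", "key_or_keyring:0");
+
+     After a successful call, every subsequent link attempt is checked and
+     only keys accepted by the named restriction may be added.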
+ + To apply a keyring restriction the process must have Set Attribute + permission and the keyring must not be previously restricted. + =============== KERNEL SERVICES =============== diff --git a/include/linux/key.h b/include/linux/key.h index d2916363689c..0c9b93b0d1f7 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -219,7 +219,8 @@ struct key { /* This is set on a keyring to restrict the addition of a link to a key * to it. If this structure isn't provided then it is assumed that the * keyring is open to any addition. It is ignored for non-keyring - * keys. + * keys. Only set this value using keyring_restrict(), keyring_alloc(), + * or key_alloc(). * * This is intended for use with rings of trusted keys whereby addition * to the keyring needs to be controlled. KEY_ALLOC_BYPASS_RESTRICTION @@ -328,6 +329,9 @@ extern key_ref_t keyring_search(key_ref_t keyring, extern int keyring_add_key(struct key *keyring, struct key *key); +extern int keyring_restrict(key_ref_t keyring, const char *type, + const char *restriction); + extern struct key *key_lookup(key_serial_t id); static inline key_serial_t key_serial(const struct key *key) diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h index 86eddd6241f3..ff79c44e49a3 100644 --- a/include/uapi/linux/keyctl.h +++ b/include/uapi/linux/keyctl.h @@ -60,6 +60,7 @@ #define KEYCTL_INVALIDATE 21 /* invalidate a key */ #define KEYCTL_GET_PERSISTENT 22 /* get a user's persistent keyring */ #define KEYCTL_DH_COMPUTE 23 /* Compute Diffie-Hellman values */ +#define KEYCTL_RESTRICT_KEYRING 29 /* Restrict keys allowed to link to a keyring */ /* keyctl structures */ struct keyctl_dh_params { diff --git a/security/keys/compat.c b/security/keys/compat.c index 36c80bf5b89c..bb98f2b8dd7d 100644 --- a/security/keys/compat.c +++ b/security/keys/compat.c @@ -136,6 +136,10 @@ COMPAT_SYSCALL_DEFINE5(keyctl, u32, option, return keyctl_dh_compute(compat_ptr(arg2), compat_ptr(arg3), arg4, compat_ptr(arg5)); + case KEYCTL_RESTRICT_KEYRING: + return keyctl_restrict_keyring(arg2, compat_ptr(arg3), + compat_ptr(arg4)); + default: return -EOPNOTSUPP; } diff --git a/security/keys/internal.h b/security/keys/internal.h index 24762ae9a198..6ce016314897 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -252,6 +252,9 @@ struct iov_iter; extern long keyctl_instantiate_key_common(key_serial_t, struct iov_iter *, key_serial_t); +extern long keyctl_restrict_keyring(key_serial_t id, + const char __user *_type, + const char __user *_restriction); #ifdef CONFIG_PERSISTENT_KEYRINGS extern long keyctl_get_persistent(uid_t, key_serial_t); extern unsigned persistent_keyring_expiry; diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 52c34532c785..6ee2826a2d06 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1582,6 +1582,59 @@ error_keyring: return ret; } +/* + * Apply a restriction to a given keyring. + * + * The caller must have Setattr permission to change keyring restrictions. + * + * The requested type name may be a NULL pointer to reject all attempts + * to link to the keyring. If _type is non-NULL, _restriction can be + * NULL or a pointer to a string describing the restriction. If _type is + * NULL, _restriction must also be NULL. + * + * Returns 0 if successful. 
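+ *
+ * For example, passing a NULL type pointer converts the keyring into one
+ * that rejects all further link attempts:
+ *
+ *	keyctl_restrict_keyring(id, NULL, NULL);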
+ */ +long keyctl_restrict_keyring(key_serial_t id, const char __user *_type, + const char __user *_restriction) +{ + key_ref_t key_ref; + bool link_reject = !_type; + char type[32]; + char *restriction = NULL; + long ret; + + key_ref = lookup_user_key(id, 0, KEY_NEED_SETATTR); + if (IS_ERR(key_ref)) + return PTR_ERR(key_ref); + + if (_type) { + ret = key_get_type_from_user(type, _type, sizeof(type)); + if (ret < 0) + goto error; + } + + if (_restriction) { + if (!_type) { + ret = -EINVAL; + goto error; + } + + restriction = strndup_user(_restriction, PAGE_SIZE); + if (IS_ERR(restriction)) { + ret = PTR_ERR(restriction); + goto error; + } + } + + ret = keyring_restrict(key_ref, link_reject ? NULL : type, restriction); + kfree(restriction); + +error: + key_ref_put(key_ref); + + return ret; +} + /* * The key control system call */ @@ -1693,6 +1746,11 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3, (char __user *) arg3, (size_t) arg4, (void __user *) arg5); + case KEYCTL_RESTRICT_KEYRING: + return keyctl_restrict_keyring((key_serial_t) arg2, + (const char __user *) arg3, + (const char __user *) arg4); + default: return -EOPNOTSUPP; } diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 838334fec6ce..4d1678e4586f 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -947,6 +947,111 @@ key_ref_t keyring_search(key_ref_t keyring, } EXPORT_SYMBOL(keyring_search); +static struct key_restriction *keyring_restriction_alloc( + key_restrict_link_func_t check) +{ + struct key_restriction *keyres = + kzalloc(sizeof(struct key_restriction), GFP_KERNEL); + + if (!keyres) + return ERR_PTR(-ENOMEM); + + keyres->check = check; + + return keyres; +} + +/* + * Semaphore to serialise restriction setup to prevent reference count + * cycles through restriction key pointers. + */ +static DECLARE_RWSEM(keyring_serialise_restrict_sem); + +/* + * Check for restriction cycles that would prevent keyring garbage collection. + * keyring_serialise_restrict_sem must be held. 
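+ *
+ * A cycle (say, keyring A restricted via a key held in keyring B while B is
+ * in turn restricted via A) would pin the reference counts so that neither
+ * keyring could ever be freed; keyring_restrict() below refuses such a
+ * configuration with -EDEADLK.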
+ */ +static bool keyring_detect_restriction_cycle(const struct key *dest_keyring, + struct key_restriction *keyres) +{ + while (keyres && keyres->key && + keyres->key->type == &key_type_keyring) { + if (keyres->key == dest_keyring) + return true; + + keyres = keyres->key->restrict_link; + } + + return false; +} + +/** + * keyring_restrict - Look up and apply a restriction to a keyring + * + * @keyring_ref: The keyring to be restricted + * @type: The name of the key type that provides the restriction, or NULL to + * reject all further link attempts + * @restriction: The restriction options to apply to the keyring + */ +int keyring_restrict(key_ref_t keyring_ref, const char *type, + const char *restriction) +{ + struct key *keyring; + struct key_type *restrict_type = NULL; + struct key_restriction *restrict_link; + int ret = 0; + + keyring = key_ref_to_ptr(keyring_ref); + key_check(keyring); + + if (keyring->type != &key_type_keyring) + return -ENOTDIR; + + if (!type) { + restrict_link = keyring_restriction_alloc(restrict_link_reject); + } else { + restrict_type = key_type_lookup(type); + + if (IS_ERR(restrict_type)) + return PTR_ERR(restrict_type); + + if (!restrict_type->lookup_restriction) { + ret = -ENOENT; + goto error; + } + + restrict_link = restrict_type->lookup_restriction(restriction); + } + + if (IS_ERR(restrict_link)) { + ret = PTR_ERR(restrict_link); + goto error; + } + + down_write(&keyring->sem); + down_write(&keyring_serialise_restrict_sem); + + if (keyring->restrict_link) + ret = -EEXIST; + else if (keyring_detect_restriction_cycle(keyring, restrict_link)) + ret = -EDEADLK; + else + keyring->restrict_link = restrict_link; + + up_write(&keyring_serialise_restrict_sem); + up_write(&keyring->sem); + + if (ret < 0) { + key_put(restrict_link->key); + kfree(restrict_link); + } + +error: + if (restrict_type) + key_type_put(restrict_type); + + return ret; +} +EXPORT_SYMBOL(keyring_restrict); + /* * Search the given keyring for a key that might be updated. * -- cgit v1.2.3 From 96dc4f9fb64690fc34410415fd1fc609cf803f61 Mon Sep 17 00:00:00 2001 From: Sahara Date: Thu, 16 Feb 2017 18:29:15 +0000 Subject: usercopy: Move enum for arch_within_stack_frames() This patch moves the arch_within_stack_frames() return value enum up in the header files so that per-architecture implementations can reuse the same return values. Signed-off-by: Sahara Signed-off-by: James Morse [kees: adjusted naming and commit log] Signed-off-by: Kees Cook --- arch/x86/include/asm/thread_info.h | 13 +++++++------ include/linux/thread_info.h | 12 ++++++++++++ mm/usercopy.c | 8 +------- 3 files changed, 20 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index ad6f5eb07a95..920ca1f7adea 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -168,9 +168,9 @@ static inline unsigned long current_stack_pointer(void) * entirely contained by a single stack frame. * * Returns: - * 1 if within a frame - * -1 if placed across a frame boundary (or outside stack) - * 0 unable to determine (no frame pointers, etc) + * GOOD_FRAME if within a frame + * BAD_STACK if placed across a frame boundary (or outside stack) + * NOT_STACK unable to determine (no frame pointers, etc) */ static inline int arch_within_stack_frames(const void * const stack, const void * const stackend, @@ -197,13 +197,14 @@ static inline int arch_within_stack_frames(const void * const stack, * the copy as invalid. */ if (obj + len <= frame) - return obj >= oldframe + 2 * sizeof(void *) ? 1 : -1; + return obj >= oldframe + 2 * sizeof(void *) ?
+ GOOD_FRAME : BAD_STACK; oldframe = frame; frame = *(const void * const *)frame; } - return -1; + return BAD_STACK; #else - return 0; + return NOT_STACK; #endif } diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 58373875e8ee..0dbe41be6181 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -22,6 +22,18 @@ #endif #include + +/* + * For per-arch arch_within_stack_frames() implementations, defined in + * asm/thread_info.h. + */ +enum { + BAD_STACK = -1, + NOT_STACK = 0, + GOOD_FRAME, + GOOD_STACK, +}; + #include #ifdef __KERNEL__ diff --git a/mm/usercopy.c b/mm/usercopy.c index d155e12563b1..1eba99baf1cf 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -19,15 +19,9 @@ #include #include #include +#include #include -enum { - BAD_STACK = -1, - NOT_STACK = 0, - GOOD_FRAME, - GOOD_STACK, -}; - /* * Checks if a given pointer and length is contained by the current * stack frame (if possible). -- cgit v1.2.3 From f1c316a3ab9d24df6022682422fe897492f2c0c8 Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Fri, 19 Aug 2016 20:39:09 +0200 Subject: KEYS: add SP800-56A KDF support for DH SP800-56A defines the use of DH with key derivation function based on a counter. The input to the KDF is defined as (DH shared secret || other information). The value for the "other information" is to be provided by the caller. The KDF is implemented using the hash support from the kernel crypto API. The implementation uses the symmetric hash support as the input to the hash operation is usually very small. The caller is allowed to specify the hash name that he wants to use to derive the key material allowing the use of all supported hashes provided with the kernel crypto API. As the KDF implements the proper truncation of the DH shared secret to the requested size, this patch fills the caller buffer up to its size. The patch is tested with a new test added to the keyutils user space code which uses a CAVS test vector testing the compliance with SP800-56A. Signed-off-by: Stephan Mueller Signed-off-by: David Howells --- Documentation/security/keys.txt | 34 +++++-- include/linux/compat.h | 7 ++ include/uapi/linux/keyctl.h | 7 ++ security/keys/Kconfig | 1 + security/keys/Makefile | 3 +- security/keys/compat.c | 5 +- security/keys/compat_dh.c | 38 +++++++ security/keys/dh.c | 220 +++++++++++++++++++++++++++++++++++++--- security/keys/internal.h | 24 ++++- security/keys/keyctl.c | 2 +- 10 files changed, 315 insertions(+), 26 deletions(-) create mode 100644 security/keys/compat_dh.c (limited to 'include/linux') diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt index 5f554aab8751..cd5019934d7f 100644 --- a/Documentation/security/keys.txt +++ b/Documentation/security/keys.txt @@ -827,7 +827,7 @@ The keyctl syscall functions are: long keyctl(KEYCTL_DH_COMPUTE, struct keyctl_dh_params *params, char *buffer, size_t buflen, - void *reserved); + struct keyctl_kdf_params *kdf); The params struct contains serial numbers for three keys: @@ -844,18 +844,36 @@ The keyctl syscall functions are: public key. If the base is the remote public key, the result is the shared secret. - The reserved argument must be set to NULL. + If the parameter kdf is NULL, the following applies: - The buffer length must be at least the length of the prime, or zero. + - The buffer length must be at least the length of the prime, or zero. - If the buffer length is nonzero, the length of the result is - returned when it is successfully calculated and copied in to the - buffer. 
When the buffer length is zero, the minimum required - buffer length is returned. + + - If the buffer length is nonzero, the length of the result is + returned when it is successfully calculated and copied into the + buffer. When the buffer length is zero, the minimum required + buffer length is returned. + + The kdf parameter allows the caller to apply a key derivation function + (KDF) on the Diffie-Hellman computation where only the result + of the KDF is returned to the caller. The KDF is characterized with + struct keyctl_kdf_params as follows: + + - char *hashname specifies the NUL terminated string identifying + the hash used from the kernel crypto API and applied for the KDF + operation. The KDF implementation complies with SP800-56A as well + as with SP800-108 (the counter KDF). + + - char *otherinfo specifies the OtherInfo data as documented in + SP800-56A section 5.8.1.2. The length of the buffer is given with + otherinfolen. The format of OtherInfo is defined by the caller. + The otherinfo pointer may be NULL if no OtherInfo shall be used. This function will return error EOPNOTSUPP if the key type is not supported, error ENOKEY if the key could not be found, or error - EACCES if the key is not readable by the caller. + EACCES if the key is not readable by the caller. In addition, the + function will return EMSGSIZE when the parameter kdf is non-NULL + and either the buffer length or the OtherInfo length exceeds the + allowed length. (*) Restrict keyring linkage diff --git a/include/linux/compat.h b/include/linux/compat.h index aef47be2a5c1..993c87182e02 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -295,6 +295,13 @@ struct compat_old_sigaction { }; #endif +struct compat_keyctl_kdf_params { + compat_uptr_t hashname; + compat_uptr_t otherinfo; + __u32 otherinfolen; + __u32 __spare[8]; +}; + struct compat_statfs; struct compat_statfs64; struct compat_old_linux_dirent; diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h index ff79c44e49a3..201c6644b237 100644 --- a/include/uapi/linux/keyctl.h +++ b/include/uapi/linux/keyctl.h @@ -69,4 +69,11 @@ struct keyctl_dh_params { __s32 base; }; +struct keyctl_kdf_params { + char *hashname; + char *otherinfo; + __u32 otherinfolen; + __u32 __spare[8]; +}; + #endif /* _LINUX_KEYCTL_H */ diff --git a/security/keys/Kconfig b/security/keys/Kconfig index d942c7c2bc0a..4ac1b83a23f8 100644 --- a/security/keys/Kconfig +++ b/security/keys/Kconfig @@ -90,6 +90,7 @@ config KEY_DH_OPERATIONS bool "Diffie-Hellman operations on retained keys" depends on KEYS select MPILIB + select CRYPTO_HASH help This option provides support for calculating Diffie-Hellman public keys and shared secrets using values stored as keys diff --git a/security/keys/Makefile b/security/keys/Makefile index 1fd4a16e6daf..57dff0c15809 100644 --- a/security/keys/Makefile +++ b/security/keys/Makefile @@ -15,7 +15,8 @@ obj-y := \ request_key.o \ request_key_auth.o \ user_defined.o -obj-$(CONFIG_KEYS_COMPAT) += compat.o +compat-obj-$(CONFIG_KEY_DH_OPERATIONS) += compat_dh.o +obj-$(CONFIG_KEYS_COMPAT) += compat.o $(compat-obj-y) obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_SYSCTL) += sysctl.o obj-$(CONFIG_PERSISTENT_KEYRINGS) += persistent.o diff --git a/security/keys/compat.c b/security/keys/compat.c index bb98f2b8dd7d..e87c89c0177c 100644 --- a/security/keys/compat.c +++ b/security/keys/compat.c @@ -133,8 +133,9 @@ COMPAT_SYSCALL_DEFINE5(keyctl, u32, option, return keyctl_get_persistent(arg2, arg3); case KEYCTL_DH_COMPUTE: - return
keyctl_dh_compute(compat_ptr(arg2), compat_ptr(arg3), - arg4, compat_ptr(arg5)); + return compat_keyctl_dh_compute(compat_ptr(arg2), + compat_ptr(arg3), + arg4, compat_ptr(arg5)); case KEYCTL_RESTRICT_KEYRING: return keyctl_restrict_keyring(arg2, compat_ptr(arg3), diff --git a/security/keys/compat_dh.c b/security/keys/compat_dh.c new file mode 100644 index 000000000000..a6a659b6bcb6 --- /dev/null +++ b/security/keys/compat_dh.c @@ -0,0 +1,38 @@ +/* 32-bit compatibility syscall for 64-bit systems for DH operations + * + * Copyright (C) 2016 Stephan Mueller + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include + +#include "internal.h" + +/* + * Perform the DH computation or DH based key derivation. + * + * If successful, 0 will be returned. + */ +long compat_keyctl_dh_compute(struct keyctl_dh_params __user *params, + char __user *buffer, size_t buflen, + struct compat_keyctl_kdf_params __user *kdf) +{ + struct keyctl_kdf_params kdfcopy; + struct compat_keyctl_kdf_params compat_kdfcopy; + + if (!kdf) + return __keyctl_dh_compute(params, buffer, buflen, NULL); + + if (copy_from_user(&compat_kdfcopy, kdf, sizeof(compat_kdfcopy)) != 0) + return -EFAULT; + + kdfcopy.hashname = compat_ptr(compat_kdfcopy.hashname); + kdfcopy.otherinfo = compat_ptr(compat_kdfcopy.otherinfo); + kdfcopy.otherinfolen = compat_kdfcopy.otherinfolen; + + return __keyctl_dh_compute(params, buffer, buflen, &kdfcopy); +} diff --git a/security/keys/dh.c b/security/keys/dh.c index 893af4c45038..e603bd912e4c 100644 --- a/security/keys/dh.c +++ b/security/keys/dh.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include "internal.h" @@ -77,9 +79,146 @@ error: return ret; } -long keyctl_dh_compute(struct keyctl_dh_params __user *params, - char __user *buffer, size_t buflen, - void __user *reserved) +struct kdf_sdesc { + struct shash_desc shash; + char ctx[]; +}; + +static int kdf_alloc(struct kdf_sdesc **sdesc_ret, char *hashname) +{ + struct crypto_shash *tfm; + struct kdf_sdesc *sdesc; + int size; + + /* allocate synchronous hash */ + tfm = crypto_alloc_shash(hashname, 0, 0); + if (IS_ERR(tfm)) { + pr_info("could not allocate digest TFM handle %s\n", hashname); + return PTR_ERR(tfm); + } + + size = sizeof(struct shash_desc) + crypto_shash_descsize(tfm); + sdesc = kmalloc(size, GFP_KERNEL); + if (!sdesc) + return -ENOMEM; + sdesc->shash.tfm = tfm; + sdesc->shash.flags = 0x0; + + *sdesc_ret = sdesc; + + return 0; +} + +static void kdf_dealloc(struct kdf_sdesc *sdesc) +{ + if (!sdesc) + return; + + if (sdesc->shash.tfm) + crypto_free_shash(sdesc->shash.tfm); + + kzfree(sdesc); +} + +/* convert 32 bit integer into its string representation */ +static inline void crypto_kw_cpu_to_be32(u32 val, u8 *buf) +{ + __be32 *a = (__be32 *)buf; + + *a = cpu_to_be32(val); +} + +/* + * Implementation of the KDF in counter mode according to SP800-108 section 5.1 + * as well as SP800-56A section 5.8.1 (Single-step KDF). + * + * SP800-56A: + * The src pointer is defined as Z || other info where Z is the shared secret + * from DH and other info is an arbitrary string (see SP800-56A section + * 5.8.1.2). 
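+ *
+ * Concretely, with H the selected hash, i a 32-bit big-endian counter
+ * starting at 1 and src = Z || OtherInfo, the output is
+ *
+ *	K(1) || K(2) || ...   where K(i) = H(i || src)
+ *
+ * truncated to the requested length (see kdf_ctr() below).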
+ */ +static int kdf_ctr(struct kdf_sdesc *sdesc, const u8 *src, unsigned int slen, + u8 *dst, unsigned int dlen) +{ + struct shash_desc *desc = &sdesc->shash; + unsigned int h = crypto_shash_digestsize(desc->tfm); + int err = 0; + u8 *dst_orig = dst; + u32 i = 1; + u8 iteration[sizeof(u32)]; + + while (dlen) { + err = crypto_shash_init(desc); + if (err) + goto err; + + crypto_kw_cpu_to_be32(i, iteration); + err = crypto_shash_update(desc, iteration, sizeof(u32)); + if (err) + goto err; + + if (src && slen) { + err = crypto_shash_update(desc, src, slen); + if (err) + goto err; + } + + if (dlen < h) { + u8 tmpbuffer[h]; + + err = crypto_shash_final(desc, tmpbuffer); + if (err) + goto err; + memcpy(dst, tmpbuffer, dlen); + memzero_explicit(tmpbuffer, h); + return 0; + } else { + err = crypto_shash_final(desc, dst); + if (err) + goto err; + + dlen -= h; + dst += h; + i++; + } + } + + return 0; + +err: + memzero_explicit(dst_orig, dlen); + return err; +} + +static int keyctl_dh_compute_kdf(struct kdf_sdesc *sdesc, + char __user *buffer, size_t buflen, + uint8_t *kbuf, size_t kbuflen) +{ + uint8_t *outbuf = NULL; + int ret; + + outbuf = kmalloc(buflen, GFP_KERNEL); + if (!outbuf) { + ret = -ENOMEM; + goto err; + } + + ret = kdf_ctr(sdesc, kbuf, kbuflen, outbuf, buflen); + if (ret) + goto err; + + ret = buflen; + if (copy_to_user(buffer, outbuf, buflen) != 0) + ret = -EFAULT; + +err: + kzfree(outbuf); + return ret; +} + +long __keyctl_dh_compute(struct keyctl_dh_params __user *params, + char __user *buffer, size_t buflen, + struct keyctl_kdf_params *kdfcopy) { long ret; MPI base, private, prime, result; @@ -88,6 +227,7 @@ long keyctl_dh_compute(struct keyctl_dh_params __user *params, uint8_t *kbuf; ssize_t keylen; size_t resultlen; + struct kdf_sdesc *sdesc = NULL; if (!params || (!buffer && buflen)) { ret = -EINVAL; @@ -98,12 +238,34 @@ long keyctl_dh_compute(struct keyctl_dh_params __user *params, goto out; } - if (reserved) { - ret = -EINVAL; - goto out; + if (kdfcopy) { + char *hashname; + + if (buflen > KEYCTL_KDF_MAX_OUTPUT_LEN || + kdfcopy->otherinfolen > KEYCTL_KDF_MAX_OI_LEN) { + ret = -EMSGSIZE; + goto out; + } + + /* get KDF name string */ + hashname = strndup_user(kdfcopy->hashname, CRYPTO_MAX_ALG_NAME); + if (IS_ERR(hashname)) { + ret = PTR_ERR(hashname); + goto out; + } + + /* allocate KDF from the kernel crypto API */ + ret = kdf_alloc(&sdesc, hashname); + kfree(hashname); + if (ret) + goto out; } - keylen = mpi_from_key(pcopy.prime, buflen, &prime); + /* + * If the caller requests postprocessing with a KDF, allow an + * arbitrary output buffer size since the KDF ensures proper truncation. + */ + keylen = mpi_from_key(pcopy.prime, kdfcopy ? SIZE_MAX : buflen, &prime); if (keylen < 0 || !prime) { /* buflen == 0 may be used to query the required buffer size, * which is the prime key length. @@ -133,12 +295,25 @@ long keyctl_dh_compute(struct keyctl_dh_params __user *params, goto error3; } - kbuf = kmalloc(resultlen, GFP_KERNEL); + /* allocate space for DH shared secret and SP800-56A otherinfo */ + kbuf = kmalloc(kdfcopy ? 
(resultlen + kdfcopy->otherinfolen) : resultlen, + GFP_KERNEL); if (!kbuf) { ret = -ENOMEM; goto error4; } + /* + * Concatenate SP800-56A otherinfo past DH shared secret -- the + * input to the KDF is (DH shared secret || otherinfo) + */ + if (kdfcopy && kdfcopy->otherinfo && + copy_from_user(kbuf + resultlen, kdfcopy->otherinfo, + kdfcopy->otherinfolen) != 0) { + ret = -EFAULT; + goto error5; + } + ret = do_dh(result, base, private, prime); if (ret) goto error5; @@ -147,12 +322,17 @@ long keyctl_dh_compute(struct keyctl_dh_params __user *params, if (ret != 0) goto error5; - ret = nbytes; - if (copy_to_user(buffer, kbuf, nbytes) != 0) - ret = -EFAULT; + if (kdfcopy) { + ret = keyctl_dh_compute_kdf(sdesc, buffer, buflen, kbuf, + resultlen + kdfcopy->otherinfolen); + } else { + ret = nbytes; + if (copy_to_user(buffer, kbuf, nbytes) != 0) + ret = -EFAULT; + } error5: - kfree(kbuf); + kzfree(kbuf); error4: mpi_free(result); error3: @@ -162,5 +342,21 @@ error2: error1: mpi_free(prime); out: + kdf_dealloc(sdesc); return ret; } + +long keyctl_dh_compute(struct keyctl_dh_params __user *params, + char __user *buffer, size_t buflen, + struct keyctl_kdf_params __user *kdf) +{ + struct keyctl_kdf_params kdfcopy; + + if (!kdf) + return __keyctl_dh_compute(params, buffer, buflen, NULL); + + if (copy_from_user(&kdfcopy, kdf, sizeof(kdfcopy)) != 0) + return -EFAULT; + + return __keyctl_dh_compute(params, buffer, buflen, &kdfcopy); +} diff --git a/security/keys/internal.h b/security/keys/internal.h index 6ce016314897..c0f8682eba69 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -18,6 +18,7 @@ #include #include #include +#include struct iovec; @@ -267,15 +268,34 @@ static inline long keyctl_get_persistent(uid_t uid, key_serial_t destring) #ifdef CONFIG_KEY_DH_OPERATIONS extern long keyctl_dh_compute(struct keyctl_dh_params __user *, char __user *, - size_t, void __user *); + size_t, struct keyctl_kdf_params __user *); +extern long __keyctl_dh_compute(struct keyctl_dh_params __user *, char __user *, + size_t, struct keyctl_kdf_params *); +#ifdef CONFIG_KEYS_COMPAT +extern long compat_keyctl_dh_compute(struct keyctl_dh_params __user *params, + char __user *buffer, size_t buflen, + struct compat_keyctl_kdf_params __user *kdf); +#endif +#define KEYCTL_KDF_MAX_OUTPUT_LEN 1024 /* max length of KDF output */ +#define KEYCTL_KDF_MAX_OI_LEN 64 /* max length of otherinfo */ #else static inline long keyctl_dh_compute(struct keyctl_dh_params __user *params, char __user *buffer, size_t buflen, - void __user *reserved) + struct keyctl_kdf_params __user *kdf) +{ + return -EOPNOTSUPP; +} + +#ifdef CONFIG_KEYS_COMPAT +static inline long compat_keyctl_dh_compute( + struct keyctl_dh_params __user *params, + char __user *buffer, size_t buflen, + struct keyctl_kdf_params __user *kdf) { return -EOPNOTSUPP; } #endif +#endif /* * Debugging key validation diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 6ee2826a2d06..10fcea154c0f 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1744,7 +1744,7 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3, case KEYCTL_DH_COMPUTE: return keyctl_dh_compute((struct keyctl_dh_params __user *) arg2, (char __user *) arg3, (size_t) arg4, - (void __user *) arg5); + (struct keyctl_kdf_params __user *) arg5); case KEYCTL_RESTRICT_KEYRING: return keyctl_restrict_keyring((key_serial_t) arg2, -- cgit v1.2.3 From 9b3fe6796d7c0e0c2b87243ce0c7f4744c54efad Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Tue, 28 Mar 
2017 08:42:49 -0700 Subject: PCI: imx6: Add code to support i.MX7D Add various bits of code needed to support i.MX7D variant of the IP. Signed-off-by: Andrey Smirnov Signed-off-by: Bjorn Helgaas Reviewed-by: Lucas Stach Acked-by: Lee Jones Acked-by: Rob Herring Cc: yurovsky@gmail.com Cc: Mark Rutland Cc: Fabio Estevam Cc: Dong Aisheng Cc: linux-arm-kernel@lists.infradead.org Cc: devicetree@vger.kernel.org --- .../devicetree/bindings/pci/fsl,imx6q-pcie.txt | 14 ++- drivers/pci/dwc/pci-imx6.c | 120 ++++++++++++++++----- include/linux/mfd/syscon/imx7-iomuxc-gpr.h | 4 + 3 files changed, 112 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt index 83aeb1f5a645..e3d5680875b1 100644 --- a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt +++ b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt @@ -4,7 +4,11 @@ This PCIe host controller is based on the Synopsis Designware PCIe IP and thus inherits all the common properties defined in designware-pcie.txt. Required properties: -- compatible: "fsl,imx6q-pcie", "fsl,imx6sx-pcie", "fsl,imx6qp-pcie" +- compatible: + - "fsl,imx6q-pcie" + - "fsl,imx6sx-pcie" + - "fsl,imx6qp-pcie" + - "fsl,imx7d-pcie" - reg: base address and length of the PCIe controller - interrupts: A list of interrupt outputs of the controller. Must contain an entry for each entry in the interrupt-names property. @@ -34,6 +38,14 @@ Additional required properties for imx6sx-pcie: - clock names: Must include the following additional entries: - "pcie_inbound_axi" +Additional required properties for imx7d-pcie: +- power-domains: Must be set to a phandle pointing to PCIE_PHY power domain +- resets: Must contain phandles to PCIe-related reset lines exposed by SRC + IP block +- reset-names: Must contain the following entries: + - "pciephy" + - "apps" + Example: pcie@0x01000000 { diff --git a/drivers/pci/dwc/pci-imx6.c b/drivers/pci/dwc/pci-imx6.c index 801e46cd266d..e0c36d6f5a21 100644 --- a/drivers/pci/dwc/pci-imx6.c +++ b/drivers/pci/dwc/pci-imx6.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include #include #include +#include #include "pcie-designware.h" @@ -36,6 +38,7 @@ enum imx6_pcie_variants { IMX6Q, IMX6SX, IMX6QP, + IMX7D, }; struct imx6_pcie { @@ -47,6 +50,8 @@ struct imx6_pcie { struct clk *pcie_inbound_axi; struct clk *pcie; struct regmap *iomuxc_gpr; + struct reset_control *pciephy_reset; + struct reset_control *apps_reset; enum imx6_pcie_variants variant; u32 tx_deemph_gen1; u32 tx_deemph_gen2_3p5db; @@ -56,6 +61,11 @@ struct imx6_pcie { int link_gen; }; +/* Parameters for waiting for the PCIe PHY PLL to lock on i.MX7 */ +#define PHY_PLL_LOCK_WAIT_MAX_RETRIES 2000 +#define PHY_PLL_LOCK_WAIT_USLEEP_MIN 50 +#define PHY_PLL_LOCK_WAIT_USLEEP_MAX 200 + /* PCIe Root Complex registers (memory-mapped) */ #define PCIE_RC_LCR 0x7c #define PCIE_RC_LCR_MAX_LINK_SPEEDS_GEN1 0x1 @@ -248,6 +258,10 @@ static int imx6q_pcie_abort_handler(unsigned long addr, static void imx6_pcie_assert_core_reset(struct imx6_pcie *imx6_pcie) { switch (imx6_pcie->variant) { + case IMX7D: + reset_control_assert(imx6_pcie->pciephy_reset); + reset_control_assert(imx6_pcie->apps_reset); + break; case IMX6SX: regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, IMX6SX_GPR12_PCIE_TEST_POWERDOWN, @@ -303,11 +317,32 @@ static int imx6_pcie_enable_ref_clk(struct imx6_pcie *imx6_pcie)
regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR1, IMX6Q_GPR1_PCIE_REF_CLK_EN, 1 << 16); break; + case IMX7D: + break; } return ret; } +static void imx7d_pcie_wait_for_phy_pll_lock(struct imx6_pcie *imx6_pcie) +{ + u32 val; + unsigned int retries; + struct device *dev = imx6_pcie->pci->dev; + + for (retries = 0; retries < PHY_PLL_LOCK_WAIT_MAX_RETRIES; retries++) { + regmap_read(imx6_pcie->iomuxc_gpr, IOMUXC_GPR22, &val); + + if (val & IMX7D_GPR22_PCIE_PHY_PLL_LOCKED) + return; + + usleep_range(PHY_PLL_LOCK_WAIT_USLEEP_MIN, + PHY_PLL_LOCK_WAIT_USLEEP_MAX); + } + + dev_err(dev, "PCIe PLL lock timeout\n"); +} + static void imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) { struct dw_pcie *pci = imx6_pcie->pci; @@ -351,6 +386,10 @@ static void imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) } switch (imx6_pcie->variant) { + case IMX7D: + reset_control_deassert(imx6_pcie->pciephy_reset); + imx7d_pcie_wait_for_phy_pll_lock(imx6_pcie); + break; case IMX6SX: regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR5, IMX6SX_GPR5_PCIE_BTNRST_RESET, 0); @@ -377,35 +416,44 @@ err_pcie_bus: static void imx6_pcie_init_phy(struct imx6_pcie *imx6_pcie) { - if (imx6_pcie->variant == IMX6SX) + switch (imx6_pcie->variant) { + case IMX7D: + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, + IMX7D_GPR12_PCIE_PHY_REFCLK_SEL, 0); + break; + case IMX6SX: regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, IMX6SX_GPR12_PCIE_RX_EQ_MASK, IMX6SX_GPR12_PCIE_RX_EQ_2); + /* FALLTHROUGH */ + default: + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, + IMX6Q_GPR12_PCIE_CTL_2, 0 << 10); - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, - IMX6Q_GPR12_PCIE_CTL_2, 0 << 10); + /* configure constant input signal to the pcie ctrl and phy */ + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, + IMX6Q_GPR12_LOS_LEVEL, 9 << 4); + + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR8, + IMX6Q_GPR8_TX_DEEMPH_GEN1, + imx6_pcie->tx_deemph_gen1 << 0); + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR8, + IMX6Q_GPR8_TX_DEEMPH_GEN2_3P5DB, + imx6_pcie->tx_deemph_gen2_3p5db << 6); + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR8, + IMX6Q_GPR8_TX_DEEMPH_GEN2_6DB, + imx6_pcie->tx_deemph_gen2_6db << 12); + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR8, + IMX6Q_GPR8_TX_SWING_FULL, + imx6_pcie->tx_swing_full << 18); + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR8, + IMX6Q_GPR8_TX_SWING_LOW, + imx6_pcie->tx_swing_low << 25); + break; + } - /* configure constant input signal to the pcie ctrl and phy */ regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, IMX6Q_GPR12_DEVICE_TYPE, PCI_EXP_TYPE_ROOT_PORT << 12); - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, - IMX6Q_GPR12_LOS_LEVEL, 9 << 4); - - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR8, - IMX6Q_GPR8_TX_DEEMPH_GEN1, - imx6_pcie->tx_deemph_gen1 << 0); - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR8, - IMX6Q_GPR8_TX_DEEMPH_GEN2_3P5DB, - imx6_pcie->tx_deemph_gen2_3p5db << 6); - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR8, - IMX6Q_GPR8_TX_DEEMPH_GEN2_6DB, - imx6_pcie->tx_deemph_gen2_6db << 12); - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR8, - IMX6Q_GPR8_TX_SWING_FULL, - imx6_pcie->tx_swing_full << 18); - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR8, - IMX6Q_GPR8_TX_SWING_LOW, - imx6_pcie->tx_swing_low << 25); } static int imx6_pcie_wait_for_link(struct imx6_pcie *imx6_pcie) @@ -469,8 +517,11 @@ static int imx6_pcie_establish_link(struct imx6_pcie *imx6_pcie) 
dw_pcie_writel_dbi(pci, PCIE_RC_LCR, tmp); /* Start LTSSM. */ - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, - IMX6Q_GPR12_PCIE_CTL_2, 1 << 10); + if (imx6_pcie->variant == IMX7D) + reset_control_deassert(imx6_pcie->apps_reset); + else + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, + IMX6Q_GPR12_PCIE_CTL_2, 1 << 10); ret = imx6_pcie_wait_for_link(imx6_pcie); if (ret) @@ -653,13 +704,31 @@ static int __init imx6_pcie_probe(struct platform_device *pdev) return PTR_ERR(imx6_pcie->pcie); } - if (imx6_pcie->variant == IMX6SX) { + switch (imx6_pcie->variant) { + case IMX6SX: imx6_pcie->pcie_inbound_axi = devm_clk_get(dev, "pcie_inbound_axi"); if (IS_ERR(imx6_pcie->pcie_inbound_axi)) { dev_err(dev, "pcie_inbound_axi clock missing or invalid\n"); return PTR_ERR(imx6_pcie->pcie_inbound_axi); } + break; + case IMX7D: + imx6_pcie->pciephy_reset = devm_reset_control_get(dev, + "pciephy"); + if (IS_ERR(imx6_pcie->pciephy_reset)) { + dev_err(dev, "Failed to get PCIEPHY reset control\n"); + return PTR_ERR(imx6_pcie->pciephy_reset); + } + + imx6_pcie->apps_reset = devm_reset_control_get(dev, "apps"); + if (IS_ERR(imx6_pcie->apps_reset)) { + dev_err(dev, "Failed to get PCIE APPS reset control\n"); + return PTR_ERR(imx6_pcie->apps_reset); + } + break; + default: + break; } /* Grab GPR config register range */ @@ -718,6 +787,7 @@ static const struct of_device_id imx6_pcie_of_match[] = { { .compatible = "fsl,imx6q-pcie", .data = (void *)IMX6Q, }, { .compatible = "fsl,imx6sx-pcie", .data = (void *)IMX6SX, }, { .compatible = "fsl,imx6qp-pcie", .data = (void *)IMX6QP, }, + { .compatible = "fsl,imx7d-pcie", .data = (void *)IMX7D, }, {}, }; diff --git a/include/linux/mfd/syscon/imx7-iomuxc-gpr.h b/include/linux/mfd/syscon/imx7-iomuxc-gpr.h index 4585d6105d68..abbd52466573 100644 --- a/include/linux/mfd/syscon/imx7-iomuxc-gpr.h +++ b/include/linux/mfd/syscon/imx7-iomuxc-gpr.h @@ -44,4 +44,8 @@ #define IMX7D_GPR5_CSI_MUX_CONTROL_MIPI (0x1 << 4) +#define IMX7D_GPR12_PCIE_PHY_REFCLK_SEL BIT(5) + +#define IMX7D_GPR22_PCIE_PHY_PLL_LOCKED BIT(31) + #endif /* __LINUX_IMX7_IOMUXC_GPR_H */ -- cgit v1.2.3 From e7f6ccaab127147c52ac4b624c54ad0059bd08e9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 28 Mar 2017 12:36:35 +0300 Subject: NFC: pn544: Get rid of platform data Legacy platform data must go away. We are on the safe side here since there are no users of it in the kernel. If anyone needs it for some odd reason, the GPIO lookup tables and built-in device properties are at your service. Signed-off-by: Andy Shevchenko Signed-off-by: Samuel Ortiz --- drivers/nfc/pn544/i2c.c | 43 ++++++------------------------------- include/linux/platform_data/pn544.h | 43 ------------------------------------- 2 files changed, 6 insertions(+), 80 deletions(-) delete mode 100644 include/linux/platform_data/pn544.h (limited to 'include/linux') diff --git a/drivers/nfc/pn544/i2c.c b/drivers/nfc/pn544/i2c.c index 7030a47fe379..4afc92a54444 100644 --- a/drivers/nfc/pn544/i2c.c +++ b/drivers/nfc/pn544/i2c.c @@ -30,7 +30,7 @@ #include #include #include -#include + #include #include @@ -972,7 +972,6 @@ static int pn544_hci_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) { struct pn544_i2c_phy *phy; - struct pn544_nfc_platform_data *pdata; int r = 0; dev_dbg(&client->dev, "%s\n", __func__); @@ -994,32 +993,13 @@ static int pn544_hci_i2c_probe(struct i2c_client *client, phy->i2c_dev = client; i2c_set_clientdata(client, phy); - pdata = client->dev.platform_data; - /* No platform data, using device tree.
*/ - if (!pdata && client->dev.of_node) { + if (client->dev.of_node) { r = pn544_hci_i2c_of_request_resources(client); if (r) { nfc_err(&client->dev, "No DT data\n"); return r; } - /* Using platform data. */ - } else if (pdata) { - - if (pdata->request_resources == NULL) { - nfc_err(&client->dev, "request_resources() missing\n"); - return -EINVAL; - } - - r = pdata->request_resources(client); - if (r) { - nfc_err(&client->dev, - "Cannot get platform resources\n"); - return r; - } - - phy->gpio_en = pdata->get_gpio(NFC_GPIO_ENABLE); - phy->gpio_fw = pdata->get_gpio(NFC_GPIO_FW_RESET); /* Using ACPI */ } else if (ACPI_HANDLE(&client->dev)) { r = pn544_hci_i2c_acpi_request_resources(client); @@ -1056,12 +1036,8 @@ err_hci: free_irq(client->irq, phy); err_rti: - if (!pdata) { - gpio_free(phy->gpio_en); - gpio_free(phy->gpio_fw); - } else if (pdata->free_resources) { - pdata->free_resources(); - } + gpio_free(phy->gpio_en); + gpio_free(phy->gpio_fw); return r; } @@ -1069,7 +1045,6 @@ err_rti: static int pn544_hci_i2c_remove(struct i2c_client *client) { struct pn544_i2c_phy *phy = i2c_get_clientdata(client); - struct pn544_nfc_platform_data *pdata = client->dev.platform_data; dev_dbg(&client->dev, "%s\n", __func__); @@ -1084,14 +1059,8 @@ static int pn544_hci_i2c_remove(struct i2c_client *client) free_irq(client->irq, phy); - /* No platform data, GPIOs have been requested by this driver */ - if (!pdata) { - gpio_free(phy->gpio_en); - gpio_free(phy->gpio_fw); - /* Using platform data */ - } else if (pdata->free_resources) { - pdata->free_resources(); - } + gpio_free(phy->gpio_en); + gpio_free(phy->gpio_fw); return 0; } diff --git a/include/linux/platform_data/pn544.h b/include/linux/platform_data/pn544.h deleted file mode 100644 index 5ce1ab983f44..000000000000 --- a/include/linux/platform_data/pn544.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Driver include for the PN544 NFC chip. - * - * Copyright (C) Nokia Corporation - * - * Author: Jari Vanhala - * Contact: Matti Aaltoenn - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ - -#ifndef _PN544_H_ -#define _PN544_H_ - -#include - -enum { - NFC_GPIO_ENABLE, - NFC_GPIO_FW_RESET, - NFC_GPIO_IRQ -}; - -/* board config */ -struct pn544_nfc_platform_data { - int (*request_resources) (struct i2c_client *client); - void (*free_resources) (void); - void (*enable) (int fw); - int (*test) (void); - void (*disable) (void); - int (*get_gpio)(int type); -}; - -#endif /* _PN544_H_ */ -- cgit v1.2.3 From 79557b33cca2fa005235b45ab16b81f95f441bd8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 7 Mar 2017 12:25:43 +0200 Subject: NFC: st21nfca: Get rid of platform data Legacy platform data must go away. We are on the safe side here since there are no users of it in the kernel. If anyone needs it for some odd reason, the GPIO lookup tables and built-in device properties are at your service.
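(A rough sketch of such a lookup table follows; the GPIO chip label, line offset and I2C device name are assumptions for illustration and do not come from this patch:)

	#include <linux/gpio/machine.h>

	/* map the "enable" pin to the st21nfca device on i2c bus 1, addr 0x01 */
	static struct gpiod_lookup_table st21nfca_gpiod_table = {
		.dev_id = "1-0001",
		.table = {
			GPIO_LOOKUP("gpiochip0", 42, "enable", GPIO_ACTIVE_HIGH),
			{ },
		},
	};

	/* registered from board init code */
	gpiod_add_lookup_table(&st21nfca_gpiod_table);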
Signed-off-by: Andy Shevchenko Signed-off-by: Samuel Ortiz --- drivers/nfc/st21nfca/i2c.c | 46 +++------------------------------- include/linux/platform_data/st21nfca.h | 33 ------------------------ 2 files changed, 3 insertions(+), 76 deletions(-) delete mode 100644 include/linux/platform_data/st21nfca.h (limited to 'include/linux') diff --git a/drivers/nfc/st21nfca/i2c.c b/drivers/nfc/st21nfca/i2c.c index c4f0d048359b..443733502251 100644 --- a/drivers/nfc/st21nfca/i2c.c +++ b/drivers/nfc/st21nfca/i2c.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -29,7 +28,7 @@ #include #include #include -#include + #include #include @@ -59,6 +58,7 @@ #define IS_START_OF_FRAME(buf) (buf[0] == ST21NFCA_SOF_EOF && \ buf[1] == 0) +#define ST21NFCA_HCI_DRIVER_NAME "st21nfca_hci" #define ST21NFCA_HCI_I2C_DRIVER_NAME "st21nfca_hci_i2c" #define ST21NFCA_GPIO_NAME_EN "enable" @@ -576,43 +576,10 @@ static int st21nfca_hci_i2c_of_request_resources(struct i2c_client *client) return 0; } -static int st21nfca_hci_i2c_request_resources(struct i2c_client *client) -{ - struct st21nfca_nfc_platform_data *pdata; - struct st21nfca_i2c_phy *phy = i2c_get_clientdata(client); - int r; - - pdata = client->dev.platform_data; - if (pdata == NULL) { - nfc_err(&client->dev, "No platform data\n"); - return -EINVAL; - } - - /* store for later use */ - phy->gpio_ena = pdata->gpio_ena; - phy->irq_polarity = pdata->irq_polarity; - - if (phy->gpio_ena > 0) { - r = devm_gpio_request_one(&client->dev, phy->gpio_ena, - GPIOF_OUT_INIT_HIGH, - ST21NFCA_GPIO_NAME_EN); - if (r) { - pr_err("%s : ena gpio_request failed\n", __FILE__); - return r; - } - } - - phy->se_status.is_ese_present = pdata->is_ese_present; - phy->se_status.is_uicc_present = pdata->is_uicc_present; - - return 0; -} - static int st21nfca_hci_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) { struct st21nfca_i2c_phy *phy; - struct st21nfca_nfc_platform_data *pdata; int r; dev_dbg(&client->dev, "%s\n", __func__); @@ -638,19 +605,12 @@ static int st21nfca_hci_i2c_probe(struct i2c_client *client, mutex_init(&phy->phy_lock); i2c_set_clientdata(client, phy); - pdata = client->dev.platform_data; - if (!pdata && client->dev.of_node) { + if (client->dev.of_node) { r = st21nfca_hci_i2c_of_request_resources(client); if (r) { nfc_err(&client->dev, "No platform data\n"); return r; } - } else if (pdata) { - r = st21nfca_hci_i2c_request_resources(client); - if (r) { - nfc_err(&client->dev, "Cannot get platform resources\n"); - return r; - } } else if (ACPI_HANDLE(&client->dev)) { r = st21nfca_hci_i2c_acpi_request_resources(client); if (r) { diff --git a/include/linux/platform_data/st21nfca.h b/include/linux/platform_data/st21nfca.h deleted file mode 100644 index cc2bdafb0c69..000000000000 --- a/include/linux/platform_data/st21nfca.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Driver include for the ST21NFCA NFC chip. - * - * Copyright (C) 2014 STMicroelectronics SAS. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ - -#ifndef _ST21NFCA_HCI_H_ -#define _ST21NFCA_HCI_H_ - -#include - -#define ST21NFCA_HCI_DRIVER_NAME "st21nfca_hci" - -struct st21nfca_nfc_platform_data { - unsigned int gpio_ena; - unsigned int irq_polarity; - bool is_ese_present; - bool is_uicc_present; -}; - -#endif /* _ST21NFCA_HCI_H_ */ -- cgit v1.2.3 From 6e7300cff1c410dde7ac4354b6a0a8cb0a561e54 Mon Sep 17 00:00:00 2001 From: Bhupesh Sharma Date: Tue, 4 Apr 2017 17:02:41 +0100 Subject: efi/bgrt: Enable ACPI BGRT handling on arm64 Now that the ACPI BGRT handling code has been made generic, we can enable it for arm64. Signed-off-by: Bhupesh Sharma [ Updated commit log to reflect that BGRT is only enabled for arm64, and added missing 'return' statement to the dummy acpi_parse_bgrt() function. ] Signed-off-by: Ard Biesheuvel Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20170404160245.27812-8-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- arch/arm64/kernel/acpi.c | 3 +++ arch/x86/kernel/acpi/boot.c | 6 ------ drivers/acpi/Kconfig | 2 +- drivers/acpi/bgrt.c | 6 ++++++ include/linux/efi-bgrt.h | 5 +++++ 5 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index 64d9cbd61678..e25c11e727fe 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -233,6 +234,8 @@ done: early_init_dt_scan_chosen_stdout(); } else { parse_spcr(earlycon_init_is_deferred); + if (IS_ENABLED(CONFIG_ACPI_BGRT)) + acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt); } } diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index b2879cc23db4..70854988a963 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1564,12 +1564,6 @@ int __init early_acpi_boot_init(void) return 0; } -static int __init acpi_parse_bgrt(struct acpi_table_header *table) -{ - efi_bgrt_init(table); - return 0; -} - int __init acpi_boot_init(void) { /* those are executed after early-quirks are executed */ diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 83e5f7e1a20d..dad02c0f21b9 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -440,7 +440,7 @@ config ACPI_CUSTOM_METHOD config ACPI_BGRT bool "Boottime Graphics Resource Table support" - depends on EFI && X86 + depends on EFI && (X86 || ARM64) help This driver adds support for exposing the ACPI Boottime Graphics Resource Table, which allows the operating system to obtain diff --git a/drivers/acpi/bgrt.c b/drivers/acpi/bgrt.c index ca28aa572aa9..df1c629205e7 100644 --- a/drivers/acpi/bgrt.c +++ b/drivers/acpi/bgrt.c @@ -81,6 +81,12 @@ static struct attribute_group bgrt_attribute_group = { .bin_attrs = bgrt_bin_attributes, }; +int __init acpi_parse_bgrt(struct acpi_table_header *table) +{ + efi_bgrt_init(table); + return 0; +} + static int __init bgrt_init(void) { int ret; diff --git a/include/linux/efi-bgrt.h b/include/linux/efi-bgrt.h index 2fd3993c370b..e6f624b53c3d 100644 --- a/include/linux/efi-bgrt.h +++ b/include/linux/efi-bgrt.h @@ -6,6 +6,7 @@ #ifdef CONFIG_ACPI_BGRT void efi_bgrt_init(struct acpi_table_header *table); +int __init acpi_parse_bgrt(struct acpi_table_header *table); /* The BGRT data itself; only valid if bgrt_image != NULL. 
*/ extern size_t bgrt_image_size; @@ -14,6 +15,10 @@ extern struct acpi_table_bgrt bgrt_tab; #else /* !CONFIG_ACPI_BGRT */ static inline void efi_bgrt_init(struct acpi_table_header *table) {} +static inline int __init acpi_parse_bgrt(struct acpi_table_header *table) +{ + return 0; +} #endif /* !CONFIG_ACPI_BGRT */ -- cgit v1.2.3 From 60f38de7a8d4e816100ceafd1b382df52527bd50 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 4 Apr 2017 17:09:08 +0100 Subject: efi/libstub: Unify command line param parsing Merge the parsing of the command line carried out in arm-stub.c with the handling in efi_parse_options(). Note that this also fixes the missing handling of CONFIG_CMDLINE_FORCE=y, in which case the builtin command line should supersede the one passed by the firmware. Signed-off-by: Ard Biesheuvel Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bhe@redhat.com Cc: bhsharma@redhat.com Cc: bp@alien8.de Cc: eugene@hp.com Cc: evgeny.kalugin@intel.com Cc: jhugo@codeaurora.org Cc: leif.lindholm@linaro.org Cc: linux-efi@vger.kernel.org Cc: mark.rutland@arm.com Cc: roy.franz@cavium.com Cc: rruigrok@codeaurora.org Link: http://lkml.kernel.org/r/20170404160910.28115-1-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- drivers/firmware/efi/libstub/arm-stub.c | 24 +++++++----------------- drivers/firmware/efi/libstub/arm64-stub.c | 4 +--- drivers/firmware/efi/libstub/efi-stub-helper.c | 19 +++++++++++-------- drivers/firmware/efi/libstub/efistub.h | 2 ++ include/linux/efi.h | 2 +- 5 files changed, 22 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c index 02049ff25c6b..ac3222f6f805 100644 --- a/drivers/firmware/efi/libstub/arm-stub.c +++ b/drivers/firmware/efi/libstub/arm-stub.c @@ -18,8 +18,6 @@ #include "efistub.h" -bool __nokaslr; - efi_status_t efi_open_volume(efi_system_table_t *sys_table_arg, void *__image, void **__fh) { @@ -153,18 +151,6 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, goto fail; } - /* check whether 'nokaslr' was passed on the command line */ - if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { - static const u8 default_cmdline[] = CONFIG_CMDLINE; - const u8 *str, *cmdline = cmdline_ptr; - - if (IS_ENABLED(CONFIG_CMDLINE_FORCE)) - cmdline = default_cmdline; - str = strstr(cmdline, "nokaslr"); - if (str == cmdline || (str > cmdline && *(str - 1) == ' ')) - __nokaslr = true; - } - si = setup_graphics(sys_table); status = handle_kernel_image(sys_table, image_addr, &image_size, @@ -176,9 +162,13 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, goto fail_free_cmdline; } - status = efi_parse_options(cmdline_ptr); - if (status != EFI_SUCCESS) - pr_efi_err(sys_table, "Failed to parse EFI cmdline options\n"); + if (IS_ENABLED(CONFIG_CMDLINE_EXTEND) || + IS_ENABLED(CONFIG_CMDLINE_FORCE) || + cmdline_size == 0) + efi_parse_options(CONFIG_CMDLINE); + + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && cmdline_size > 0) + efi_parse_options(cmdline_ptr); secure_boot = efi_get_secureboot(sys_table); diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index eae693eb3e91..b4c2589d7c91 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -16,8 +16,6 @@ #include "efistub.h" -extern bool __nokaslr; - efi_status_t check_platform_features(efi_system_table_t *sys_table_arg) { u64 tg; @@ -52,7 +50,7 @@ efi_status_t 
handle_kernel_image(efi_system_table_t *sys_table_arg, u64 phys_seed = 0; if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { - if (!__nokaslr) { + if (!nokaslr()) { status = efi_get_random_bytes(sys_table_arg, sizeof(phys_seed), (u8 *)&phys_seed); diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 3290fae0b38f..2e17d2b8787c 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -32,6 +32,13 @@ static unsigned long __chunk_size = EFI_READ_CHUNK_SIZE; +static int __section(.data) __nokaslr; + +int __pure nokaslr(void) +{ + return __nokaslr; +} + #define EFI_MMAP_NR_SLACK_SLOTS 8 struct file_info { @@ -409,17 +416,13 @@ static efi_status_t efi_file_close(void *handle) * environments, first in the early boot environment of the EFI boot * stub, and subsequently during the kernel boot. */ -efi_status_t efi_parse_options(char *cmdline) +efi_status_t efi_parse_options(char const *cmdline) { char *str; - /* - * Currently, the only efi= option we look for is 'nochunk', which - * is intended to work around known issues on certain x86 UEFI - * versions. So ignore for now on other architectures. - */ - if (!IS_ENABLED(CONFIG_X86)) - return EFI_SUCCESS; + str = strstr(cmdline, "nokaslr"); + if (str == cmdline || (str && str > cmdline && *(str - 1) == ' ')) + __nokaslr = 1; /* * If no EFI parameters were specified on the cmdline we've got diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 71c4d0e3c4ed..a7a2a2c3f199 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -24,6 +24,8 @@ #define EFI_ALLOC_ALIGN EFI_PAGE_SIZE #endif +extern int __pure nokaslr(void); + void efi_char16_printk(efi_system_table_t *, efi_char16_t *); efi_status_t efi_open_volume(efi_system_table_t *sys_table_arg, void *__image, diff --git a/include/linux/efi.h b/include/linux/efi.h index 94d34e0be24f..e485e87615d1 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1471,7 +1471,7 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg, unsigned long *load_addr, unsigned long *load_size); -efi_status_t efi_parse_options(char *cmdline); +efi_status_t efi_parse_options(char const *cmdline); efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg, struct screen_info *si, efi_guid_t *proto, -- cgit v1.2.3 From eeff7d634f4750306785be709ca444140c29b043 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 4 Apr 2017 17:09:09 +0100 Subject: efi/libstub/arm/arm64: Disable debug prints on 'quiet' cmdline arg The EFI stub currently prints a number of diagnostic messages that do not carry a lot of information. Since these prints are not controlled by 'loglevel' or other command line parameters, and since they appear on the EFI framebuffer as well (if enabled), it would be nice if we could turn them off. So let's add support for the 'quiet' command line parameter in the stub, and disable the non-error prints if it is passed. 
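For example, booting with a command line along the lines of

	console=ttyAMA0 root=/dev/vda2 quiet

suppresses the stub's informational pr_efi() output while pr_efi_err() error messages still get through (illustrative parameters; only the "quiet" token is significant here).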
Signed-off-by: Ard Biesheuvel Acked-by: Mark Rutland Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bhe@redhat.com Cc: bhsharma@redhat.com Cc: bp@alien8.de Cc: eugene@hp.com Cc: evgeny.kalugin@intel.com Cc: jhugo@codeaurora.org Cc: leif.lindholm@linaro.org Cc: linux-efi@vger.kernel.org Cc: roy.franz@cavium.com Cc: rruigrok@codeaurora.org Link: http://lkml.kernel.org/r/20170404160910.28115-2-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- drivers/firmware/efi/libstub/arm-stub.c | 20 ++++++++++---------- drivers/firmware/efi/libstub/arm32-stub.c | 2 ++ drivers/firmware/efi/libstub/efi-stub-helper.c | 9 +++++++++ drivers/firmware/efi/libstub/efistub.h | 7 +++++++ drivers/firmware/efi/libstub/secureboot.c | 2 ++ include/linux/efi.h | 3 --- 6 files changed, 30 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c index ac3222f6f805..657bb72c9e0b 100644 --- a/drivers/firmware/efi/libstub/arm-stub.c +++ b/drivers/firmware/efi/libstub/arm-stub.c @@ -116,8 +116,6 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) goto fail; - pr_efi(sys_table, "Booting Linux Kernel...\n"); - status = check_platform_features(sys_table); if (status != EFI_SUCCESS) goto fail; @@ -151,6 +149,16 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, goto fail; } + if (IS_ENABLED(CONFIG_CMDLINE_EXTEND) || + IS_ENABLED(CONFIG_CMDLINE_FORCE) || + cmdline_size == 0) + efi_parse_options(CONFIG_CMDLINE); + + if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && cmdline_size > 0) + efi_parse_options(cmdline_ptr); + + pr_efi(sys_table, "Booting Linux Kernel...\n"); + si = setup_graphics(sys_table); status = handle_kernel_image(sys_table, image_addr, &image_size, @@ -162,14 +170,6 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table, goto fail_free_cmdline; } - if (IS_ENABLED(CONFIG_CMDLINE_EXTEND) || - IS_ENABLED(CONFIG_CMDLINE_FORCE) || - cmdline_size == 0) - efi_parse_options(CONFIG_CMDLINE); - - if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && cmdline_size > 0) - efi_parse_options(cmdline_ptr); - secure_boot = efi_get_secureboot(sys_table); /* diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c index 18a8b5eb55e7..becbda445913 100644 --- a/drivers/firmware/efi/libstub/arm32-stub.c +++ b/drivers/firmware/efi/libstub/arm32-stub.c @@ -9,6 +9,8 @@ #include #include +#include "efistub.h" + efi_status_t check_platform_features(efi_system_table_t *sys_table_arg) { int block; diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 2e17d2b8787c..b0184360efc6 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -33,11 +33,16 @@ static unsigned long __chunk_size = EFI_READ_CHUNK_SIZE; static int __section(.data) __nokaslr; +static int __section(.data) __quiet; int __pure nokaslr(void) { return __nokaslr; } +int __pure is_quiet(void) +{ + return __quiet; +} #define EFI_MMAP_NR_SLACK_SLOTS 8 @@ -424,6 +429,10 @@ efi_status_t efi_parse_options(char const *cmdline) if (str == cmdline || (str && str > cmdline && *(str - 1) == ' ')) __nokaslr = 1; + str = strstr(cmdline, "quiet"); + if (str == cmdline || (str && str > cmdline && *(str - 1) == ' ')) + __quiet = 1; + /* * If no EFI parameters were specified on the cmdline we've got * nothing to 
do. diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index a7a2a2c3f199..83f268c05007 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -25,6 +25,13 @@ #endif extern int __pure nokaslr(void); +extern int __pure is_quiet(void); + +#define pr_efi(sys_table, msg) do { \ + if (!is_quiet()) efi_printk(sys_table, "EFI stub: "msg); \ +} while (0) + +#define pr_efi_err(sys_table, msg) efi_printk(sys_table, "EFI stub: ERROR: "msg) void efi_char16_printk(efi_system_table_t *, efi_char16_t *); diff --git a/drivers/firmware/efi/libstub/secureboot.c b/drivers/firmware/efi/libstub/secureboot.c index 5da36e56b36a..8c34d50a4d80 100644 --- a/drivers/firmware/efi/libstub/secureboot.c +++ b/drivers/firmware/efi/libstub/secureboot.c @@ -12,6 +12,8 @@ #include #include +#include "efistub.h" + /* BIOS variables */ static const efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID; static const efi_char16_t const efi_SecureBoot_name[] = { diff --git a/include/linux/efi.h b/include/linux/efi.h index e485e87615d1..ec36f42a2add 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1435,9 +1435,6 @@ static inline int efi_runtime_map_copy(void *buf, size_t bufsz) /* prototypes shared between arch specific and generic stub code */ -#define pr_efi(sys_table, msg) efi_printk(sys_table, "EFI stub: "msg) -#define pr_efi_err(sys_table, msg) efi_printk(sys_table, "EFI stub: ERROR: "msg) void efi_printk(efi_system_table_t *sys_table_arg, char *str); void efi_free(efi_system_table_t *sys_table_arg, unsigned long size, -- cgit v1.2.3 From b2376407f98920c9b0c411948675f58a9640be35 Mon Sep 17 00:00:00 2001 From: Vic Yang Date: Fri, 24 Mar 2017 18:44:01 +0100 Subject: mfd: cros-ec: Fix host command buffer size For SPI, we can get up to 32 additional bytes for response preamble. The current overhead (2 bytes) may cause problems when we try to receive a big response. Update it to 32 bytes. Without this fix we could see a kernel BUG when we receive a big response from the Chrome EC when it is connected via SPI. Signed-off-by: Vic Yang Tested-by: Enric Balletbo i Serra Signed-off-by: Lee Jones --- include/linux/mfd/cros_ec.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h index 7a01c94496f1..3eef9fb9968a 100644 --- a/include/linux/mfd/cros_ec.h +++ b/include/linux/mfd/cros_ec.h @@ -35,10 +35,11 @@ * Max bus-specific overhead incurred by request/responses. * I2C requires 1 additional byte for requests. * I2C requires 2 additional bytes for responses. + * SPI requires up to 32 additional bytes for responses. + * */ #define EC_PROTO_VERSION_UNKNOWN 0 #define EC_MAX_REQUEST_OVERHEAD 1 -#define EC_MAX_RESPONSE_OVERHEAD 2 +#define EC_MAX_RESPONSE_OVERHEAD 32 /* * Command interface between EC and AP, for LPC, I2C and SPI interfaces. -- cgit v1.2.3 From def12888c161e6fec0702e5ec9c3962846e3a21d Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 4 Apr 2017 09:23:42 -0400 Subject: rtnl: Add support for netdev event to link messages When netdev events happen, a rtnetlink_event() handler will send messages for every event in its whitelist. These messages contain current information about a particular device, but they do not include the information about which event just happened. The consumer of the message has to try to infer this information. In some cases (ex: NETDEV_NOTIFY_PEERS), that is not possible.
This patch adds a new extension to the RTM_NEWLINK message called IFLA_EVENT that would have an encoding of which event triggered this message. This would allow the message consumer to easily determine if it is interested in a particular event or not. Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 3 +- include/uapi/linux/if_link.h | 21 ++++++++++ net/core/dev.c | 2 +- net/core/rtnetlink.c | 92 +++++++++++++++++++++++++++++++++++++++----- 4 files changed, 107 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 57e54847b0b9..0459018173cf 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -18,7 +18,8 @@ extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, - unsigned change, gfp_t flags); + unsigned change, unsigned long event, + gfp_t flags); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8b405afb2376..97f6d302f627 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -157,6 +157,7 @@ enum { IFLA_GSO_MAX_SIZE, IFLA_PAD, IFLA_XDP, + IFLA_EVENT, __IFLA_MAX }; @@ -899,4 +900,24 @@ enum { #define IFLA_XDP_MAX (__IFLA_XDP_MAX - 1) +enum { + IFLA_EVENT_UNSPEC, + IFLA_EVENT_REBOOT, + IFLA_EVENT_CHANGE_MTU, + IFLA_EVENT_CHANGE_ADDR, + IFLA_EVENT_CHANGE_NAME, + IFLA_EVENT_FEAT_CHANGE, + IFLA_EVENT_BONDING_FAILOVER, + IFLA_EVENT_POST_TYPE_CHANGE, + IFLA_EVENT_NOTIFY_PEERS, + IFLA_EVENT_CHANGE_UPPER, + IFLA_EVENT_RESEND_IGMP, + IFLA_EVENT_PRE_CHANGE_MTU, + IFLA_EVENT_CHANGE_INFO_DATA, + IFLA_EVENT_PRE_CHANGE_UPPER, + IFLA_EVENT_CHANGE_LOWER_STATE, + IFLA_EVENT_UDP_TUNNEL_PUSH_INFO, + IFLA_EVENT_CHANGE_TX_QUEUE_LEN, +}; + #endif /* _UAPI_LINUX_IF_LINK_H */ diff --git a/net/core/dev.c b/net/core/dev.c index ef9fe60ee294..7efb4178ffef 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6840,7 +6840,7 @@ static void rollback_registered_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL); /* diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 58419da7961b..b2bd4c9ee860 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -944,6 +944,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + rtnl_xdp_size(dev) /* IFLA_XDP */ + + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1276,9 +1277,70 @@ err_cancel: return err; } +static int rtnl_fill_link_event(struct sk_buff *skb, unsigned long event) +{ + u32 rtnl_event; + + switch (event) { + case NETDEV_REBOOT: + rtnl_event = IFLA_EVENT_REBOOT; + break; + case NETDEV_CHANGEMTU: + rtnl_event = IFLA_EVENT_CHANGE_MTU; + break; + case NETDEV_CHANGEADDR: + rtnl_event = IFLA_EVENT_CHANGE_ADDR; + break; + case NETDEV_CHANGENAME: + rtnl_event = IFLA_EVENT_CHANGE_NAME; + break; + case NETDEV_FEAT_CHANGE: + rtnl_event = IFLA_EVENT_FEAT_CHANGE; + break; + case NETDEV_BONDING_FAILOVER: + rtnl_event = IFLA_EVENT_BONDING_FAILOVER; + break; + case
NETDEV_POST_TYPE_CHANGE: + rtnl_event = IFLA_EVENT_POST_TYPE_CHANGE; + break; + case NETDEV_NOTIFY_PEERS: + rtnl_event = IFLA_EVENT_NOTIFY_PEERS; + break; + case NETDEV_CHANGEUPPER: + rtnl_event = IFLA_EVENT_CHANGE_UPPER; + break; + case NETDEV_RESEND_IGMP: + rtnl_event = IFLA_EVENT_RESEND_IGMP; + break; + case NETDEV_PRECHANGEMTU: + rtnl_event = IFLA_EVENT_PRE_CHANGE_MTU; + break; + case NETDEV_CHANGEINFODATA: + rtnl_event = IFLA_EVENT_CHANGE_INFO_DATA; + break; + case NETDEV_PRECHANGEUPPER: + rtnl_event = IFLA_EVENT_PRE_CHANGE_UPPER; + break; + case NETDEV_CHANGELOWERSTATE: + rtnl_event = IFLA_EVENT_CHANGE_LOWER_STATE; + break; + case NETDEV_UDP_TUNNEL_PUSH_INFO: + rtnl_event = IFLA_EVENT_UDP_TUNNEL_PUSH_INFO; + break; + case NETDEV_CHANGE_TX_QUEUE_LEN: + rtnl_event = IFLA_EVENT_CHANGE_TX_QUEUE_LEN; + break; + default: + return 0; + } + + return nla_put_u32(skb, IFLA_EVENT, rtnl_event); +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, - unsigned int flags, u32 ext_filter_mask) + unsigned int flags, u32 ext_filter_mask, + unsigned long event) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -1327,6 +1389,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) goto nla_put_failure; + if (rtnl_fill_link_event(skb, event)) + goto nla_put_failure; + if (rtnl_fill_link_ifmap(skb, dev)) goto nla_put_failure; @@ -1461,6 +1526,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, [IFLA_XDP] = { .type = NLA_NESTED }, + [IFLA_EVENT] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1619,7 +1685,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, - ext_filter_mask); + ext_filter_mask, 0); /* If we ran out of room on the first message, * we're in trouble */ @@ -2710,7 +2776,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh) return -ENOBUFS; err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, 0, 0, ext_filter_mask); + nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -2782,7 +2848,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) } struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, - unsigned int change, gfp_t flags) + unsigned int change, + unsigned long event, gfp_t flags) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -2793,7 +2860,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, if (skb == NULL) goto errout; - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0); + err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -2814,18 +2881,25 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); } -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, - gfp_t flags) +static void rtmsg_ifinfo_event(int type, struct net_device *dev, + unsigned int change, unsigned long event, + gfp_t flags) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; - skb = rtmsg_ifinfo_build_skb(type, dev, 
change, flags); + skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags); if (skb) rtmsg_ifinfo_send(skb, dev, flags); } + +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, + gfp_t flags) +{ + rtmsg_ifinfo_event(type, dev, change, 0, flags); +} EXPORT_SYMBOL(rtmsg_ifinfo); static int nlmsg_populate_fdb_fill(struct sk_buff *skb, @@ -4132,7 +4206,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_CHANGELOWERSTATE: case NETDEV_UDP_TUNNEL_PUSH_INFO: case NETDEV_CHANGE_TX_QUEUE_LEN: - rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); + rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, event, GFP_KERNEL); break; default: break; -- cgit v1.2.3 From f2fbc9dd78970accd7649e8b87c7f00a0da0cdbc Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 5 Apr 2017 08:39:18 -0700 Subject: blk-mq: Remove blk_mq_queue_data.list The block layer core sets blk_mq_queue_data.list but no block drivers read that member. Hence remove it and also the code that is used to set this member. Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- block/blk-mq.c | 17 ----------------- include/linux/blk-mq.h | 1 - 2 files changed, 18 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index 061fc2cc88d3..f7cd3208bcdf 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -984,16 +984,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) { struct request_queue *q = hctx->queue; struct request *rq; - LIST_HEAD(driver_list); - struct list_head *dptr; int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK; - /* - * Start off with dptr being NULL, so we start the first request - * immediately, even if we have more pending. - */ - dptr = NULL; - /* * Now process all the entries, sending them to the driver. */ @@ -1026,7 +1018,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) list_del_init(&rq->queuelist); bd.rq = rq; - bd.list = dptr; /* * Flag last if we have no more requests, or if we have more @@ -1062,13 +1053,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) if (ret == BLK_MQ_RQ_QUEUE_BUSY) break; - - /* - * We've done the first request. If we have more than 1 - * left in the list, set dptr to defer issue. - */ - if (!dptr && list->next != list->prev) - dptr = &driver_list; } hctx->dispatched[queued_to_index(queued)]++; @@ -1451,7 +1435,6 @@ static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { .rq = rq, - .list = NULL, .last = 1 }; struct blk_mq_hw_ctx *hctx; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ea2e9dcd3aef..bdea90d75274 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -81,7 +81,6 @@ struct blk_mq_tag_set { struct blk_mq_queue_data { struct request *rq; - struct list_head *list; bool last; }; -- cgit v1.2.3 From 0ae797a8ba05a2354db5e81c1d7df04671dd1c25 Mon Sep 17 00:00:00 2001 From: Arto Merilainen Date: Wed, 14 Dec 2016 13:16:13 +0200 Subject: drm/tegra: Add VIC support This patch adds support for Video Image Compositor engine which can be used for 2d operations. 
Signed-off-by: Andrew Chew Signed-off-by: Arto Merilainen Signed-off-by: Mikko Perttunen Signed-off-by: Thierry Reding --- drivers/gpu/drm/tegra/Makefile | 3 +- drivers/gpu/drm/tegra/drm.c | 3 + drivers/gpu/drm/tegra/drm.h | 1 + drivers/gpu/drm/tegra/vic.c | 396 +++++++++++++++++++++++++++++++++++++++++ drivers/gpu/drm/tegra/vic.h | 31 ++++ include/linux/host1x.h | 1 + 6 files changed, 434 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/tegra/vic.c create mode 100644 drivers/gpu/drm/tegra/vic.h (limited to 'include/linux') diff --git a/drivers/gpu/drm/tegra/Makefile b/drivers/gpu/drm/tegra/Makefile index 93e9a4a1e3d1..6af3a9ad6565 100644 --- a/drivers/gpu/drm/tegra/Makefile +++ b/drivers/gpu/drm/tegra/Makefile @@ -14,6 +14,7 @@ tegra-drm-y := \ dpaux.o \ gr2d.o \ gr3d.o \ - falcon.o + falcon.o \ + vic.o obj-$(CONFIG_DRM_TEGRA) += tegra-drm.o diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c index d9211cb78da5..bbce47dc79e6 100644 --- a/drivers/gpu/drm/tegra/drm.c +++ b/drivers/gpu/drm/tegra/drm.c @@ -1211,11 +1211,13 @@ static const struct of_device_id host1x_drm_subdevs[] = { { .compatible = "nvidia,tegra124-sor", }, { .compatible = "nvidia,tegra124-hdmi", }, { .compatible = "nvidia,tegra124-dsi", }, + { .compatible = "nvidia,tegra124-vic", }, { .compatible = "nvidia,tegra132-dsi", }, { .compatible = "nvidia,tegra210-dc", }, { .compatible = "nvidia,tegra210-dsi", }, { .compatible = "nvidia,tegra210-sor", }, { .compatible = "nvidia,tegra210-sor1", }, + { .compatible = "nvidia,tegra210-vic", }, { /* sentinel */ } }; @@ -1237,6 +1239,7 @@ static struct platform_driver * const drivers[] = { &tegra_sor_driver, &tegra_gr2d_driver, &tegra_gr3d_driver, + &tegra_vic_driver, }; static int __init host1x_drm_init(void) diff --git a/drivers/gpu/drm/tegra/drm.h b/drivers/gpu/drm/tegra/drm.h index 668ccfb5cb05..9d6f4ffcc786 100644 --- a/drivers/gpu/drm/tegra/drm.h +++ b/drivers/gpu/drm/tegra/drm.h @@ -298,5 +298,6 @@ extern struct platform_driver tegra_dpaux_driver; extern struct platform_driver tegra_sor_driver; extern struct platform_driver tegra_gr2d_driver; extern struct platform_driver tegra_gr3d_driver; +extern struct platform_driver tegra_vic_driver; #endif /* HOST1X_DRM_H */ diff --git a/drivers/gpu/drm/tegra/vic.c b/drivers/gpu/drm/tegra/vic.c new file mode 100644 index 000000000000..cd804e404a11 --- /dev/null +++ b/drivers/gpu/drm/tegra/vic.c @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2015, NVIDIA Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "drm.h" +#include "falcon.h" +#include "vic.h" + +struct vic_config { + const char *firmware; +}; + +struct vic { + struct falcon falcon; + bool booted; + + void __iomem *regs; + struct tegra_drm_client client; + struct host1x_channel *channel; + struct iommu_domain *domain; + struct device *dev; + struct clk *clk; + + /* Platform configuration */ + const struct vic_config *config; +}; + +static inline struct vic *to_vic(struct tegra_drm_client *client) +{ + return container_of(client, struct vic, client); +} + +static void vic_writel(struct vic *vic, u32 value, unsigned int offset) +{ + writel(value, vic->regs + offset); +} + +static int vic_runtime_resume(struct device *dev) +{ + struct vic *vic = dev_get_drvdata(dev); + + return clk_prepare_enable(vic->clk); +} + +static int vic_runtime_suspend(struct device *dev) +{ + struct vic *vic = dev_get_drvdata(dev); + + clk_disable_unprepare(vic->clk); + + vic->booted = false; + + return 0; +} + +static int vic_boot(struct vic *vic) +{ + u32 fce_ucode_size, fce_bin_data_offset; + void *hdr; + int err = 0; + + if (vic->booted) + return 0; + + /* setup clockgating registers */ + vic_writel(vic, CG_IDLE_CG_DLY_CNT(4) | + CG_IDLE_CG_EN | + CG_WAKEUP_DLY_CNT(4), + NV_PVIC_MISC_PRI_VIC_CG); + + err = falcon_boot(&vic->falcon); + if (err < 0) + return err; + + hdr = vic->falcon.firmware.vaddr; + fce_bin_data_offset = *(u32 *)(hdr + VIC_UCODE_FCE_DATA_OFFSET); + hdr = vic->falcon.firmware.vaddr + + *(u32 *)(hdr + VIC_UCODE_FCE_HEADER_OFFSET); + fce_ucode_size = *(u32 *)(hdr + FCE_UCODE_SIZE_OFFSET); + + falcon_execute_method(&vic->falcon, VIC_SET_APPLICATION_ID, 1); + falcon_execute_method(&vic->falcon, VIC_SET_FCE_UCODE_SIZE, + fce_ucode_size); + falcon_execute_method(&vic->falcon, VIC_SET_FCE_UCODE_OFFSET, + (vic->falcon.firmware.paddr + fce_bin_data_offset) + >> 8); + + err = falcon_wait_idle(&vic->falcon); + if (err < 0) { + dev_err(vic->dev, + "failed to set application ID and FCE base\n"); + return err; + } + + vic->booted = true; + + return 0; +} + +static void *vic_falcon_alloc(struct falcon *falcon, size_t size, + dma_addr_t *iova) +{ + struct tegra_drm *tegra = falcon->data; + + return tegra_drm_alloc(tegra, size, iova); +} + +static void vic_falcon_free(struct falcon *falcon, size_t size, + dma_addr_t iova, void *va) +{ + struct tegra_drm *tegra = falcon->data; + + return tegra_drm_free(tegra, size, va, iova); +} + +static const struct falcon_ops vic_falcon_ops = { + .alloc = vic_falcon_alloc, + .free = vic_falcon_free +}; + +static int vic_init(struct host1x_client *client) +{ + struct tegra_drm_client *drm = host1x_to_drm_client(client); + struct drm_device *dev = dev_get_drvdata(client->parent); + struct tegra_drm *tegra = dev->dev_private; + struct vic *vic = to_vic(drm); + int err; + + if (tegra->domain) { + err = iommu_attach_device(tegra->domain, vic->dev); + if (err < 0) { + dev_err(vic->dev, "failed to attach to domain: %d\n", + err); + return err; + } + + vic->domain = tegra->domain; + } + + if (!vic->falcon.data) { + vic->falcon.data = tegra; + err = falcon_load_firmware(&vic->falcon); + if (err < 0) + goto detach_device; + } + + vic->channel = host1x_channel_request(client->dev); + if (!vic->channel) { + err = -ENOMEM; + goto detach_device; + } + + client->syncpts[0] = host1x_syncpt_request(client->dev, 0); + if (!client->syncpts[0]) { + err = -ENOMEM; + goto free_channel; + } + + err = 
tegra_drm_register_client(tegra, drm); + if (err < 0) + goto free_syncpt; + + return 0; + +free_syncpt: + host1x_syncpt_free(client->syncpts[0]); +free_channel: + host1x_channel_free(vic->channel); +detach_device: + if (tegra->domain) + iommu_detach_device(tegra->domain, vic->dev); + + return err; +} + +static int vic_exit(struct host1x_client *client) +{ + struct tegra_drm_client *drm = host1x_to_drm_client(client); + struct drm_device *dev = dev_get_drvdata(client->parent); + struct tegra_drm *tegra = dev->dev_private; + struct vic *vic = to_vic(drm); + int err; + + err = tegra_drm_unregister_client(tegra, drm); + if (err < 0) + return err; + + host1x_syncpt_free(client->syncpts[0]); + host1x_channel_free(vic->channel); + + if (vic->domain) { + iommu_detach_device(vic->domain, vic->dev); + vic->domain = NULL; + } + + return 0; +} + +static const struct host1x_client_ops vic_client_ops = { + .init = vic_init, + .exit = vic_exit, +}; + +static int vic_open_channel(struct tegra_drm_client *client, + struct tegra_drm_context *context) +{ + struct vic *vic = to_vic(client); + int err; + + err = pm_runtime_get_sync(vic->dev); + if (err < 0) + return err; + + err = vic_boot(vic); + if (err < 0) { + pm_runtime_put(vic->dev); + return err; + } + + context->channel = host1x_channel_get(vic->channel); + if (!context->channel) { + pm_runtime_put(vic->dev); + return -ENOMEM; + } + + return 0; +} + +static void vic_close_channel(struct tegra_drm_context *context) +{ + struct vic *vic = to_vic(context->client); + + host1x_channel_put(context->channel); + + pm_runtime_put(vic->dev); +} + +static const struct tegra_drm_client_ops vic_ops = { + .open_channel = vic_open_channel, + .close_channel = vic_close_channel, + .submit = tegra_drm_submit, +}; + +static const struct vic_config vic_t124_config = { + .firmware = "nvidia/tegra124/vic03_ucode.bin", +}; + +static const struct vic_config vic_t210_config = { + .firmware = "nvidia/tegra210/vic04_ucode.bin", +}; + +static const struct of_device_id vic_match[] = { + { .compatible = "nvidia,tegra124-vic", .data = &vic_t124_config }, + { .compatible = "nvidia,tegra210-vic", .data = &vic_t210_config }, + { }, +}; + +static int vic_probe(struct platform_device *pdev) +{ + struct vic_config *vic_config = NULL; + struct device *dev = &pdev->dev; + struct host1x_syncpt **syncpts; + struct resource *regs; + const struct of_device_id *match; + struct vic *vic; + int err; + + match = of_match_device(vic_match, dev); + vic_config = (struct vic_config *)match->data; + + vic = devm_kzalloc(dev, sizeof(*vic), GFP_KERNEL); + if (!vic) + return -ENOMEM; + + syncpts = devm_kzalloc(dev, sizeof(*syncpts), GFP_KERNEL); + if (!syncpts) + return -ENOMEM; + + regs = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!regs) { + dev_err(&pdev->dev, "failed to get registers\n"); + return -ENXIO; + } + + vic->regs = devm_ioremap_resource(dev, regs); + if (IS_ERR(vic->regs)) + return PTR_ERR(vic->regs); + + vic->clk = devm_clk_get(dev, NULL); + if (IS_ERR(vic->clk)) { + dev_err(&pdev->dev, "failed to get clock\n"); + return PTR_ERR(vic->clk); + } + + vic->falcon.dev = dev; + vic->falcon.regs = vic->regs; + vic->falcon.ops = &vic_falcon_ops; + + err = falcon_init(&vic->falcon); + if (err < 0) + return err; + + err = falcon_read_firmware(&vic->falcon, vic_config->firmware); + if (err < 0) + goto exit_falcon; + + platform_set_drvdata(pdev, vic); + + INIT_LIST_HEAD(&vic->client.base.list); + vic->client.base.ops = &vic_client_ops; + vic->client.base.dev = dev; + vic->client.base.class = 
HOST1X_CLASS_VIC; + vic->client.base.syncpts = syncpts; + vic->client.base.num_syncpts = 1; + vic->dev = dev; + vic->config = vic_config; + + INIT_LIST_HEAD(&vic->client.list); + vic->client.ops = &vic_ops; + + err = host1x_client_register(&vic->client.base); + if (err < 0) { + dev_err(dev, "failed to register host1x client: %d\n", err); + platform_set_drvdata(pdev, NULL); + goto exit_falcon; + } + + pm_runtime_enable(&pdev->dev); + if (!pm_runtime_enabled(&pdev->dev)) { + err = vic_runtime_resume(&pdev->dev); + if (err < 0) + goto unregister_client; + } + + return 0; + +unregister_client: + host1x_client_unregister(&vic->client.base); +exit_falcon: + falcon_exit(&vic->falcon); + + return err; +} + +static int vic_remove(struct platform_device *pdev) +{ + struct vic *vic = platform_get_drvdata(pdev); + int err; + + err = host1x_client_unregister(&vic->client.base); + if (err < 0) { + dev_err(&pdev->dev, "failed to unregister host1x client: %d\n", + err); + return err; + } + + if (pm_runtime_enabled(&pdev->dev)) + pm_runtime_disable(&pdev->dev); + else + vic_runtime_suspend(&pdev->dev); + + falcon_exit(&vic->falcon); + + return 0; +} + +static const struct dev_pm_ops vic_pm_ops = { + SET_RUNTIME_PM_OPS(vic_runtime_suspend, vic_runtime_resume, NULL) +}; + +struct platform_driver tegra_vic_driver = { + .driver = { + .name = "tegra-vic", + .of_match_table = vic_match, + .pm = &vic_pm_ops + }, + .probe = vic_probe, + .remove = vic_remove, +}; diff --git a/drivers/gpu/drm/tegra/vic.h b/drivers/gpu/drm/tegra/vic.h new file mode 100644 index 000000000000..21844817a7e1 --- /dev/null +++ b/drivers/gpu/drm/tegra/vic.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2015, NVIDIA Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef TEGRA_VIC_H +#define TEGRA_VIC_H + +/* VIC methods */ + +#define VIC_SET_APPLICATION_ID 0x00000200 +#define VIC_SET_FCE_UCODE_SIZE 0x0000071C +#define VIC_SET_FCE_UCODE_OFFSET 0x0000072C + +/* VIC registers */ + +#define NV_PVIC_MISC_PRI_VIC_CG 0x000016d0 +#define CG_IDLE_CG_DLY_CNT(val) ((val & 0x3f) << 0) +#define CG_IDLE_CG_EN (1 << 6) +#define CG_WAKEUP_DLY_CNT(val) ((val & 0xf) << 16) + +/* Firmware offsets */ + +#define VIC_UCODE_FCE_HEADER_OFFSET (6*4) +#define VIC_UCODE_FCE_DATA_OFFSET (7*4) +#define FCE_UCODE_SIZE_OFFSET (2*4) + +#endif /* TEGRA_VIC_H */ diff --git a/include/linux/host1x.h b/include/linux/host1x.h index 1ffbf2a8cb99..3d04aa1dc83e 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -26,6 +26,7 @@ enum host1x_class { HOST1X_CLASS_HOST1X = 0x1, HOST1X_CLASS_GR2D = 0x51, HOST1X_CLASS_GR2D_SB = 0x52, + HOST1X_CLASS_VIC = 0x5D, HOST1X_CLASS_GR3D = 0x60, }; -- cgit v1.2.3 From d6c1dc3f52e3a65f35c58433ba57d14c0bad902f Mon Sep 17 00:00:00 2001 From: Laxman Dewangan Date: Tue, 4 Apr 2017 18:59:50 +0530 Subject: regulator: Add settling time for non-linear voltage transition Some regulators (e.g. some PWM regulators) have a non-linear, exponential voltage transition. In such cases the settling time for a voltage change cannot be expressed via the linear voltage-ramp-delay. Add a new property for non-linear voltage transitions and use it when determining the voltage settling time.
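A brief usage sketch (values are hypothetical): such a supply can describe its fixed settling time through the new "regulator-settling-time-us" device tree property parsed in of_regulator.c below, or machine constraints can set the field directly:

	static struct regulation_constraints pwm_supply_constraints = {
		.min_uV		= 800000,
		.max_uV		= 1350000,
		.settling_time	= 200,	/* us */
	};

When no ramp delay is configured, _regulator_set_voltage_time() returns this fixed settling time instead of deriving a delay from a linear ramp rate.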
Signed-off-by: Laxman Dewangan Signed-off-by: Mark Brown --- drivers/regulator/core.c | 2 ++ drivers/regulator/of_regulator.c | 4 ++++ include/linux/regulator/machine.h | 3 +++ 3 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 04baac9a165b..3a641d64f8e1 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -2753,6 +2753,8 @@ static int _regulator_set_voltage_time(struct regulator_dev *rdev, ramp_delay = rdev->constraints->ramp_delay; else if (rdev->desc->ramp_delay) ramp_delay = rdev->desc->ramp_delay; + else if (rdev->constraints->settling_time) + return rdev->constraints->settling_time; if (ramp_delay == 0) { rdev_dbg(rdev, "ramp_delay not set\n"); diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c index 4f613ec99500..09d677d5d3f0 100644 --- a/drivers/regulator/of_regulator.c +++ b/drivers/regulator/of_regulator.c @@ -86,6 +86,10 @@ static void of_get_regulation_constraints(struct device_node *np, constraints->ramp_disable = true; } + ret = of_property_read_u32(np, "regulator-settling-time-us", &pval); + if (!ret) + constraints->settling_time = pval; + ret = of_property_read_u32(np, "regulator-enable-ramp-delay", &pval); if (!ret) constraints->enable_time = pval; diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index ad3e5158e586..598a493b3927 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -108,6 +108,8 @@ struct regulator_state { * @initial_state: Suspend state to set by default. * @initial_mode: Mode to set at startup. * @ramp_delay: Time to settle down after voltage change (unit: uV/us) + * @settling_time: Time to settle down after voltage change when voltage + * change is non-linear (unit: microseconds). * @active_discharge: Enable/disable active discharge. The enum * regulator_active_discharge values are used for * initialisation. @@ -149,6 +151,7 @@ struct regulation_constraints { unsigned int initial_mode; unsigned int ramp_delay; + unsigned int settling_time; unsigned int enable_time; unsigned int active_discharge; -- cgit v1.2.3 From 4c546b8a34690ca858e50f2017b8bb6e358365d1 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Mon, 3 Apr 2017 11:23:54 +0900 Subject: memblock: add memblock_clear_nomap() This function, in combination with memblock_mark_nomap(), will be used in a later kdump patch for arm64 when it temporarily isolates some range of memory from the other memory blocks in order to create a specific kernel mapping at boot time.
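A sketch of the intended mark/clear pairing (the crash_base/crash_size variables are hypothetical; the real arm64 user arrives in a later patch):

	/* hide the region from the linear mapping while it is special-cased */
	memblock_mark_nomap(crash_base, crash_size);

	/* ... create and use a dedicated kernel mapping for the region ... */

	/* return it to service as ordinary memory */
	memblock_clear_nomap(crash_base, crash_size);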
Signed-off-by: AKASHI Takahiro Reviewed-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- include/linux/memblock.h | 1 + mm/memblock.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index bdfc65af4152..e82daffcfc44 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -93,6 +93,7 @@ int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); +int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); ulong choose_memblock_flags(void); /* Low level functions */ diff --git a/mm/memblock.c b/mm/memblock.c index 696f06d17c4e..2f4ca8104ea4 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -804,6 +804,18 @@ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) return memblock_setclr_flag(base, size, 1, MEMBLOCK_NOMAP); } +/** + * memblock_clear_nomap - Clear flag MEMBLOCK_NOMAP for a specified region. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return 0 on success, -errno on failure. + */ +int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP); +} + /** * __next_reserved_mem_region - next function for for_each_reserved_region() * @idx: pointer to u64 loop variable -- cgit v1.2.3 From c9ca9b4e2198a4dbeb83739460d4a7ff9ffed24f Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Mon, 3 Apr 2017 11:23:55 +0900 Subject: memblock: add memblock_cap_memory_range() Add memblock_cap_memory_range() which will remove all the memblock regions except the memory range specified in the arguments. In addition, rework is done on memblock_mem_limit_remove_map() to re-implement it using memblock_cap_memory_range(). This function, like memblock_mem_limit_remove_map(), will not remove memblocks with the MEMBLOCK_NOMAP attribute as they may be mapped and accessed later as "device memory." See the commit a571d4eb55d8 ("mm/memblock.c: add new infrastructure to address the mem limit issue"). This function is used, in a succeeding patch in the series of arm64 kdump support, to limit the range of usable memory, or System RAM, on the crash dump kernel. (Please note that the "mem=" parameter is of little use for this purpose.)
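A minimal usage sketch (variable names hypothetical; the actual caller appears later in the series):

	/* keep only the System RAM window handed over by the first kernel */
	memblock_cap_memory_range(crash_mem_start, crash_mem_size);

Everything outside the window is dropped from memblock.memory except MEMBLOCK_NOMAP regions, and memblock.reserved is truncated to the same window.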
Signed-off-by: AKASHI Takahiro Reviewed-by: Will Deacon Acked-by: Catalin Marinas Acked-by: Dennis Chen Cc: linux-mm@kvack.org Cc: Andrew Morton Reviewed-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- include/linux/memblock.h | 1 + mm/memblock.c | 44 +++++++++++++++++++++++++++++--------------- 2 files changed, 30 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index e82daffcfc44..4ce24a376262 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -336,6 +336,7 @@ phys_addr_t memblock_mem_size(unsigned long limit_pfn); phys_addr_t memblock_start_of_DRAM(void); phys_addr_t memblock_end_of_DRAM(void); void memblock_enforce_memory_limit(phys_addr_t memory_limit); +void memblock_cap_memory_range(phys_addr_t base, phys_addr_t size); void memblock_mem_limit_remove_map(phys_addr_t limit); bool memblock_is_memory(phys_addr_t addr); int memblock_is_map_memory(phys_addr_t addr); diff --git a/mm/memblock.c b/mm/memblock.c index 2f4ca8104ea4..b049c9b2dba8 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1543,11 +1543,37 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) (phys_addr_t)ULLONG_MAX); } +void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size) +{ + int start_rgn, end_rgn; + int i, ret; + + if (!size) + return; + + ret = memblock_isolate_range(&memblock.memory, base, size, + &start_rgn, &end_rgn); + if (ret) + return; + + /* remove all the MAP regions */ + for (i = memblock.memory.cnt - 1; i >= end_rgn; i--) + if (!memblock_is_nomap(&memblock.memory.regions[i])) + memblock_remove_region(&memblock.memory, i); + + for (i = start_rgn - 1; i >= 0; i--) + if (!memblock_is_nomap(&memblock.memory.regions[i])) + memblock_remove_region(&memblock.memory, i); + + /* truncate the reserved regions */ + memblock_remove_range(&memblock.reserved, 0, base); + memblock_remove_range(&memblock.reserved, + base + size, (phys_addr_t)ULLONG_MAX); +} + void __init memblock_mem_limit_remove_map(phys_addr_t limit) { - struct memblock_type *type = &memblock.memory; phys_addr_t max_addr; - int i, ret, start_rgn, end_rgn; if (!limit) return; @@ -1558,19 +1584,7 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit) if (max_addr == (phys_addr_t)ULLONG_MAX) return; - ret = memblock_isolate_range(type, max_addr, (phys_addr_t)ULLONG_MAX, - &start_rgn, &end_rgn); - if (ret) - return; - - /* remove all the MAP regions above the limit */ - for (i = end_rgn - 1; i >= start_rgn; i--) { - if (!memblock_is_nomap(&type->regions[i])) - memblock_remove_region(type, i); - } - /* truncate the reserved regions */ - memblock_remove_range(&memblock.reserved, max_addr, - (phys_addr_t)ULLONG_MAX); + memblock_cap_memory_range(0, max_addr); } static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) -- cgit v1.2.3 From 64c7f1d1572cacadfc0a4ca5a937aeffa486de58 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:18:12 +0200 Subject: block, scsi: move the retries field to struct scsi_request Instead of bloating the generic struct request with it. 
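For drivers the conversion is mechanical; a hedged sketch of the new pattern (the helper below is hypothetical):

	static void foo_prep_request(struct request *req)
	{
		struct scsi_request *rq = scsi_req(req);

		rq->retries = 5;	/* was: req->retries */
		req->timeout = 10 * HZ;	/* timeout remains in struct request */
	}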
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 8 ++++---- drivers/scsi/osd/osd_initiator.c | 2 +- drivers/scsi/osst.c | 2 +- drivers/scsi/scsi_error.c | 2 +- drivers/scsi/scsi_lib.c | 4 ++-- drivers/scsi/sg.c | 2 +- drivers/scsi/st.c | 2 +- drivers/target/target_core_pscsi.c | 2 +- include/linux/blkdev.h | 1 - include/scsi/scsi_request.h | 1 + 10 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 2a2fc768b27a..82a43bb19967 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -362,7 +362,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, goto out_free_cdb; bio = rq->bio; - rq->retries = 0; + req->retries = 0; start_time = jiffies; @@ -476,13 +476,13 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, goto error; /* default. possible overriden later */ - rq->retries = 5; + req->retries = 5; switch (opcode) { case SEND_DIAGNOSTIC: case FORMAT_UNIT: rq->timeout = FORMAT_UNIT_TIMEOUT; - rq->retries = 1; + req->retries = 1; break; case START_STOP: rq->timeout = START_STOP_TIMEOUT; @@ -495,7 +495,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, break; case READ_DEFECT_DATA: rq->timeout = READ_DEFECT_DATA_TIMEOUT; - rq->retries = 1; + req->retries = 1; break; default: rq->timeout = BLK_DEFAULT_SG_TIMEOUT; diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index 6903f03c88af..9d0727b2bdec 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -1602,7 +1602,7 @@ static int _init_blk_request(struct osd_request *or, req->rq_flags |= RQF_QUIET; req->timeout = or->timeout; - req->retries = or->retries; + scsi_req(req)->retries = or->retries; if (has_out) { or->out.req = req; diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c index c47f4b349bac..41bc1d64bf86 100644 --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -414,7 +414,7 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd, memset(rq->cmd, 0, BLK_MAX_CDB); /* ATAPI hates garbage after CDB */ memcpy(rq->cmd, cmd, rq->cmd_len); req->timeout = timeout; - req->retries = retries; + rq->retries = retries; req->end_io_data = SRpnt; blk_execute_rq_nowait(req->q, NULL, req, 1, osst_end_async); diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index f2cafae150bc..2db412dd4b44 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -1988,7 +1988,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) req->rq_flags |= RQF_QUIET; req->timeout = 10 * HZ; - req->retries = 5; + rq->retries = 5; blk_execute_rq_nowait(req->q, NULL, req, 1, eh_lock_door_done); } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index c1519660824b..11972d1075f1 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -256,7 +256,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, rq->cmd_len = COMMAND_SIZE(cmd[0]); memcpy(rq->cmd, cmd, rq->cmd_len); - req->retries = retries; + rq->retries = retries; req->timeout = timeout; req->cmd_flags |= flags; req->rq_flags |= rq_flags | RQF_QUIET | RQF_PREEMPT; @@ -1177,7 +1177,7 @@ static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req) cmd->cmd_len = scsi_req(req)->cmd_len; cmd->cmnd = scsi_req(req)->cmd; cmd->transfersize = blk_rq_bytes(req); - cmd->allowed = 
req->retries; + cmd->allowed = scsi_req(req)->retries; return BLKPREP_OK; } diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 29b86505f796..b61cc3c512d3 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1716,7 +1716,7 @@ sg_start_req(Sg_request *srp, unsigned char *cmd) srp->rq = rq; rq->end_io_data = srp; - rq->retries = SG_DEFAULT_RETRIES; + req->retries = SG_DEFAULT_RETRIES; if ((dxfer_len <= 0) || (dxfer_dir == SG_DXFER_NONE)) return 0; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index e5ef78a6848e..5408643431bb 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -579,7 +579,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, memset(rq->cmd, 0, BLK_MAX_CDB); memcpy(rq->cmd, cmd, rq->cmd_len); req->timeout = timeout; - req->retries = retries; + rq->retries = retries; req->end_io_data = SRpnt; blk_execute_rq_nowait(req->q, NULL, req, 1, st_scsi_execute_end); diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 94cda7991e80..c7fa372c527a 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -1008,7 +1008,7 @@ pscsi_execute_cmd(struct se_cmd *cmd) req->timeout = PS_TIMEOUT_DISK; else req->timeout = PS_TIMEOUT_OTHER; - req->retries = PS_RETRY; + scsi_req(req)->retries = PS_RETRY; blk_execute_rq_nowait(pdv->pdv_sd->request_queue, NULL, req, (cmd->sam_task_attr == TCM_HEAD_TAG), diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a2dc6b390d48..ce6f9a6534c9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -224,7 +224,6 @@ struct request { unsigned long deadline; struct list_head timeout_list; unsigned int timeout; - int retries; /* * completion callback. diff --git a/include/scsi/scsi_request.h b/include/scsi/scsi_request.h index ba0aeb980f7e..7c583a0f363a 100644 --- a/include/scsi/scsi_request.h +++ b/include/scsi/scsi_request.h @@ -11,6 +11,7 @@ struct scsi_request { unsigned short cmd_len; unsigned int sense_len; unsigned int resid_len; /* residual count */ + int retries; void *sense; }; -- cgit v1.2.3 From 1dd5198b2df913aec9b77c14529f9ff1b6d33e30 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 5 Apr 2017 12:16:38 -0600 Subject: block: move timeout field in struct request to pack better After commit 64c7f1d1572c, we went from 1 to 2 holes in my test setup. If we move the timeout field a bit, we remove both of those holes and shrink struct request by 8 bytes. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ce6f9a6534c9..3cf241b0814d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -215,6 +215,8 @@ struct request { unsigned short ioprio; + unsigned int timeout; + void *special; /* opaque pointer available for LLD use */ int errors; @@ -223,7 +225,6 @@ struct request { unsigned long deadline; struct list_head timeout_list; - unsigned int timeout; /* * completion callback. -- cgit v1.2.3 From adf5e5168bd51c42332ebaa709351fa6ed65ea73 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 27 Jan 2017 12:22:54 +0000 Subject: iommu: Better document the IOMMU_PRIV flag This is a fairly subtle thing - let's make sure it's described as clearly as possible to avoid potential misunderstandings. 
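For example (a sketch; domain, iova, paddr and size are assumed to come from the caller), a master that issues 'supervisor'-marked transactions would request a privileged-only writable mapping as:

	ret = iommu_map(domain, iova, paddr, size,
			IOMMU_READ | IOMMU_WRITE | IOMMU_PRIV);

Unprivileged transactions to the same range then get as little access as the hardware allows.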
Signed-off-by: Robin Murphy Signed-off-by: Will Deacon --- include/linux/iommu.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 2e4de0deee53..88ec8c6580d3 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -32,10 +32,13 @@ #define IOMMU_NOEXEC (1 << 3) #define IOMMU_MMIO (1 << 4) /* e.g. things like MSI doorbells */ /* - * This is to make the IOMMU API setup privileged - * mapppings accessible by the master only at higher - * privileged execution level and inaccessible at - * less privileged levels. + * Where the bus hardware includes a privilege level as part of its access type + * markings, and certain devices are capable of issuing transactions marked as + * either 'supervisor' or 'user', the IOMMU_PRIV flag requests that the other + * given permission flags only apply to accesses at the higher privilege level, + * and that unprivileged transactions should have as little access as possible. + * This would usually imply the same permissions as kernel mappings on the CPU, + * if the IOMMU page table format is equivalent. */ #define IOMMU_PRIV (1 << 5) -- cgit v1.2.3 From a7a453f56a1a116027f84ac53b365eb045a0e279 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 28 Mar 2017 15:14:40 +0100 Subject: regulator: helpers: Add regmap set_soft_start helper Add a helper function regulator_set_soft_start_regmap to allow regmap based regulators to easily enable soft start. Signed-off-by: Charles Keepax Signed-off-by: Mark Brown --- drivers/regulator/helpers.c | 18 ++++++++++++++++++ include/linux/regulator/driver.h | 8 ++++++++ 2 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/drivers/regulator/helpers.c b/drivers/regulator/helpers.c index 379cdacc05d8..a75e7da49af8 100644 --- a/drivers/regulator/helpers.c +++ b/drivers/regulator/helpers.c @@ -445,6 +445,24 @@ int regulator_set_bypass_regmap(struct regulator_dev *rdev, bool enable) } EXPORT_SYMBOL_GPL(regulator_set_bypass_regmap); +/** + * regulator_set_soft_start_regmap - Default set_soft_start() using regmap + * + * @rdev: device to operate on. + */ +int regulator_set_soft_start_regmap(struct regulator_dev *rdev) +{ + unsigned int val; + + val = rdev->desc->soft_start_val_on; + if (!val) + val = rdev->desc->soft_start_mask; + + return regmap_update_bits(rdev->regmap, rdev->desc->soft_start_reg, + rdev->desc->soft_start_mask, val); +} +EXPORT_SYMBOL_GPL(regulator_set_soft_start_regmap); + /** * regulator_get_bypass_regmap - Default get_bypass() using regmap * diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index dac8e7b16bc6..1054c033e783 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -292,6 +292,10 @@ enum regulator_type { * set_active_discharge * @active_discharge_reg: Register for control when using regmap * set_active_discharge + * @soft_start_reg: Register for control when using regmap set_soft_start + * @soft_start_mask: Mask for control when using regmap set_soft_start + * @soft_start_val_on: Enabling value for control when using regmap + * set_soft_start * * @enable_time: Time taken for initial enable of regulator (in uS). 
* @off_on_delay: guard time (in uS), before re-enabling a regulator @@ -345,6 +349,9 @@ struct regulator_desc { unsigned int active_discharge_off; unsigned int active_discharge_mask; unsigned int active_discharge_reg; + unsigned int soft_start_reg; + unsigned int soft_start_mask; + unsigned int soft_start_val_on; unsigned int enable_time; @@ -476,6 +483,7 @@ int regulator_set_voltage_time_sel(struct regulator_dev *rdev, unsigned int new_selector); int regulator_set_bypass_regmap(struct regulator_dev *rdev, bool enable); int regulator_get_bypass_regmap(struct regulator_dev *rdev, bool *enable); +int regulator_set_soft_start_regmap(struct regulator_dev *rdev); int regulator_set_active_discharge_regmap(struct regulator_dev *rdev, bool enable); -- cgit v1.2.3 From f7d37bc3cb20828ac43b22cbd40222877ee2c46a Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 28 Mar 2017 15:14:41 +0100 Subject: regulator: helpers: Add regmap set_pull_down helper Add a helper function regulator_set_pull_down_regmap to allow regmap based regulators to easily enable pull down. Signed-off-by: Charles Keepax Signed-off-by: Mark Brown --- drivers/regulator/helpers.c | 18 ++++++++++++++++++ include/linux/regulator/driver.h | 8 ++++++++ 2 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/drivers/regulator/helpers.c b/drivers/regulator/helpers.c index a75e7da49af8..2ae7c3ac5940 100644 --- a/drivers/regulator/helpers.c +++ b/drivers/regulator/helpers.c @@ -463,6 +463,24 @@ int regulator_set_soft_start_regmap(struct regulator_dev *rdev) } EXPORT_SYMBOL_GPL(regulator_set_soft_start_regmap); +/** + * regulator_set_pull_down_regmap - Default set_pull_down() using regmap + * + * @rdev: device to operate on. + */ +int regulator_set_pull_down_regmap(struct regulator_dev *rdev) +{ + unsigned int val; + + val = rdev->desc->pull_down_val_on; + if (!val) + val = rdev->desc->pull_down_mask; + + return regmap_update_bits(rdev->regmap, rdev->desc->pull_down_reg, + rdev->desc->pull_down_mask, val); +} +EXPORT_SYMBOL_GPL(regulator_set_pull_down_regmap); + /** * regulator_get_bypass_regmap - Default get_bypass() using regmap * diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 1054c033e783..8a9078dd2a5f 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -296,6 +296,10 @@ enum regulator_type { * @soft_start_mask: Mask for control when using regmap set_soft_start * @soft_start_val_on: Enabling value for control when using regmap * set_soft_start + * @pull_down_reg: Register for control when using regmap set_pull_down + * @pull_down_mask: Mask for control when using regmap set_pull_down + * @pull_down_val_on: Enabling value for control when using regmap + * set_pull_down * * @enable_time: Time taken for initial enable of regulator (in uS). 
* @off_on_delay: guard time (in uS), before re-enabling a regulator @@ -352,6 +356,9 @@ struct regulator_desc { unsigned int soft_start_reg; unsigned int soft_start_mask; unsigned int soft_start_val_on; + unsigned int pull_down_reg; + unsigned int pull_down_mask; + unsigned int pull_down_val_on; unsigned int enable_time; @@ -484,6 +491,7 @@ int regulator_set_voltage_time_sel(struct regulator_dev *rdev, int regulator_set_bypass_regmap(struct regulator_dev *rdev, bool enable); int regulator_get_bypass_regmap(struct regulator_dev *rdev, bool *enable); int regulator_set_soft_start_regmap(struct regulator_dev *rdev); +int regulator_set_pull_down_regmap(struct regulator_dev *rdev); int regulator_set_active_discharge_regmap(struct regulator_dev *rdev, bool enable); -- cgit v1.2.3 From 113c3075931a334f899008f6c753abe70a3a9323 Mon Sep 17 00:00:00 2001 From: "R. Parameswaran" Date: Wed, 5 Apr 2017 16:50:35 -0700 Subject: New kernel function to get IP overhead on a socket. A new function, kernel_sock_ip_overhead(), is provided to calculate the cumulative overhead imposed by the IP Header and IP options, if any, on a socket's payload. The new function returns an overhead of zero for sockets that do not belong to the IPv4 or IPv6 address families. This is used in the L2TP code path to compute the total outer IP overhead on the L2TP tunnel socket when calculating the default MTU for Ethernet pseudowires. Signed-off-by: R. Parameswaran Signed-off-by: David S. Miller --- include/linux/net.h | 3 +++ net/socket.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 0620f5e18c96..a42fab24c8af 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -298,6 +298,9 @@ int kernel_sendpage(struct socket *sock, struct page *page, int offset, int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg); int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how); +/* Following routine returns the IP overhead imposed by a socket. */ +u32 kernel_sock_ip_overhead(struct sock *sk); + #define MODULE_ALIAS_NETPROTO(proto) \ MODULE_ALIAS("net-pf-" __stringify(proto)) diff --git a/net/socket.c b/net/socket.c index 985ef06792d6..eea997036ada 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3356,3 +3356,49 @@ int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how) return sock->ops->shutdown(sock, how); } EXPORT_SYMBOL(kernel_sock_shutdown); + +/* This routine returns the IP overhead imposed by a socket i.e. + * the length of the underlying IP header, depending on whether + * this is an IPv4 or IPv6 socket and the length from IP options turned + * on at the socket. 
+ */ +u32 kernel_sock_ip_overhead(struct sock *sk) +{ + struct inet_sock *inet; + struct ip_options_rcu *opt; + u32 overhead = 0; + bool owned_by_user; +#if IS_ENABLED(CONFIG_IPV6) + struct ipv6_pinfo *np; + struct ipv6_txoptions *optv6 = NULL; +#endif /* IS_ENABLED(CONFIG_IPV6) */ + + if (!sk) + return overhead; + + owned_by_user = sock_owned_by_user(sk); + switch (sk->sk_family) { + case AF_INET: + inet = inet_sk(sk); + overhead += sizeof(struct iphdr); + opt = rcu_dereference_protected(inet->inet_opt, + owned_by_user); + if (opt) + overhead += opt->opt.optlen; + return overhead; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + np = inet6_sk(sk); + overhead += sizeof(struct ipv6hdr); + if (np) + optv6 = rcu_dereference_protected(np->opt, + owned_by_user); + if (optv6) + overhead += (optv6->opt_flen + optv6->opt_nflen); + return overhead; +#endif /* IS_ENABLED(CONFIG_IPV6) */ + default: /* Returns 0 overhead if the socket is not ipv4 or ipv6 */ + return overhead; + } +} +EXPORT_SYMBOL(kernel_sock_ip_overhead); -- cgit v1.2.3 From 08737a3fa30a4c6c10b4c4b682125c7d3c494094 Mon Sep 17 00:00:00 2001 From: "Mintz, Yuval" Date: Thu, 6 Apr 2017 15:58:33 +0300 Subject: qed: Inform qedi the number of possible CQs Now that management firmware is capable of telling us the number of CQs available for a given PF, qed needs to communicate the number to qedi so it would know how many to use. Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed.h | 3 ++- drivers/net/ethernet/qlogic/qed/qed_dev.c | 7 ++++++- drivers/net/ethernet/qlogic/qed/qed_iscsi.c | 11 +++++++++++ include/linux/qed/qed_iscsi_if.h | 2 ++ 4 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h index d8bcc21a4f69..4896ee0cc458 100644 --- a/drivers/net/ethernet/qlogic/qed/qed.h +++ b/drivers/net/ethernet/qlogic/qed/qed.h @@ -225,8 +225,9 @@ enum QED_FEATURE { QED_PF_L2_QUE, QED_VF, QED_RDMA_CNQ, - QED_VF_L2_QUE, + QED_ISCSI_CQ, QED_FCOE_CQ, + QED_VF_L2_QUE, QED_MAX_FEATURES, }; diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index fb759ba34b8d..fad73195010d 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -2045,12 +2045,17 @@ static void qed_hw_set_feat(struct qed_hwfn *p_hwfn) QED_VF_L2_QUE)); } + if (p_hwfn->hw_info.personality == QED_PCI_ISCSI) + feat_num[QED_ISCSI_CQ] = min_t(u32, RESC_NUM(p_hwfn, QED_SB), + RESC_NUM(p_hwfn, + QED_CMDQS_CQS)); DP_VERBOSE(p_hwfn, NETIF_MSG_PROBE, - "#PF_L2_QUEUES=%d VF_L2_QUEUES=%d #ROCE_CNQ=%d #SBS=%d\n", + "#PF_L2_QUEUES=%d VF_L2_QUEUES=%d #ROCE_CNQ=%d ISCSI_CQ=%d #SBS=%d\n", (int)FEAT_NUM(p_hwfn, QED_PF_L2_QUE), (int)FEAT_NUM(p_hwfn, QED_VF_L2_QUE), (int)FEAT_NUM(p_hwfn, QED_RDMA_CNQ), + (int)FEAT_NUM(p_hwfn, QED_ISCSI_CQ), RESC_NUM(p_hwfn, QED_SB)); } diff --git a/drivers/net/ethernet/qlogic/qed/qed_iscsi.c b/drivers/net/ethernet/qlogic/qed/qed_iscsi.c index 38273f41201d..417691cf4fd6 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_iscsi.c +++ b/drivers/net/ethernet/qlogic/qed/qed_iscsi.c @@ -181,6 +181,15 @@ qed_sp_iscsi_func_start(struct qed_hwfn *p_hwfn, p_params = &p_hwfn->pf_params.iscsi_pf_params; p_queue = &p_init->q_params; + /* Sanity */ + if (p_params->num_queues > p_hwfn->hw_info.feat_num[QED_ISCSI_CQ]) { + DP_ERR(p_hwfn, + "Cannot satisfy CQ amount. Queues requested %d, CQs available %d.
Aborting function start\n", + p_params->num_queues, + p_hwfn->hw_info.resc_num[QED_ISCSI_CQ]); + return -EINVAL; + } + SET_FIELD(p_init->hdr.flags, ISCSI_SLOW_PATH_HDR_LAYER_CODE, ISCSI_SLOW_PATH_LAYER_CODE); p_init->hdr.op_code = ISCSI_RAMROD_CMD_ID_INIT_FUNC; @@ -1012,6 +1021,8 @@ static int qed_fill_iscsi_dev_info(struct qed_dev *cdev, info->secondary_bdq_rq_addr = qed_iscsi_get_secondary_bdq_prod(hwfn, BDQ_ID_RQ); + info->num_cqs = FEAT_NUM(hwfn, QED_ISCSI_CQ); + return rc; } diff --git a/include/linux/qed/qed_iscsi_if.h b/include/linux/qed/qed_iscsi_if.h index f70bb81b8b6a..3414649133d2 100644 --- a/include/linux/qed/qed_iscsi_if.h +++ b/include/linux/qed/qed_iscsi_if.h @@ -67,6 +67,8 @@ struct qed_dev_iscsi_info { void __iomem *primary_dbq_rq_addr; void __iomem *secondary_bdq_rq_addr; + + u8 num_cqs; }; struct qed_iscsi_id_params { -- cgit v1.2.3 From 102722fc6832a16850c05595b98c9232549d99f3 Mon Sep 17 00:00:00 2001 From: Guy Ergas Date: Mon, 20 Feb 2017 16:18:17 +0200 Subject: net/mlx5e: Add support for RXFCS feature flag Add support for rx-fcs flag from ethtool. In case this flag is set, update all RQs to scatter the FCS data into the packet. Signed-off-by: Guy Ergas Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 71 +++++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 1 + 3 files changed, 73 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 150fb52a0737..a58031da7fad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -221,6 +221,7 @@ struct mlx5e_params { u8 toeplitz_hash_key[40]; u32 indirection_rqt[MLX5E_INDIR_RQT_SIZE]; bool vlan_strip_disable; + bool scatter_fcs_en; bool rx_am_enabled; u32 lro_timeout; u32 pflags; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index d5248637d44f..83796ce17fc3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -760,6 +760,37 @@ static int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state, return err; } +static int mlx5e_modify_rq_scatter_fcs(struct mlx5e_rq *rq, bool enable) +{ + struct mlx5e_channel *c = rq->channel; + struct mlx5e_priv *priv = c->priv; + struct mlx5_core_dev *mdev = priv->mdev; + + void *in; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + + MLX5_SET(modify_rq_in, in, rq_state, MLX5_RQC_STATE_RDY); + MLX5_SET64(modify_rq_in, in, modify_bitmask, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_SCATTER_FCS); + MLX5_SET(rqc, rqc, scatter_fcs, enable); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RDY); + + err = mlx5_core_modify_rq(mdev, rq->rqn, in, inlen); + + kvfree(in); + + return err; +} + static int mlx5e_modify_rq_vsd(struct mlx5e_rq *rq, bool vsd) { struct mlx5e_channel *c = rq->channel; @@ -1834,6 +1865,7 @@ static void mlx5e_build_rq_param(struct mlx5e_priv *priv, MLX5_SET(wq, wq, pd, priv->mdev->mlx5e_res.pdn); MLX5_SET(rqc, rqc, counter_set_id, priv->q_counter); MLX5_SET(rqc, rqc, vsd, params->vlan_strip_disable); + MLX5_SET(rqc, rqc, scatter_fcs, params->scatter_fcs_en); param->wq.buf_numa_node = dev_to_node(&priv->mdev->pdev->dev); param->wq.linear = 1; @@ -2904,6 +2936,20 @@ void 
mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv) mlx5e_destroy_tir(priv->mdev, &priv->direct_tir[i]); } +static int mlx5e_modify_channels_scatter_fcs(struct mlx5e_channels *chs, bool enable) +{ + int err = 0; + int i; + + for (i = 0; i < chs->num; i++) { + err = mlx5e_modify_rq_scatter_fcs(&chs->c[i]->rq, enable); + if (err) + return err; + } + + return 0; +} + int mlx5e_modify_channels_vsd(struct mlx5e_channels *chs, bool vsd) { int err = 0; @@ -3121,6 +3167,23 @@ static int set_feature_rx_all(struct net_device *netdev, bool enable) return mlx5_set_port_fcs(mdev, !enable); } +static int set_feature_rx_fcs(struct net_device *netdev, bool enable) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + int err; + + mutex_lock(&priv->state_lock); + + priv->channels.params.scatter_fcs_en = enable; + err = mlx5e_modify_channels_scatter_fcs(&priv->channels, enable); + if (err) + priv->channels.params.scatter_fcs_en = !enable; + + mutex_unlock(&priv->state_lock); + + return err; +} + static int set_feature_rx_vlan(struct net_device *netdev, bool enable) { struct mlx5e_priv *priv = netdev_priv(netdev); @@ -3194,6 +3257,8 @@ static int mlx5e_set_features(struct net_device *netdev, set_feature_tc_num_filters); err |= mlx5e_handle_feature(netdev, features, NETIF_F_RXALL, set_feature_rx_all); + err |= mlx5e_handle_feature(netdev, features, NETIF_F_RXFCS, + set_feature_rx_fcs); err |= mlx5e_handle_feature(netdev, features, NETIF_F_HW_VLAN_CTAG_RX, set_feature_rx_vlan); #ifdef CONFIG_RFS_ACCEL @@ -3908,6 +3973,9 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) if (fcs_supported) netdev->hw_features |= NETIF_F_RXALL; + if (MLX5_CAP_ETH(mdev, scatter_fcs)) + netdev->hw_features |= NETIF_F_RXFCS; + netdev->features = netdev->hw_features; if (!priv->channels.params.lro_en) netdev->features &= ~NETIF_F_LRO; @@ -3915,6 +3983,9 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev) if (fcs_enabled) netdev->features &= ~NETIF_F_RXALL; + if (!priv->channels.params.scatter_fcs_en) + netdev->features &= ~NETIF_F_RXFCS; + #define FT_CAP(f) MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive.f) if (FT_CAP(flow_modify_en) && FT_CAP(modify_root) && diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 56bc842b0620..1993adbd2c82 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -5122,6 +5122,7 @@ struct mlx5_ifc_modify_rq_out_bits { enum { MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_VSD = 1ULL << 1, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_SCATTER_FCS = 1ULL << 2, MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID = 1ULL << 3, }; -- cgit v1.2.3 From 6118714275f0a313ecc296a87ed1af32d9691bed Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Thu, 30 Mar 2017 09:16:39 -0700 Subject: pinctrl: core: Fix pinctrl_register_and_init() with pinctrl_enable() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent pinctrl changes to allow dynamic allocation of pins exposed one more issue with the pinctrl pins claimed early by the controller itself. This caused a regression for IMX6 pinctrl hogs. Before enabling the pin controller driver we need to wait until it has been properly initialized, then claim the hogs, and only then enable it. To fix the regression, split the code into pinctrl_claim_hogs() and pinctrl_enable(). And then let's require that pinctrl_enable() is always called by the pin controller driver when ready after calling pinctrl_register_and_init(). 
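For a driver that is not converted by this patch, the resulting probe flow
looks roughly as follows. This is only a sketch of the intended call order,
not code from the patch: the foo_* names are hypothetical, and the devm
variant is used the same way the sh-pfc conversion below uses it:

	static int foo_pinctrl_probe(struct platform_device *pdev)
	{
		struct pinctrl_dev *pctl;
		int error;

		/* Register the controller, but do not claim hogs yet. */
		error = devm_pinctrl_register_and_init(&pdev->dev, &foo_desc,
						       NULL, &pctl);
		if (error)
			return error;

		/* ... finish any setup that the hog pins depend on ... */

		/* Claim the hogs and make the controller usable. */
		return pinctrl_enable(pctl);
	}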
Depends-on: 950b0d91dc10 ("pinctrl: core: Fix regression caused by delayed work for hogs") Fixes: df61b366af26 ("pinctrl: core: Use delayed work for hogs") Fixes: e566fc11ea76 ("pinctrl: imx: use generic pinctrl helpers for managing groups") Cc: Haojian Zhuang Cc: Masahiro Yamada Cc: Mika Penttilä Cc: Mika Westerberg Cc: Nishanth Menon Cc: Shawn Guo Cc: Stefan Agner Tested-by: Geert Uytterhoeven Tested-by: Gary Bisson Tested-by: Fabio Estevam Signed-off-by: Tony Lindgren Signed-off-by: Linus Walleij --- Documentation/pinctrl.txt | 8 ++- drivers/pinctrl/core.c | 97 +++++++++++++++++++++------------ drivers/pinctrl/freescale/pinctrl-imx.c | 2 +- drivers/pinctrl/pinctrl-single.c | 2 +- drivers/pinctrl/sh-pfc/pinctrl.c | 11 +++- drivers/pinctrl/ti/pinctrl-ti-iodelay.c | 2 + include/linux/pinctrl/pinctrl.h | 3 +- 7 files changed, 83 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/Documentation/pinctrl.txt b/Documentation/pinctrl.txt index 54bd5faa8782..f2af35f6d6b2 100644 --- a/Documentation/pinctrl.txt +++ b/Documentation/pinctrl.txt @@ -77,9 +77,15 @@ static struct pinctrl_desc foo_desc = { int __init foo_probe(void) { + int error; + struct pinctrl_dev *pctl; - return pinctrl_register_and_init(&foo_desc, <PARENT>, NULL, &pctl); + error = pinctrl_register_and_init(&foo_desc, <PARENT>, NULL, &pctl); + if (error) + return error; + + return pinctrl_enable(pctl); } To enable the pinctrl subsystem and the subgroups for PINMUX and PINCONF and diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c index d69046537b75..32822b0d9cd0 100644 --- a/drivers/pinctrl/core.c +++ b/drivers/pinctrl/core.c @@ -2010,29 +2010,57 @@ out_err: return ERR_PTR(ret); } -static int pinctrl_create_and_start(struct pinctrl_dev *pctldev) +static int pinctrl_claim_hogs(struct pinctrl_dev *pctldev) { pctldev->p = create_pinctrl(pctldev->dev, pctldev); - if (!IS_ERR(pctldev->p)) { - kref_get(&pctldev->p->users); - pctldev->hog_default = - pinctrl_lookup_state(pctldev->p, PINCTRL_STATE_DEFAULT); - if (IS_ERR(pctldev->hog_default)) { - dev_dbg(pctldev->dev, - "failed to lookup the default state\n"); - } else { - if (pinctrl_select_state(pctldev->p, - pctldev->hog_default)) - dev_err(pctldev->dev, - "failed to select default state\n"); - } + if (PTR_ERR(pctldev->p) == -ENODEV) { + dev_dbg(pctldev->dev, "no hogs found\n"); - pctldev->hog_sleep = - pinctrl_lookup_state(pctldev->p, - PINCTRL_STATE_SLEEP); - if (IS_ERR(pctldev->hog_sleep)) - dev_dbg(pctldev->dev, - "failed to lookup the sleep state\n"); + return 0; + } + + if (IS_ERR(pctldev->p)) { + dev_err(pctldev->dev, "error claiming hogs: %li\n", + PTR_ERR(pctldev->p)); + + return PTR_ERR(pctldev->p); + } + + kref_get(&pctldev->p->users); + pctldev->hog_default = + pinctrl_lookup_state(pctldev->p, PINCTRL_STATE_DEFAULT); + if (IS_ERR(pctldev->hog_default)) { + dev_dbg(pctldev->dev, + "failed to lookup the default state\n"); + } else { + if (pinctrl_select_state(pctldev->p, + pctldev->hog_default)) + dev_err(pctldev->dev, + "failed to select default state\n"); + } + + pctldev->hog_sleep = + pinctrl_lookup_state(pctldev->p, + PINCTRL_STATE_SLEEP); + if (IS_ERR(pctldev->hog_sleep)) + dev_dbg(pctldev->dev, + "failed to lookup the sleep state\n"); + + return 0; +} + +int pinctrl_enable(struct pinctrl_dev *pctldev) +{ + int error; + + error = pinctrl_claim_hogs(pctldev); + if (error) { + dev_err(pctldev->dev, "could not claim hogs: %i\n", + error); + mutex_destroy(&pctldev->mutex); + kfree(pctldev); + + return error; } mutex_lock(&pinctrldev_list_mutex); @@ -2043,6 
+2071,7 @@ static int pinctrl_create_and_start(struct pinctrl_dev *pctldev) return 0; } +EXPORT_SYMBOL_GPL(pinctrl_enable); /** * pinctrl_register() - register a pin controller device @@ -2065,25 +2094,30 @@ struct pinctrl_dev *pinctrl_register(struct pinctrl_desc *pctldesc, if (IS_ERR(pctldev)) return pctldev; - error = pinctrl_create_and_start(pctldev); - if (error) { - mutex_destroy(&pctldev->mutex); - kfree(pctldev); - + error = pinctrl_enable(pctldev); + if (error) return ERR_PTR(error); - } return pctldev; } EXPORT_SYMBOL_GPL(pinctrl_register); +/** + * pinctrl_register_and_init() - register and init pin controller device + * @pctldesc: descriptor for this pin controller + * @dev: parent device for this pin controller + * @driver_data: private pin controller data for this pin controller + * @pctldev: pin controller device + * + * Note that pinctrl_enable() still needs to be manually called after + * this once the driver is ready. + */ int pinctrl_register_and_init(struct pinctrl_desc *pctldesc, struct device *dev, void *driver_data, struct pinctrl_dev **pctldev) { struct pinctrl_dev *p; - int error; p = pinctrl_init_controller(pctldesc, dev, driver_data); if (IS_ERR(p)) @@ -2097,15 +2131,6 @@ int pinctrl_register_and_init(struct pinctrl_desc *pctldesc, */ *pctldev = p; - error = pinctrl_create_and_start(p); - if (error) { - mutex_destroy(&p->mutex); - kfree(p); - *pctldev = NULL; - - return error; - } - return 0; } EXPORT_SYMBOL_GPL(pinctrl_register_and_init); diff --git a/drivers/pinctrl/freescale/pinctrl-imx.c b/drivers/pinctrl/freescale/pinctrl-imx.c index a7ace9e1ad81..74bd90dfd7b1 100644 --- a/drivers/pinctrl/freescale/pinctrl-imx.c +++ b/drivers/pinctrl/freescale/pinctrl-imx.c @@ -790,7 +790,7 @@ int imx_pinctrl_probe(struct platform_device *pdev, dev_info(&pdev->dev, "initialized IMX pinctrl driver\n"); - return 0; + return pinctrl_enable(ipctl->pctl); free: imx_free_resources(ipctl); diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c index 8b2d45e85bae..9c267dcda094 100644 --- a/drivers/pinctrl/pinctrl-single.c +++ b/drivers/pinctrl/pinctrl-single.c @@ -1781,7 +1781,7 @@ static int pcs_probe(struct platform_device *pdev) dev_info(pcs->dev, "%i pins at pa %p size %u\n", pcs->desc.npins, pcs->base, pcs->size); - return 0; + return pinctrl_enable(pcs->pctl); free: pcs_free_resources(pcs); diff --git a/drivers/pinctrl/sh-pfc/pinctrl.c b/drivers/pinctrl/sh-pfc/pinctrl.c index 08150a321be6..a70157f0acf4 100644 --- a/drivers/pinctrl/sh-pfc/pinctrl.c +++ b/drivers/pinctrl/sh-pfc/pinctrl.c @@ -816,6 +816,13 @@ int sh_pfc_register_pinctrl(struct sh_pfc *pfc) pmx->pctl_desc.pins = pmx->pins; pmx->pctl_desc.npins = pfc->info->nr_pins; - return devm_pinctrl_register_and_init(pfc->dev, &pmx->pctl_desc, pmx, - &pmx->pctl); + ret = devm_pinctrl_register_and_init(pfc->dev, &pmx->pctl_desc, pmx, + &pmx->pctl); + if (ret) { + dev_err(pfc->dev, "could not register: %i\n", ret); + + return ret; + } + + return pinctrl_enable(pmx->pctl); } diff --git a/drivers/pinctrl/ti/pinctrl-ti-iodelay.c b/drivers/pinctrl/ti/pinctrl-ti-iodelay.c index 717e3404900c..362c50918c13 100644 --- a/drivers/pinctrl/ti/pinctrl-ti-iodelay.c +++ b/drivers/pinctrl/ti/pinctrl-ti-iodelay.c @@ -893,6 +893,8 @@ static int ti_iodelay_probe(struct platform_device *pdev) platform_set_drvdata(pdev, iod); + return pinctrl_enable(iod->pctl); + exit_out: of_node_put(np); return ret; diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index 8ce2d87a238b..5e45385c5bdc 100644 --- 
a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -145,8 +145,9 @@ struct pinctrl_desc { extern int pinctrl_register_and_init(struct pinctrl_desc *pctldesc, struct device *dev, void *driver_data, struct pinctrl_dev **pctldev); +extern int pinctrl_enable(struct pinctrl_dev *pctldev); -/* Please use pinctrl_register_and_init() instead */ +/* Please use pinctrl_register_and_init() and pinctrl_enable() instead */ extern struct pinctrl_dev *pinctrl_register(struct pinctrl_desc *pctldesc, struct device *dev, void *driver_data); -- cgit v1.2.3 From 4c0facddb7d88c78c8bd977c16faa647f079ccda Mon Sep 17 00:00:00 2001 From: Laxman Dewangan Date: Thu, 6 Apr 2017 19:05:52 +0530 Subject: gpio: core: Decouple open drain/source flag with active low/high Currently, the GPIO interface is said to be Open Drain if it is Single Ended and active LOW. Similarly, it is said to be Open Source if it is Single Ended and active HIGH. The active HIGH/LOW is used in the interface for setting the pin state to HIGH or LOW when enabling/disabling the interface. In an Open Drain interface, the pin is set to HIGH by putting it in high impedance and to LOW by driving it LOW. In an Open Source interface, the pin is set to HIGH by driving it HIGH and to LOW by putting it in high impedance. Given the above, Open Drain/Source is unrelated to active LOW/HIGH in the interface. There are interfaces whose enable/disable is either active LOW or active HIGH but which are still Open Drain type. Hence decouple Open Drain from Single Ended + Active LOW, and Open Source from Single Ended + Active HIGH. Add separate flags for Open Drain/Open Source which are valid only when the Single Ended flag is enabled. Signed-off-by: Laxman Dewangan Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib-of.c | 2 +- drivers/gpio/gpiolib.c | 4 +++- include/dt-bindings/gpio/gpio.h | 12 ++++++++---- include/linux/of_gpio.h | 1 + 4 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 975b9f6cf408..b13b7c7c335f 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -147,7 +147,7 @@ struct gpio_desc *of_find_gpio(struct device *dev, const char *con_id, *flags |= GPIO_ACTIVE_LOW; if (of_flags & OF_GPIO_SINGLE_ENDED) { - if (of_flags & OF_GPIO_ACTIVE_LOW) + if (of_flags & OF_GPIO_OPEN_DRAIN) *flags |= GPIO_OPEN_DRAIN; else *flags |= GPIO_OPEN_SOURCE; diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index c788b55dfe85..d11eb2abfced 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -3340,6 +3340,7 @@ struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode, unsigned long lflags = 0; bool active_low = false; bool single_ended = false; + bool open_drain = false; int ret; if (!fwnode) @@ -3353,6 +3354,7 @@ struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode, if (!IS_ERR(desc)) { active_low = flags & OF_GPIO_ACTIVE_LOW; single_ended = flags & OF_GPIO_SINGLE_ENDED; + open_drain = flags & OF_GPIO_OPEN_DRAIN; } } else if (is_acpi_node(fwnode)) { struct acpi_gpio_info info; @@ -3373,7 +3375,7 @@ struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode, lflags |= GPIO_ACTIVE_LOW; if (single_ended) { - if (active_low) + if (open_drain) lflags |= GPIO_OPEN_DRAIN; else lflags |= GPIO_OPEN_SOURCE; diff --git a/include/dt-bindings/gpio/gpio.h b/include/dt-bindings/gpio/gpio.h index c673d2c87c60..b4f54da694eb 100644 --- a/include/dt-bindings/gpio/gpio.h +++ 
b/include/dt-bindings/gpio/gpio.h @@ -17,11 +17,15 @@ #define GPIO_PUSH_PULL 0 #define GPIO_SINGLE_ENDED 2 +/* Bit 2 express Open drain or open source */ +#define GPIO_LINE_OPEN_SOURCE 0 +#define GPIO_LINE_OPEN_DRAIN 4 + /* - * Open Drain/Collector is the combination of single-ended active low, - * Open Source/Emitter is the combination of single-ended active high. + * Open Drain/Collector is the combination of single-ended open drain interface. + * Open Source/Emitter is the combination of single-ended open source interface. */ -#define GPIO_OPEN_DRAIN (GPIO_SINGLE_ENDED | GPIO_ACTIVE_LOW) -#define GPIO_OPEN_SOURCE (GPIO_SINGLE_ENDED | GPIO_ACTIVE_HIGH) +#define GPIO_OPEN_DRAIN (GPIO_SINGLE_ENDED | GPIO_LINE_OPEN_DRAIN) +#define GPIO_OPEN_SOURCE (GPIO_SINGLE_ENDED | GPIO_LINE_OPEN_SOURCE) #endif diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h index 3f87ea5b8bee..1e089d5a182b 100644 --- a/include/linux/of_gpio.h +++ b/include/linux/of_gpio.h @@ -30,6 +30,7 @@ struct device_node; enum of_gpio_flags { OF_GPIO_ACTIVE_LOW = 0x1, OF_GPIO_SINGLE_ENDED = 0x2, + OF_GPIO_OPEN_DRAIN = 0x4, }; #ifdef CONFIG_OF_GPIO -- cgit v1.2.3 From 404123c2db798027e852480ed9c4accef9f1d9e6 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 29 Mar 2017 19:06:20 +0300 Subject: virtio: allow drivers to validate features Some drivers can't support all features in all configurations. At the moment we blindly set FEATURES_OK and later FAILED. Support this better by adding a callback drivers can use to do some early checks. Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio.c | 6 ++++++ include/linux/virtio.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 400d70b69379..48230a5e12f2 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -232,6 +232,12 @@ static int virtio_dev_probe(struct device *_d) if (device_features & (1ULL << i)) __virtio_set_bit(dev, i); + if (drv->validate) { + err = drv->validate(dev); + if (err) + goto err; + } + err = virtio_finalize_features(dev); if (err) goto err; diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 04b0d3f95043..7edfbdb55a99 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -167,6 +167,7 @@ struct virtio_driver { unsigned int feature_table_size; const unsigned int *feature_table_legacy; unsigned int feature_table_size_legacy; + int (*validate)(struct virtio_device *dev); int (*probe)(struct virtio_device *dev); void (*scan)(struct virtio_device *dev); void (*remove)(struct virtio_device *dev); -- cgit v1.2.3 From dedb67c4b4e5fa2e6e149a2ce93e7848aaa9d762 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 28 Mar 2017 22:27:32 +0530 Subject: netfilter: Add nfnl_msg_type() helper function Add and use the nfnl_msg_type() helper function to replace the open-coded nfnetlink message type computation. I suggested this change; Arushi Singhal made an initial patch to address it, but it missed several spots. 
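The helper simply packs the subsystem identifier into the upper byte of the
nfnetlink message type, so each conversion is mechanical. As an illustration
of the pattern (a sketch; the actual hunks are in the diff below):

	/* before: open-coded shift */
	event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_NEW);

	/* after: the helper makes the intent explicit */
	event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_NEW);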
Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 5 +++++ net/netfilter/ipset/ip_set_core.c | 2 +- net/netfilter/nf_conntrack_netlink.c | 16 +++++++++------- net/netfilter/nf_tables_api.c | 20 +++++++++----------- net/netfilter/nf_tables_trace.c | 3 ++- net/netfilter/nfnetlink_acct.c | 2 +- net/netfilter/nfnetlink_cthelper.c | 2 +- net/netfilter/nfnetlink_cttimeout.c | 4 ++-- net/netfilter/nfnetlink_log.c | 2 +- net/netfilter/nfnetlink_queue.c | 2 +- net/netfilter/nft_compat.c | 2 +- 11 files changed, 33 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 1b49209dd5c7..996711d8a7b4 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -41,6 +41,11 @@ int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error); int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, int flags); +static inline u16 nfnl_msg_type(u8 subsys, u8 msg_type) +{ + return subsys << 8 | msg_type; +} + void nfnl_lock(__u8 subsys_id); void nfnl_unlock(__u8 subsys_id); #ifdef CONFIG_PROVE_LOCKING diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index c296f9b606d4..731ba9c0cf9b 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -769,7 +769,7 @@ start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags, struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; - nlh = nlmsg_put(skb, portid, seq, cmd | (NFNL_SUBSYS_IPSET << 8), + nlh = nlmsg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd), sizeof(*nfmsg), flags); if (!nlh) return NULL; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index cd0a6d270ebe..773d2187a5ea 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -467,7 +467,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct nlattr *nest_parms; unsigned int flags = portid ? NLM_F_MULTI : 0, event; - event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_NEW); + event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_NEW); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -652,7 +652,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) if (skb == NULL) goto errout; - type |= NFNL_SUBSYS_CTNETLINK << 8; + type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, type); nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -1983,7 +1983,8 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0, event; - event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS_CPU); + event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, + IPCTNL_MSG_CT_GET_STATS_CPU); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -2066,7 +2067,7 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, unsigned int flags = portid ? 
NLM_F_MULTI : 0, event; unsigned int nr_conntracks = atomic_read(&net->ct.count); - event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS); + event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -2576,7 +2577,7 @@ ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq, struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; - event |= NFNL_SUBSYS_CTNETLINK_EXP << 8; + event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -2627,7 +2628,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) if (skb == NULL) goto errout; - type |= NFNL_SUBSYS_CTNETLINK_EXP << 8; + type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, type); nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -3212,7 +3213,8 @@ ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu, struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0, event; - event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_EXP_GET_STATS_CPU); + event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, + IPCTNL_MSG_EXP_GET_STATS_CPU); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index bf52acfe4eff..7ba76da96cc2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -438,7 +438,7 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; - event |= NFNL_SUBSYS_NFTABLES << 8; + event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); if (nlh == NULL) goto nla_put_failure; @@ -989,7 +989,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; - event |= NFNL_SUBSYS_NFTABLES << 8; + event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); if (nlh == NULL) goto nla_put_failure; @@ -1885,10 +1885,9 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, const struct nft_expr *expr, *next; struct nlattr *list; const struct nft_rule *prule; - int type = event | NFNL_SUBSYS_NFTABLES << 8; + u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), - flags); + nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), flags); if (nlh == NULL) goto nla_put_failure; @@ -2645,7 +2644,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, u32 portid = ctx->portid; u32 seq = ctx->seq; - event |= NFNL_SUBSYS_NFTABLES << 8; + event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); if (nlh == NULL) @@ -3395,8 +3394,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (IS_ERR(set)) return PTR_ERR(set); - event = NFT_MSG_NEWSETELEM; - event |= NFNL_SUBSYS_NFTABLES << 8; + event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWSETELEM); portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; @@ -3481,7 +3479,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, 
struct nlattr *nest; int err; - event |= NFNL_SUBSYS_NFTABLES << 8; + event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); if (nlh == NULL) @@ -4253,7 +4251,7 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; - event |= NFNL_SUBSYS_NFTABLES << 8; + event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); if (nlh == NULL) goto nla_put_failure; @@ -4526,7 +4524,7 @@ static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; - int event = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWGEN; + int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), 0); if (nlh == NULL) diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 12eb9041dca2..e1b15e7a5793 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -169,7 +169,7 @@ void nft_trace_notify(struct nft_traceinfo *info) struct nlmsghdr *nlh; struct sk_buff *skb; unsigned int size; - int event = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_TRACE; + u16 event; if (!nfnetlink_has_listeners(nft_net(pkt), NFNLGRP_NFTRACE)) return; @@ -198,6 +198,7 @@ void nft_trace_notify(struct nft_traceinfo *info) if (!skb) return; + event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_TRACE); nlh = nlmsg_put(skb, 0, 0, event, sizeof(struct nfgenmsg), 0); if (!nlh) goto nla_put_failure; diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index c86da174a5fc..1b9a5d6099dc 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -139,7 +139,7 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, u64 pkts, bytes; u32 old_flags; - event |= NFNL_SUBSYS_ACCT << 8; + event = nfnl_msg_type(NFNL_SUBSYS_ACCT, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index d5025cc25df3..9a50bf93dd16 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -507,7 +507,7 @@ nfnl_cthelper_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, unsigned int flags = portid ? NLM_F_MULTI : 0; int status; - event |= NFNL_SUBSYS_CTHELPER << 8; + event = nfnl_msg_type(NFNL_SUBSYS_CTHELPER, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 57c2cdf7b691..0927a6ae6177 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -158,7 +158,7 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, unsigned int flags = portid ? NLM_F_MULTI : 0; struct nf_conntrack_l4proto *l4proto = timeout->l4proto; - event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8; + event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -431,7 +431,7 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, struct nfgenmsg *nfmsg; unsigned int flags = portid ? 
NLM_F_MULTI : 0; - event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8; + event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index ecd857b75ffe..e7648e90d162 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -411,7 +411,7 @@ __build_packet_message(struct nfnl_log_net *log, const unsigned char *hwhdrp; nlh = nlmsg_put(inst->skb, 0, 0, - NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET, + nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET), sizeof(struct nfgenmsg), 0); if (!nlh) return -1; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 933509ebf3d3..05e82004ab62 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -447,7 +447,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, } nlh = nlmsg_put(skb, 0, 0, - NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, + nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET), sizeof(struct nfgenmsg), 0); if (!nlh) { skb_tx_error(entskb); diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index f443f9d22faf..ed969caf01be 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -504,7 +504,7 @@ nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; - event |= NFNL_SUBSYS_NFT_COMPAT << 8; + event = nfnl_msg_type(NFNL_SUBSYS_NFT_COMPAT, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; -- cgit v1.2.3 From ad6260da1e23cf937806e42c8490af3ff4530474 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 27 Mar 2017 14:30:40 +0200 Subject: KVM: x86: drop legacy device assignment Legacy device assignment has been deprecated since 4.2 (released 1.5 years ago). VFIO is better and everyone should have switched to it. If they haven't, this should convince them. :) Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 204 ------- arch/x86/kvm/Kconfig | 12 - arch/x86/kvm/Makefile | 2 - arch/x86/kvm/assigned-dev.c | 1058 ------------------------------------- arch/x86/kvm/assigned-dev.h | 32 -- arch/x86/kvm/iommu.c | 356 ------------- arch/x86/kvm/x86.c | 14 +- include/linux/kvm_host.h | 16 - virt/kvm/kvm_main.c | 17 - 9 files changed, 1 insertion(+), 1710 deletions(-) delete mode 100644 arch/x86/kvm/assigned-dev.c delete mode 100644 arch/x86/kvm/assigned-dev.h delete mode 100644 arch/x86/kvm/iommu.c (limited to 'include/linux') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 753e88e5eb2a..1a184843bf9c 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1326,130 +1326,6 @@ The flags bitmap is defined as: /* the host supports the ePAPR idle hcall #define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) -4.48 KVM_ASSIGN_PCI_DEVICE (deprecated) - -Capability: none -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_pci_dev (in) -Returns: 0 on success, -1 on error - -Assigns a host PCI device to the VM. - -struct kvm_assigned_pci_dev { - __u32 assigned_dev_id; - __u32 busnr; - __u32 devfn; - __u32 flags; - __u32 segnr; - union { - __u32 reserved[11]; - }; -}; - -The PCI device is specified by the triple segnr, busnr, and devfn. 
-Identification in succeeding service requests is done via assigned_dev_id. The -following flags are specified: - -/* Depends on KVM_CAP_IOMMU */ -#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) -/* The following two depend on KVM_CAP_PCI_2_3 */ -#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) -#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) - -If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts -via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other -assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the -guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details. - -The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure -isolation of the device. Usages not specifying this flag are deprecated. - -Only PCI header type 0 devices with PCI BAR resources are supported by -device assignment. The user requesting this ioctl must have read/write -access to the PCI sysfs resource files associated with the device. - -Errors: - ENOTTY: kernel does not support this ioctl - - Other error conditions may be defined by individual device types or - have their standard meanings. - - -4.49 KVM_DEASSIGN_PCI_DEVICE (deprecated) - -Capability: none -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_pci_dev (in) -Returns: 0 on success, -1 on error - -Ends PCI device assignment, releasing all associated resources. - -See KVM_ASSIGN_PCI_DEVICE for the data structure. Only assigned_dev_id is -used in kvm_assigned_pci_dev to identify the device. - -Errors: - ENOTTY: kernel does not support this ioctl - - Other error conditions may be defined by individual device types or - have their standard meanings. - -4.50 KVM_ASSIGN_DEV_IRQ (deprecated) - -Capability: KVM_CAP_ASSIGN_DEV_IRQ -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_irq (in) -Returns: 0 on success, -1 on error - -Assigns an IRQ to a passed-through device. - -struct kvm_assigned_irq { - __u32 assigned_dev_id; - __u32 host_irq; /* ignored (legacy field) */ - __u32 guest_irq; - __u32 flags; - union { - __u32 reserved[12]; - }; -}; - -The following flags are defined: - -#define KVM_DEV_IRQ_HOST_INTX (1 << 0) -#define KVM_DEV_IRQ_HOST_MSI (1 << 1) -#define KVM_DEV_IRQ_HOST_MSIX (1 << 2) - -#define KVM_DEV_IRQ_GUEST_INTX (1 << 8) -#define KVM_DEV_IRQ_GUEST_MSI (1 << 9) -#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10) - -It is not valid to specify multiple types per host or guest IRQ. However, the -IRQ type of host and guest can differ or can even be null. - -Errors: - ENOTTY: kernel does not support this ioctl - - Other error conditions may be defined by individual device types or - have their standard meanings. - - -4.51 KVM_DEASSIGN_DEV_IRQ (deprecated) - -Capability: KVM_CAP_ASSIGN_DEV_IRQ -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_irq (in) -Returns: 0 on success, -1 on error - -Ends an IRQ assignment to a passed-through device. - -See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified -by assigned_dev_id, flags must correspond to the IRQ type specified on -KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest IRQ is allowed. - - 4.52 KVM_SET_GSI_ROUTING Capability: KVM_CAP_IRQ_ROUTING @@ -1536,52 +1412,6 @@ struct kvm_irq_routing_hv_sint { __u32 sint; }; -4.53 KVM_ASSIGN_SET_MSIX_NR (deprecated) - -Capability: none -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_msix_nr (in) -Returns: 0 on success, -1 on error - -Set the number of MSI-X interrupts for an assigned device. 
The number is -reset again by terminating the MSI-X assignment of the device via -KVM_DEASSIGN_DEV_IRQ. Calling this service more than once at any earlier -point will fail. - -struct kvm_assigned_msix_nr { - __u32 assigned_dev_id; - __u16 entry_nr; - __u16 padding; -}; - -#define KVM_MAX_MSIX_PER_DEV 256 - - -4.54 KVM_ASSIGN_SET_MSIX_ENTRY (deprecated) - -Capability: none -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_msix_entry (in) -Returns: 0 on success, -1 on error - -Specifies the routing of an MSI-X assigned device interrupt to a GSI. Setting -the GSI vector to zero means disabling the interrupt. - -struct kvm_assigned_msix_entry { - __u32 assigned_dev_id; - __u32 gsi; - __u16 entry; /* The index of entry in the MSI-X table */ - __u16 padding[3]; -}; - -Errors: - ENOTTY: kernel does not support this ioctl - - Other error conditions may be defined by individual device types or - have their standard meanings. - 4.55 KVM_SET_TSC_KHZ @@ -1733,40 +1563,6 @@ should skip processing the bitmap and just invalidate everything. It must be set to the number of set bits in the bitmap. -4.61 KVM_ASSIGN_SET_INTX_MASK (deprecated) - -Capability: KVM_CAP_PCI_2_3 -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_pci_dev (in) -Returns: 0 on success, -1 on error - -Allows userspace to mask PCI INTx interrupts from the assigned device. The -kernel will not deliver INTx interrupts to the guest between setting and -clearing of KVM_ASSIGN_SET_INTX_MASK via this interface. This enables use of -and emulation of PCI 2.3 INTx disable command register behavior. - -This may be used for both PCI 2.3 devices supporting INTx disable natively and -older devices lacking this support. Userspace is responsible for emulating the -read value of the INTx disable bit in the guest visible PCI command register. -When modifying the INTx disable state, userspace should precede updating the -physical device command register by calling this ioctl to inform the kernel of -the new intended INTx mask state. - -Note that the kernel uses the device INTx disable bit to internally manage the -device interrupt state for PCI 2.3 devices. Reads of this register may -therefore not match the expected value. Writes should always use the guest -intended INTx disable value rather than attempting to read-copy-update the -current physical device state. Races between user and kernel updates to the -INTx disable bit are handled lazily in the kernel. It's possible the device -may generate unintended interrupts, but they will not be injected into the -guest. - -See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified -by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is -evaluated. - - 4.62 KVM_CREATE_SPAPR_TCE Capability: KVM_CAP_SPAPR_TCE diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ab8e32f7b9a8..760433b2574a 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -86,18 +86,6 @@ config KVM_MMU_AUDIT This option adds a R/W kVM module parameter 'mmu_audit', which allows auditing of KVM MMU events at runtime. -config KVM_DEVICE_ASSIGNMENT - bool "KVM legacy PCI device assignment support (DEPRECATED)" - depends on KVM && PCI && IOMMU_API - default n - ---help--- - Provide support for legacy PCI device assignment through KVM. The - kernel now also supports a full featured userspace device driver - framework through VFIO, which supersedes this support and provides - better security. - - If unsure, say N. 
- # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/vhost/Kconfig diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 3bff20710471..09d4b17be022 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -15,8 +15,6 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o page_track.o debugfs.o -kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o - kvm-intel-y += vmx.o pmu_intel.o kvm-amd-y += svm.o pmu_amd.o diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c deleted file mode 100644 index 308b8597c691..000000000000 --- a/arch/x86/kvm/assigned-dev.c +++ /dev/null @@ -1,1058 +0,0 @@ -/* - * Kernel-based Virtual Machine - device assignment support - * - * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "irq.h" -#include "assigned-dev.h" -#include "trace/events/kvm.h" - -struct kvm_assigned_dev_kernel { - struct kvm_irq_ack_notifier ack_notifier; - struct list_head list; - int assigned_dev_id; - int host_segnr; - int host_busnr; - int host_devfn; - unsigned int entries_nr; - int host_irq; - bool host_irq_disabled; - bool pci_2_3; - struct msix_entry *host_msix_entries; - int guest_irq; - struct msix_entry *guest_msix_entries; - unsigned long irq_requested_type; - int irq_source_id; - int flags; - struct pci_dev *dev; - struct kvm *kvm; - spinlock_t intx_lock; - spinlock_t intx_mask_lock; - char irq_name[32]; - struct pci_saved_state *pci_saved_state; -}; - -static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, - int assigned_dev_id) -{ - struct kvm_assigned_dev_kernel *match; - - list_for_each_entry(match, head, list) { - if (match->assigned_dev_id == assigned_dev_id) - return match; - } - return NULL; -} - -static int find_index_from_host_irq(struct kvm_assigned_dev_kernel - *assigned_dev, int irq) -{ - int i, index; - struct msix_entry *host_msix_entries; - - host_msix_entries = assigned_dev->host_msix_entries; - - index = -1; - for (i = 0; i < assigned_dev->entries_nr; i++) - if (irq == host_msix_entries[i].vector) { - index = i; - break; - } - if (index < 0) - printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); - - return index; -} - -static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int ret; - - spin_lock(&assigned_dev->intx_lock); - if (pci_check_and_mask_intx(assigned_dev->dev)) { - assigned_dev->host_irq_disabled = true; - ret = IRQ_WAKE_THREAD; - } else - ret = IRQ_NONE; - spin_unlock(&assigned_dev->intx_lock); - - return ret; -} - -static void -kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, - int vector) -{ - if (unlikely(assigned_dev->irq_requested_type & - KVM_DEV_IRQ_GUEST_INTX)) { - spin_lock(&assigned_dev->intx_mask_lock); - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) - kvm_set_irq(assigned_dev->kvm, - assigned_dev->irq_source_id, vector, 1, - false); - spin_unlock(&assigned_dev->intx_mask_lock); - } else - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - vector, 1, false); -} - -static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) -{ - struct 
kvm_assigned_dev_kernel *assigned_dev = dev_id; - - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - spin_lock_irq(&assigned_dev->intx_lock); - disable_irq_nosync(irq); - assigned_dev->host_irq_disabled = true; - spin_unlock_irq(&assigned_dev->intx_lock); - } - - kvm_assigned_dev_raise_guest_irq(assigned_dev, - assigned_dev->guest_irq); - - return IRQ_HANDLED; -} - -/* - * Deliver an IRQ in an atomic context if we can, or return a failure, - * user can retry in a process context. - * Return value: - * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. - * Other values - No need to retry. - */ -static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, - int level) -{ - struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; - struct kvm_kernel_irq_routing_entry *e; - int ret = -EINVAL; - int idx; - - trace_kvm_set_irq(irq, level, irq_source_id); - - /* - * Injection into either PIC or IOAPIC might need to scan all CPUs, - * which would need to be retried from thread context; when same GSI - * is connected to both PIC and IOAPIC, we'd have to report a - * partial failure here. - * Since there's no easy way to do this, we only support injecting MSI - * which is limited to 1:1 GSI mapping. - */ - idx = srcu_read_lock(&kvm->irq_srcu); - if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { - e = &entries[0]; - ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, - irq, level); - } - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - - -static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int ret = kvm_set_irq_inatomic(assigned_dev->kvm, - assigned_dev->irq_source_id, - assigned_dev->guest_irq, 1); - return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - - kvm_assigned_dev_raise_guest_irq(assigned_dev, - assigned_dev->guest_irq); - - return IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int index = find_index_from_host_irq(assigned_dev, irq); - u32 vector; - int ret = 0; - - if (index >= 0) { - vector = assigned_dev->guest_msix_entries[index].vector; - ret = kvm_set_irq_inatomic(assigned_dev->kvm, - assigned_dev->irq_source_id, - vector, 1); - } - - return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int index = find_index_from_host_irq(assigned_dev, irq); - u32 vector; - - if (index >= 0) { - vector = assigned_dev->guest_msix_entries[index].vector; - kvm_assigned_dev_raise_guest_irq(assigned_dev, vector); - } - - return IRQ_HANDLED; -} - -/* Ack the irq line for an assigned device */ -static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) -{ - struct kvm_assigned_dev_kernel *dev = - container_of(kian, struct kvm_assigned_dev_kernel, - ack_notifier); - - kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false); - - spin_lock(&dev->intx_mask_lock); - - if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) { - bool reassert = false; - - spin_lock_irq(&dev->intx_lock); - /* - * The guest IRQ may be shared so this ack can come from an - * IRQ for another guest device. 
- */ - if (dev->host_irq_disabled) { - if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) - enable_irq(dev->host_irq); - else if (!pci_check_and_unmask_intx(dev->dev)) - reassert = true; - dev->host_irq_disabled = reassert; - } - spin_unlock_irq(&dev->intx_lock); - - if (reassert) - kvm_set_irq(dev->kvm, dev->irq_source_id, - dev->guest_irq, 1, false); - } - - spin_unlock(&dev->intx_mask_lock); -} - -static void deassign_guest_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - if (assigned_dev->ack_notifier.gsi != -1) - kvm_unregister_irq_ack_notifier(kvm, - &assigned_dev->ack_notifier); - - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - assigned_dev->guest_irq, 0, false); - - if (assigned_dev->irq_source_id != -1) - kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); - assigned_dev->irq_source_id = -1; - assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK); -} - -/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ -static void deassign_host_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - /* - * We disable irq here to prevent further events. - * - * Notice this maybe result in nested disable if the interrupt type is - * INTx, but it's OK for we are going to free it. - * - * If this function is a part of VM destroy, please ensure that till - * now, the kvm state is still legal for probably we also have to wait - * on a currently running IRQ handler. - */ - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - int i; - for (i = 0; i < assigned_dev->entries_nr; i++) - disable_irq(assigned_dev->host_msix_entries[i].vector); - - for (i = 0; i < assigned_dev->entries_nr; i++) - free_irq(assigned_dev->host_msix_entries[i].vector, - assigned_dev); - - assigned_dev->entries_nr = 0; - kfree(assigned_dev->host_msix_entries); - kfree(assigned_dev->guest_msix_entries); - pci_disable_msix(assigned_dev->dev); - } else { - /* Deal with MSI and INTx */ - if ((assigned_dev->irq_requested_type & - KVM_DEV_IRQ_HOST_INTX) && - (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - spin_lock_irq(&assigned_dev->intx_lock); - pci_intx(assigned_dev->dev, false); - spin_unlock_irq(&assigned_dev->intx_lock); - synchronize_irq(assigned_dev->host_irq); - } else - disable_irq(assigned_dev->host_irq); - - free_irq(assigned_dev->host_irq, assigned_dev); - - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) - pci_disable_msi(assigned_dev->dev); - } - - assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK); -} - -static int kvm_deassign_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev, - unsigned long irq_requested_type) -{ - unsigned long guest_irq_type, host_irq_type; - - if (!irqchip_in_kernel(kvm)) - return -EINVAL; - /* no irq assignment to deassign */ - if (!assigned_dev->irq_requested_type) - return -ENXIO; - - host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK; - guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK; - - if (host_irq_type) - deassign_host_irq(kvm, assigned_dev); - if (guest_irq_type) - deassign_guest_irq(kvm, assigned_dev); - - return 0; -} - -static void kvm_free_assigned_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); -} - -static void kvm_free_assigned_device(struct kvm *kvm, - struct kvm_assigned_dev_kernel - *assigned_dev) -{ - kvm_free_assigned_irq(kvm, assigned_dev); - - pci_reset_function(assigned_dev->dev); - if 
(pci_load_and_free_saved_state(assigned_dev->dev, - &assigned_dev->pci_saved_state)) - printk(KERN_INFO "%s: Couldn't reload %s saved state\n", - __func__, dev_name(&assigned_dev->dev->dev)); - else - pci_restore_state(assigned_dev->dev); - - pci_clear_dev_assigned(assigned_dev->dev); - - pci_release_regions(assigned_dev->dev); - pci_disable_device(assigned_dev->dev); - pci_dev_put(assigned_dev->dev); - - list_del(&assigned_dev->list); - kfree(assigned_dev); -} - -void kvm_free_all_assigned_devices(struct kvm *kvm) -{ - struct kvm_assigned_dev_kernel *assigned_dev, *tmp; - - list_for_each_entry_safe(assigned_dev, tmp, - &kvm->arch.assigned_dev_head, list) { - kvm_free_assigned_device(kvm, assigned_dev); - } -} - -static int assigned_device_enable_host_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - irq_handler_t irq_handler; - unsigned long flags; - - dev->host_irq = dev->dev->irq; - - /* - * We can only share the IRQ line with other host devices if we are - * able to disable the IRQ source at device-level - independently of - * the guest driver. Otherwise host devices may suffer from unbounded - * IRQ latencies when the guest keeps the line asserted. - */ - if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { - irq_handler = kvm_assigned_dev_intx; - flags = IRQF_SHARED; - } else { - irq_handler = NULL; - flags = IRQF_ONESHOT; - } - if (request_threaded_irq(dev->host_irq, irq_handler, - kvm_assigned_dev_thread_intx, flags, - dev->irq_name, dev)) - return -EIO; - - if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { - spin_lock_irq(&dev->intx_lock); - pci_intx(dev->dev, true); - spin_unlock_irq(&dev->intx_lock); - } - return 0; -} - -static int assigned_device_enable_host_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - int r; - - if (!dev->dev->msi_enabled) { - r = pci_enable_msi(dev->dev); - if (r) - return r; - } - - dev->host_irq = dev->dev->irq; - if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi, - kvm_assigned_dev_thread_msi, 0, - dev->irq_name, dev)) { - pci_disable_msi(dev->dev); - return -EIO; - } - - return 0; -} - -static int assigned_device_enable_host_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - int i, r = -EINVAL; - - /* host_msix_entries and guest_msix_entries should have been - * initialized */ - if (dev->entries_nr == 0) - return r; - - r = pci_enable_msix_exact(dev->dev, - dev->host_msix_entries, dev->entries_nr); - if (r) - return r; - - for (i = 0; i < dev->entries_nr; i++) { - r = request_threaded_irq(dev->host_msix_entries[i].vector, - kvm_assigned_dev_msix, - kvm_assigned_dev_thread_msix, - 0, dev->irq_name, dev); - if (r) - goto err; - } - - return 0; -err: - for (i -= 1; i >= 0; i--) - free_irq(dev->host_msix_entries[i].vector, dev); - pci_disable_msix(dev->dev); - return r; -} - -static int assigned_device_enable_guest_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = irq->guest_irq; - return 0; -} - -static int assigned_device_enable_guest_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = -1; - return 0; -} - -static int assigned_device_enable_guest_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = -1; - return 0; -} - -static int assign_host_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - 
__u32 host_irq_type) -{ - int r = -EEXIST; - - if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) - return r; - - snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", - pci_name(dev->dev)); - - switch (host_irq_type) { - case KVM_DEV_IRQ_HOST_INTX: - r = assigned_device_enable_host_intx(kvm, dev); - break; - case KVM_DEV_IRQ_HOST_MSI: - r = assigned_device_enable_host_msi(kvm, dev); - break; - case KVM_DEV_IRQ_HOST_MSIX: - r = assigned_device_enable_host_msix(kvm, dev); - break; - default: - r = -EINVAL; - } - dev->host_irq_disabled = false; - - if (!r) - dev->irq_requested_type |= host_irq_type; - - return r; -} - -static int assign_guest_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq, - unsigned long guest_irq_type) -{ - int id; - int r = -EEXIST; - - if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) - return r; - - id = kvm_request_irq_source_id(kvm); - if (id < 0) - return id; - - dev->irq_source_id = id; - - switch (guest_irq_type) { - case KVM_DEV_IRQ_GUEST_INTX: - r = assigned_device_enable_guest_intx(kvm, dev, irq); - break; - case KVM_DEV_IRQ_GUEST_MSI: - r = assigned_device_enable_guest_msi(kvm, dev, irq); - break; - case KVM_DEV_IRQ_GUEST_MSIX: - r = assigned_device_enable_guest_msix(kvm, dev, irq); - break; - default: - r = -EINVAL; - } - - if (!r) { - dev->irq_requested_type |= guest_irq_type; - if (dev->ack_notifier.gsi != -1) - kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); - } else { - kvm_free_irq_source_id(kvm, dev->irq_source_id); - dev->irq_source_id = -1; - } - - return r; -} - -/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ -static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, - struct kvm_assigned_irq *assigned_irq) -{ - int r = -EINVAL; - struct kvm_assigned_dev_kernel *match; - unsigned long host_irq_type, guest_irq_type; - - if (!irqchip_in_kernel(kvm)) - return r; - - mutex_lock(&kvm->lock); - r = -ENODEV; - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) - goto out; - - host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); - guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); - - r = -EINVAL; - /* can only assign one type at a time */ - if (hweight_long(host_irq_type) > 1) - goto out; - if (hweight_long(guest_irq_type) > 1) - goto out; - if (host_irq_type == 0 && guest_irq_type == 0) - goto out; - - r = 0; - if (host_irq_type) - r = assign_host_irq(kvm, match, host_irq_type); - if (r) - goto out; - - if (guest_irq_type) - r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); -out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, - struct kvm_assigned_irq - *assigned_irq) -{ - int r = -ENODEV; - struct kvm_assigned_dev_kernel *match; - unsigned long irq_type; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) - goto out; - - irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK | - KVM_DEV_IRQ_GUEST_MASK); - r = kvm_deassign_irq(kvm, match, irq_type); -out: - mutex_unlock(&kvm->lock); - return r; -} - -/* - * We want to test whether the caller has been granted permissions to - * use this device. To be able to configure and control the device, - * the user needs access to PCI configuration space and BAR resources. - * These are accessed through PCI sysfs. 
PCI config space is often - * passed to the process calling this ioctl via file descriptor, so we - * can't rely on access to that file. We can check for permissions - * on each of the BAR resource files, which is a pretty clear - * indicator that the user has been granted access to the device. - */ -static int probe_sysfs_permissions(struct pci_dev *dev) -{ -#ifdef CONFIG_SYSFS - int i; - bool bar_found = false; - - for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) { - char *kpath, *syspath; - struct path path; - struct inode *inode; - int r; - - if (!pci_resource_len(dev, i)) - continue; - - kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); - if (!kpath) - return -ENOMEM; - - /* Per sysfs-rules, sysfs is always at /sys */ - syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i); - kfree(kpath); - if (!syspath) - return -ENOMEM; - - r = kern_path(syspath, LOOKUP_FOLLOW, &path); - kfree(syspath); - if (r) - return r; - - inode = d_backing_inode(path.dentry); - - r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS); - path_put(&path); - if (r) - return r; - - bar_found = true; - } - - /* If no resources, probably something special */ - if (!bar_found) - return -EPERM; - - return 0; -#else - return -EINVAL; /* No way to control the device without sysfs */ -#endif -} - -static int kvm_vm_ioctl_assign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0, idx; - struct kvm_assigned_dev_kernel *match; - struct pci_dev *dev; - - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) - return -EINVAL; - - mutex_lock(&kvm->lock); - idx = srcu_read_lock(&kvm->srcu); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (match) { - /* device already assigned */ - r = -EEXIST; - goto out; - } - - match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); - if (match == NULL) { - printk(KERN_INFO "%s: Couldn't allocate memory\n", - __func__); - r = -ENOMEM; - goto out; - } - dev = pci_get_domain_bus_and_slot(assigned_dev->segnr, - assigned_dev->busnr, - assigned_dev->devfn); - if (!dev) { - printk(KERN_INFO "%s: host device not found\n", __func__); - r = -EINVAL; - goto out_free; - } - - /* Don't allow bridges to be assigned */ - if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) { - r = -EPERM; - goto out_put; - } - - r = probe_sysfs_permissions(dev); - if (r) - goto out_put; - - if (pci_enable_device(dev)) { - printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); - r = -EBUSY; - goto out_put; - } - r = pci_request_regions(dev, "kvm_assigned_device"); - if (r) { - printk(KERN_INFO "%s: Could not get access to device regions\n", - __func__); - goto out_disable; - } - - pci_reset_function(dev); - pci_save_state(dev); - match->pci_saved_state = pci_store_saved_state(dev); - if (!match->pci_saved_state) - printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", - __func__, dev_name(&dev->dev)); - - if (!pci_intx_mask_supported(dev)) - assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3; - - match->assigned_dev_id = assigned_dev->assigned_dev_id; - match->host_segnr = assigned_dev->segnr; - match->host_busnr = assigned_dev->busnr; - match->host_devfn = assigned_dev->devfn; - match->flags = assigned_dev->flags; - match->dev = dev; - spin_lock_init(&match->intx_lock); - spin_lock_init(&match->intx_mask_lock); - match->irq_source_id = -1; - match->kvm = kvm; - match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; - - list_add(&match->list, &kvm->arch.assigned_dev_head); - - if 
(!kvm->arch.iommu_domain) { - r = kvm_iommu_map_guest(kvm); - if (r) - goto out_list_del; - } - r = kvm_assign_device(kvm, match->dev); - if (r) - goto out_list_del; - -out: - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); - return r; -out_list_del: - if (pci_load_and_free_saved_state(dev, &match->pci_saved_state)) - printk(KERN_INFO "%s: Couldn't reload %s saved state\n", - __func__, dev_name(&dev->dev)); - list_del(&match->list); - pci_release_regions(dev); -out_disable: - pci_disable_device(dev); -out_put: - pci_dev_put(dev); -out_free: - kfree(match); - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (!match) { - printk(KERN_INFO "%s: device hasn't been assigned before, " - "so cannot be deassigned\n", __func__); - r = -EINVAL; - goto out; - } - - kvm_deassign_device(kvm, match->dev); - - kvm_free_assigned_device(kvm, match); - -out: - mutex_unlock(&kvm->lock); - return r; -} - - -static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, - struct kvm_assigned_msix_nr *entry_nr) -{ - int r = 0; - struct kvm_assigned_dev_kernel *adev; - - mutex_lock(&kvm->lock); - - adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - entry_nr->assigned_dev_id); - if (!adev) { - r = -EINVAL; - goto msix_nr_out; - } - - if (adev->entries_nr == 0) { - adev->entries_nr = entry_nr->entry_nr; - if (adev->entries_nr == 0 || - adev->entries_nr > KVM_MAX_MSIX_PER_DEV) { - r = -EINVAL; - goto msix_nr_out; - } - - adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * - entry_nr->entry_nr, - GFP_KERNEL); - if (!adev->host_msix_entries) { - r = -ENOMEM; - goto msix_nr_out; - } - adev->guest_msix_entries = - kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, - GFP_KERNEL); - if (!adev->guest_msix_entries) { - kfree(adev->host_msix_entries); - r = -ENOMEM; - goto msix_nr_out; - } - } else /* Not allowed set MSI-X number twice */ - r = -EINVAL; -msix_nr_out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, - struct kvm_assigned_msix_entry *entry) -{ - int r = 0, i; - struct kvm_assigned_dev_kernel *adev; - - mutex_lock(&kvm->lock); - - adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - entry->assigned_dev_id); - - if (!adev) { - r = -EINVAL; - goto msix_entry_out; - } - - for (i = 0; i < adev->entries_nr; i++) - if (adev->guest_msix_entries[i].vector == 0 || - adev->guest_msix_entries[i].entry == entry->entry) { - adev->guest_msix_entries[i].entry = entry->entry; - adev->guest_msix_entries[i].vector = entry->gsi; - adev->host_msix_entries[i].entry = entry->entry; - break; - } - if (i == adev->entries_nr) { - r = -ENOSPC; - goto msix_entry_out; - } - -msix_entry_out: - mutex_unlock(&kvm->lock); - - return r; -} - -static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (!match) { - r = -ENODEV; - goto out; - } - - spin_lock(&match->intx_mask_lock); - - match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX; - match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX; - - if 
(match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { - if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) { - kvm_set_irq(match->kvm, match->irq_source_id, - match->guest_irq, 0, false); - /* - * Masking at hardware-level is performed on demand, - * i.e. when an IRQ actually arrives at the host. - */ - } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - /* - * Unmask the IRQ line if required. Unmasking at - * device level will be performed by user space. - */ - spin_lock_irq(&match->intx_lock); - if (match->host_irq_disabled) { - enable_irq(match->host_irq); - match->host_irq_disabled = false; - } - spin_unlock_irq(&match->intx_lock); - } - } - - spin_unlock(&match->intx_mask_lock); - -out: - mutex_unlock(&kvm->lock); - return r; -} - -long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg) -{ - void __user *argp = (void __user *)arg; - int r; - - switch (ioctl) { - case KVM_ASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_IRQ: { - r = -EOPNOTSUPP; - break; - } - case KVM_ASSIGN_DEV_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } - case KVM_DEASSIGN_DEV_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } - case KVM_DEASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_MSIX_NR: { - struct kvm_assigned_msix_nr entry_nr; - r = -EFAULT; - if (copy_from_user(&entry_nr, argp, sizeof entry_nr)) - goto out; - r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_MSIX_ENTRY: { - struct kvm_assigned_msix_entry entry; - r = -EFAULT; - if (copy_from_user(&entry, argp, sizeof entry)) - goto out; - r = kvm_vm_ioctl_set_msix_entry(kvm, &entry); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_INTX_MASK: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev); - break; - } - default: - r = -ENOTTY; - break; - } -out: - return r; -} diff --git a/arch/x86/kvm/assigned-dev.h b/arch/x86/kvm/assigned-dev.h deleted file mode 100644 index a428c1a211b2..000000000000 --- a/arch/x86/kvm/assigned-dev.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef ARCH_X86_KVM_ASSIGNED_DEV_H -#define ARCH_X86_KVM_ASSIGNED_DEV_H - -#include - -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT -int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev); -int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev); - -int kvm_iommu_map_guest(struct kvm *kvm); -int kvm_iommu_unmap_guest(struct kvm *kvm); - -long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg); - -void kvm_free_all_assigned_devices(struct kvm *kvm); -#else -static inline int kvm_iommu_unmap_guest(struct kvm *kvm) -{ - return 0; -} - -static 
inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg) -{ - return -ENOTTY; -} - -static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {} -#endif /* CONFIG_KVM_DEVICE_ASSIGNMENT */ - -#endif /* ARCH_X86_KVM_ASSIGNED_DEV_H */ diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c deleted file mode 100644 index b181426f67b4..000000000000 --- a/arch/x86/kvm/iommu.c +++ /dev/null @@ -1,356 +0,0 @@ -/* - * Copyright (c) 2006, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * - * Copyright (C) 2006-2008 Intel Corporation - * Copyright IBM Corporation, 2008 - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Author: Allen M. Kay - * Author: Weidong Han - * Author: Ben-Ami Yassour - */ - -#include -#include -#include -#include -#include -#include -#include "assigned-dev.h" - -static bool allow_unsafe_assigned_interrupts; -module_param_named(allow_unsafe_assigned_interrupts, - allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(allow_unsafe_assigned_interrupts, - "Enable device assignment on platforms without interrupt remapping support."); - -static int kvm_iommu_unmap_memslots(struct kvm *kvm); -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages); - -static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, - unsigned long npages) -{ - gfn_t end_gfn; - kvm_pfn_t pfn; - - pfn = gfn_to_pfn_memslot(slot, gfn); - end_gfn = gfn + npages; - gfn += 1; - - if (is_error_noslot_pfn(pfn)) - return pfn; - - while (gfn < end_gfn) - gfn_to_pfn_memslot(slot, gfn++); - - return pfn; -} - -static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn, - unsigned long npages) -{ - unsigned long i; - - for (i = 0; i < npages; ++i) - kvm_release_pfn_clean(pfn + i); -} - -int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - gfn_t gfn, end_gfn; - kvm_pfn_t pfn; - int r = 0; - struct iommu_domain *domain = kvm->arch.iommu_domain; - int flags; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - gfn = slot->base_gfn; - end_gfn = gfn + slot->npages; - - flags = IOMMU_READ; - if (!(slot->flags & KVM_MEM_READONLY)) - flags |= IOMMU_WRITE; - if (!kvm->arch.iommu_noncoherent) - flags |= IOMMU_CACHE; - - - while (gfn < end_gfn) { - unsigned long page_size; - - /* Check if already mapped */ - if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) { - gfn += 1; - continue; - } - - /* Get the page size we could use to map */ - page_size = kvm_host_page_size(kvm, gfn); - - /* Make sure the page_size does not exceed the memslot */ - while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn) - page_size >>= 1; - - /* Make sure gfn is aligned to the page size we want to map */ - while ((gfn << PAGE_SHIFT) & (page_size - 1)) - page_size >>= 1; - - /* Make sure hva is aligned to the page size we want to map */ - while 
(__gfn_to_hva_memslot(slot, gfn) & (page_size - 1)) - page_size >>= 1; - - /* - * Pin all pages we are about to map in memory. This is - * important because we unmap and unpin in 4kb steps later. - */ - pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT); - if (is_error_noslot_pfn(pfn)) { - gfn += 1; - continue; - } - - /* Map into IO address space */ - r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn), - page_size, flags); - if (r) { - printk(KERN_ERR "kvm_iommu_map_address:" - "iommu failed to map pfn=%llx\n", pfn); - kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT); - goto unmap_pages; - } - - gfn += page_size >> PAGE_SHIFT; - - cond_resched(); - } - - return 0; - -unmap_pages: - kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn); - return r; -} - -static int kvm_iommu_map_memslots(struct kvm *kvm) -{ - int idx, r = 0; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - if (kvm->arch.iommu_noncoherent) - kvm_arch_register_noncoherent_dma(kvm); - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) { - r = kvm_iommu_map_pages(kvm, memslot); - if (r) - break; - } - srcu_read_unlock(&kvm->srcu, idx); - - return r; -} - -int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - int r; - bool noncoherent; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - if (pdev == NULL) - return -ENODEV; - - r = iommu_attach_device(domain, &pdev->dev); - if (r) { - dev_err(&pdev->dev, "kvm assign device failed ret %d", r); - return r; - } - - noncoherent = !iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY); - - /* Check if need to update IOMMU page table for guest memory */ - if (noncoherent != kvm->arch.iommu_noncoherent) { - kvm_iommu_unmap_memslots(kvm); - kvm->arch.iommu_noncoherent = noncoherent; - r = kvm_iommu_map_memslots(kvm); - if (r) - goto out_unmap; - } - - kvm_arch_start_assignment(kvm); - pci_set_dev_assigned(pdev); - - dev_info(&pdev->dev, "kvm assign device\n"); - - return 0; -out_unmap: - kvm_iommu_unmap_memslots(kvm); - return r; -} - -int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - if (pdev == NULL) - return -ENODEV; - - iommu_detach_device(domain, &pdev->dev); - - pci_clear_dev_assigned(pdev); - kvm_arch_end_assignment(kvm); - - dev_info(&pdev->dev, "kvm deassign device\n"); - - return 0; -} - -int kvm_iommu_map_guest(struct kvm *kvm) -{ - int r; - - if (!iommu_present(&pci_bus_type)) { - printk(KERN_ERR "%s: iommu not found\n", __func__); - return -ENODEV; - } - - mutex_lock(&kvm->slots_lock); - - kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type); - if (!kvm->arch.iommu_domain) { - r = -ENOMEM; - goto out_unlock; - } - - if (!allow_unsafe_assigned_interrupts && - !iommu_capable(&pci_bus_type, IOMMU_CAP_INTR_REMAP)) { - printk(KERN_WARNING "%s: No interrupt remapping support," - " disallowing device assignment." 
- " Re-enable with \"allow_unsafe_assigned_interrupts=1\"" - " module option.\n", __func__); - iommu_domain_free(kvm->arch.iommu_domain); - kvm->arch.iommu_domain = NULL; - r = -EPERM; - goto out_unlock; - } - - r = kvm_iommu_map_memslots(kvm); - if (r) - kvm_iommu_unmap_memslots(kvm); - -out_unlock: - mutex_unlock(&kvm->slots_lock); - return r; -} - -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages) -{ - struct iommu_domain *domain; - gfn_t end_gfn, gfn; - kvm_pfn_t pfn; - u64 phys; - - domain = kvm->arch.iommu_domain; - end_gfn = base_gfn + npages; - gfn = base_gfn; - - /* check if iommu exists and in use */ - if (!domain) - return; - - while (gfn < end_gfn) { - unsigned long unmap_pages; - size_t size; - - /* Get physical address */ - phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); - - if (!phys) { - gfn++; - continue; - } - - pfn = phys >> PAGE_SHIFT; - - /* Unmap address from IO address space */ - size = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE); - unmap_pages = 1ULL << get_order(size); - - /* Unpin all pages we just unmapped to not leak any memory */ - kvm_unpin_pages(kvm, pfn, unmap_pages); - - gfn += unmap_pages; - - cond_resched(); - } -} - -void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages); -} - -static int kvm_iommu_unmap_memslots(struct kvm *kvm) -{ - int idx; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) - kvm_iommu_unmap_pages(kvm, memslot); - - srcu_read_unlock(&kvm->srcu, idx); - - if (kvm->arch.iommu_noncoherent) - kvm_arch_unregister_noncoherent_dma(kvm); - - return 0; -} - -int kvm_iommu_unmap_guest(struct kvm *kvm) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - mutex_lock(&kvm->slots_lock); - kvm_iommu_unmap_memslots(kvm); - kvm->arch.iommu_domain = NULL; - kvm->arch.iommu_noncoherent = false; - mutex_unlock(&kvm->slots_lock); - - iommu_domain_free(domain); - return 0; -} diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ccbd45ecd41a..1853cda7f6d5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -27,7 +27,6 @@ #include "kvm_cache_regs.h" #include "x86.h" #include "cpuid.h" -#include "assigned-dev.h" #include "pmu.h" #include "hyperv.h" @@ -2675,10 +2674,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT - case KVM_CAP_ASSIGN_DEV_IRQ: - case KVM_CAP_PCI_2_3: -#endif r = 1; break; case KVM_CAP_ADJUST_CLOCK: @@ -2713,11 +2708,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PV_MMU: /* obsolete */ r = 0; break; -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT - case KVM_CAP_IOMMU: - r = iommu_present(&pci_bus_type); - break; -#endif case KVM_CAP_MCE: r = KVM_MAX_MCE_BANKS; break; @@ -4230,7 +4220,7 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } default: - r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); + r = -ENOTTY; } out: return r; @@ -8068,7 +8058,6 @@ void kvm_arch_sync_events(struct kvm *kvm) { cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); - kvm_free_all_assigned_devices(kvm); kvm_free_pit(kvm); } @@ -8152,7 +8141,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } if 
(kvm_x86_ops->vm_destroy) kvm_x86_ops->vm_destroy(kvm); - kvm_iommu_unmap_guest(kvm); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d0250744507a..f1339a7756b3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -877,22 +877,6 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, int kvm_request_irq_source_id(struct kvm *kvm); void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT -int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot); -void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot); -#else -static inline int kvm_iommu_map_pages(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ - return 0; -} - -static inline void kvm_iommu_unmap_pages(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ -} -#endif - /* * search_memslots() and __gfn_to_memslot() are here because they are * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c. diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 88257b311cb5..ff3bf5d26e0b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1019,8 +1019,6 @@ int __kvm_set_memory_region(struct kvm *kvm, old_memslots = install_new_memslots(kvm, as_id, slots); - /* slot was deleted or moved, clear iommu mapping */ - kvm_iommu_unmap_pages(kvm, &old); /* From this point no new shadow pages pointing to a deleted, * or moved, memslot will be created. * @@ -1055,21 +1053,6 @@ int __kvm_set_memory_region(struct kvm *kvm, kvm_free_memslot(kvm, &old, &new); kvfree(old_memslots); - - /* - * IOMMU mapping: New slots need to be mapped. Old slots need to be - * un-mapped and re-mapped if their base changes. Since base change - * unmapping is handled above with slot deletion, mapping alone is - * needed here. Anything else the iommu might care about for existing - * slots (size changes, userspace addr changes and read-only flag - * changes) is disallowed above, so any other attribute changes getting - * here can be skipped. - */ - if (as_id == 0 && (change == KVM_MR_CREATE || change == KVM_MR_MOVE)) { - r = kvm_iommu_map_pages(kvm, &new); - return r; - } - return 0; out_slots: -- cgit v1.2.3 From 4b4357e02523ec63ad853f927f5d93a25101a1d2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 31 Mar 2017 13:53:23 +0200 Subject: kvm: make KVM_COALESCED_MMIO_PAGE_OFFSET public MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Its value has never changed; we might as well make it part of the ABI instead of using the return value of KVM_CHECK_EXTENSION(KVM_CAP_COALESCED_MMIO). Because PPC does not always make MMIO available, the code has to be made dependent on CONFIG_KVM_MMIO rather than KVM_COALESCED_MMIO_PAGE_OFFSET. 
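For illustration only (not part of the patch): with the offset now in the exported uapi headers, a userspace VMM can compute the ring's location directly instead of deriving it from the KVM_CHECK_EXTENSION return value; it must still check KVM_CAP_COALESCED_MMIO for availability, since PPC may not provide the page at all. A minimal sketch, with an illustrative helper name:

	#include <linux/kvm.h>	/* pulls in asm/kvm.h and the page offset */
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <unistd.h>

	static struct kvm_coalesced_mmio_ring *
	map_coalesced_ring(int kvm_fd, int vcpu_fd)
	{
		long page = sysconf(_SC_PAGESIZE);
		long len = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
		char *run;

		/* Still absent on hosts without MMIO support (e.g. some PPC). */
		if (len < 0 || ioctl(kvm_fd, KVM_CHECK_EXTENSION,
				     KVM_CAP_COALESCED_MMIO) <= 0)
			return NULL;

		run = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
			   vcpu_fd, 0);
		if (run == MAP_FAILED)
			return NULL;

		/* The ring page sits at a fixed page offset in the vcpu mapping. */
		return (struct kvm_coalesced_mmio_ring *)
			(run + KVM_COALESCED_MMIO_PAGE_OFFSET * page);
	}
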
Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/arm/include/asm/kvm_host.h | 1 - arch/arm/include/uapi/asm/kvm.h | 2 ++ arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/include/uapi/asm/kvm.h | 2 ++ arch/mips/include/asm/kvm_host.h | 1 - arch/mips/include/uapi/asm/kvm.h | 2 ++ arch/powerpc/include/asm/kvm_host.h | 3 --- arch/powerpc/include/uapi/asm/kvm.h | 3 +++ arch/x86/include/asm/kvm_host.h | 2 -- arch/x86/include/uapi/asm/kvm.h | 3 +++ include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 10 +++++----- 12 files changed, 18 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 31ee468ce667..de67ce647501 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -30,7 +30,6 @@ #define __KVM_HAVE_ARCH_INTC_INITIALIZED #define KVM_USER_MEM_SLOTS 32 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #define KVM_HAVE_ONE_REG #define KVM_HALT_POLL_NS_DEFAULT 500000 diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index 6ebd3e6a1fd1..254a38cace2a 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -27,6 +27,8 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_READONLY_MEM +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + #define KVM_REG_SIZE(id) \ (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e7705e7bb07b..522e4f60976e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -31,7 +31,6 @@ #define __KVM_HAVE_ARCH_INTC_INITIALIZED #define KVM_USER_MEM_SLOTS 512 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #define KVM_HALT_POLL_NS_DEFAULT 500000 #include diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index c2860358ae3e..aa5ab69c1312 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -39,6 +39,8 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_READONLY_MEM +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + #define KVM_REG_SIZE(id) \ (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 5c518c148f9d..2998479fd4e8 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -83,7 +83,6 @@ /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 0 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #define KVM_HALT_POLL_NS_DEFAULT 500000 #ifdef CONFIG_KVM_MIPS_VZ diff --git a/arch/mips/include/uapi/asm/kvm.h b/arch/mips/include/uapi/asm/kvm.h index 3107095d7f0a..0318c6b442ab 100644 --- a/arch/mips/include/uapi/asm/kvm.h +++ b/arch/mips/include/uapi/asm/kvm.h @@ -21,6 +21,8 @@ #define __KVM_HAVE_READONLY_MEM +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + /* * for KVM_GET_REGS and KVM_SET_REGS * diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 7bba8f415627..01d05c76f1c7 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -45,9 +45,6 @@ #define __KVM_HAVE_ARCH_INTC_INITIALIZED -#ifdef CONFIG_KVM_MMIO -#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 -#endif #define KVM_HALT_POLL_NS_DEFAULT 10000 /* 10 us */ /* These values are internal and can be increased later */ diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 4edbe4bb0e8b..07fbeb927834 100644 --- 
a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -29,6 +29,9 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_GUEST_DEBUG +/* Not always available, but if it is, this is the correct offset. */ +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + struct kvm_regs { __u64 pc; __u64 cr; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7dbb8d622683..d962fa998a6f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -43,8 +43,6 @@ #define KVM_PRIVATE_MEM_SLOTS 3 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) -#define KVM_PIO_PAGE_OFFSET 1 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 #define KVM_HALT_POLL_NS_DEFAULT 400000 #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 739c0c594022..c2824d02ba37 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -9,6 +9,9 @@ #include #include +#define KVM_PIO_PAGE_OFFSET 1 +#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 + #define DE_VECTOR 0 #define DB_VECTOR 1 #define BP_VECTOR 3 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f1339a7756b3..7e74ae4d99bb 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -403,7 +403,7 @@ struct kvm { struct kvm_vm_stat stat; struct kvm_arch arch; refcount_t users_count; -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; spinlock_t ring_lock; struct list_head coalesced_zones; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b5dcde10c53b..f489167839c4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2349,7 +2349,7 @@ static int kvm_vcpu_fault(struct vm_fault *vmf) else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) page = virt_to_page(vcpu->arch.pio_data); #endif -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); #endif @@ -2918,7 +2918,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: return 1; -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO case KVM_CAP_COALESCED_MMIO: return KVM_COALESCED_MMIO_PAGE_OFFSET; #endif @@ -2971,7 +2971,7 @@ static long kvm_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_get_dirty_log(kvm, &log); break; } -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO case KVM_REGISTER_COALESCED_MMIO: { struct kvm_coalesced_mmio_zone zone; @@ -3163,7 +3163,7 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) kvm = kvm_create_vm(type); if (IS_ERR(kvm)) return PTR_ERR(kvm); -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO r = kvm_coalesced_mmio_init(kvm); if (r < 0) { kvm_put_kvm(kvm); @@ -3216,7 +3216,7 @@ static long kvm_dev_ioctl(struct file *filp, #ifdef CONFIG_X86 r += PAGE_SIZE; /* pio data page */ #endif -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO r += PAGE_SIZE; /* coalesced mmio ring page */ #endif break; -- cgit v1.2.3 From 54d5329d425650fafaf90660a139c771d2d49cae Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 7 Apr 2017 08:52:27 -0600 Subject: blk-mq-sched: fix crash in switch error path In elevator_switch(), if blk_mq_init_sched() fails, we attempt to fall back to the original scheduler. 
However, at this point, we've already torn down the original scheduler's tags, so this causes a crash. Doing the fallback like the legacy elevator path is much harder for mq, so fix it by just falling back to none, instead. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 13 +++++-- block/blk-mq-sched.h | 2 +- block/blk-mq.c | 2 -- block/blk-sysfs.c | 2 +- block/elevator.c | 94 +++++++++++++++++++++++++++--------------------- include/linux/elevator.h | 2 +- 6 files changed, 67 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 0bb13bb51daa..e8c2ed654ef0 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -451,7 +451,7 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q, return ret; } -void blk_mq_sched_teardown(struct request_queue *q) +static void blk_mq_sched_tags_teardown(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_hw_ctx *hctx; @@ -513,10 +513,19 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) return 0; err: - blk_mq_sched_teardown(q); + blk_mq_sched_tags_teardown(q); + q->elevator = NULL; return ret; } +void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) +{ + if (e->type->ops.mq.exit_sched) + e->type->ops.mq.exit_sched(e); + blk_mq_sched_tags_teardown(q); + q->elevator = NULL; +} + int blk_mq_sched_init(struct request_queue *q) { int ret; diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 19db25e0c95a..e704956e0862 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -33,7 +33,7 @@ void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx, struct request *(*get_rq)(struct blk_mq_hw_ctx *)); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); -void blk_mq_sched_teardown(struct request_queue *q); +void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx); diff --git a/block/blk-mq.c b/block/blk-mq.c index 72e744cd638c..cfb7c97b14ec 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2240,8 +2240,6 @@ void blk_mq_release(struct request_queue *q) struct blk_mq_hw_ctx *hctx; unsigned int i; - blk_mq_sched_teardown(q); - /* hctx kobj stays in hctx */ queue_for_each_hw_ctx(q, hctx, i) { if (!hctx) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index c44b321335f3..37f0b3ad635e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -816,7 +816,7 @@ static void blk_release_queue(struct kobject *kobj) if (q->elevator) { ioc_clear_queue(q); - elevator_exit(q->elevator); + elevator_exit(q, q->elevator); } blk_exit_rl(&q->root_rl); diff --git a/block/elevator.c b/block/elevator.c index f236ef1d2be9..dbeecf7be719 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -252,11 +252,11 @@ int elevator_init(struct request_queue *q, char *name) } EXPORT_SYMBOL(elevator_init); -void elevator_exit(struct elevator_queue *e) +void elevator_exit(struct request_queue *q, struct elevator_queue *e) { mutex_lock(&e->sysfs_lock); if (e->uses_mq && e->type->ops.mq.exit_sched) - e->type->ops.mq.exit_sched(e); + blk_mq_exit_sched(q, e); else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn) e->type->ops.sq.elevator_exit_fn(e); mutex_unlock(&e->sysfs_lock); @@ -941,6 +941,45 @@ void elv_unregister(struct elevator_type *e) } EXPORT_SYMBOL_GPL(elv_unregister); +static int elevator_switch_mq(struct request_queue *q, + struct 
elevator_type *new_e) +{ + int ret; + + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + + if (q->elevator) { + if (q->elevator->registered) + elv_unregister_queue(q); + ioc_clear_queue(q); + elevator_exit(q, q->elevator); + } + + ret = blk_mq_init_sched(q, new_e); + if (ret) + goto out; + + if (new_e) { + ret = elv_register_queue(q); + if (ret) { + elevator_exit(q, q->elevator); + goto out; + } + } + + if (new_e) + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); + else + blk_add_trace_msg(q, "elv switch: none"); + +out: + blk_mq_unfreeze_queue(q); + blk_mq_start_stopped_hw_queues(q, true); + return ret; + +} + /* * switch to new_e io scheduler. be careful not to introduce deadlocks - * we don't free the old io scheduler, before we have allocated what we @@ -953,10 +992,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) bool old_registered = false; int err; - if (q->mq_ops) { - blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); - } + if (q->mq_ops) + return elevator_switch_mq(q, new_e); /* * Turn on BYPASS and drain all requests w/ elevator private data. @@ -968,11 +1005,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) if (old) { old_registered = old->registered; - if (old->uses_mq) - blk_mq_sched_teardown(q); - - if (!q->mq_ops) - blk_queue_bypass_start(q); + blk_queue_bypass_start(q); /* unregister and clear all auxiliary data of the old elevator */ if (old_registered) @@ -982,53 +1015,32 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) } /* allocate, init and register new elevator */ - if (q->mq_ops) - err = blk_mq_init_sched(q, new_e); - else - err = new_e->ops.sq.elevator_init_fn(q, new_e); + err = new_e->ops.sq.elevator_init_fn(q, new_e); if (err) goto fail_init; - if (new_e) { - err = elv_register_queue(q); - if (err) - goto fail_register; - } + err = elv_register_queue(q); + if (err) + goto fail_register; /* done, kill the old one and finish */ if (old) { - elevator_exit(old); - if (!q->mq_ops) - blk_queue_bypass_end(q); + elevator_exit(q, old); + blk_queue_bypass_end(q); } - if (q->mq_ops) { - blk_mq_unfreeze_queue(q); - blk_mq_start_stopped_hw_queues(q, true); - } - - if (new_e) - blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); - else - blk_add_trace_msg(q, "elv switch: none"); + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); return 0; fail_register: - if (q->mq_ops) - blk_mq_sched_teardown(q); - elevator_exit(q->elevator); + elevator_exit(q, q->elevator); fail_init: /* switch failed, restore and re-register old elevator */ if (old) { q->elevator = old; elv_register_queue(q); - if (!q->mq_ops) - blk_queue_bypass_end(q); - } - if (q->mq_ops) { - blk_mq_unfreeze_queue(q); - blk_mq_start_stopped_hw_queues(q, true); + blk_queue_bypass_end(q); } return err; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index aebecc4ed088..22d39e8d4de1 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -211,7 +211,7 @@ extern ssize_t elv_iosched_show(struct request_queue *, char *); extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t); extern int elevator_init(struct request_queue *, char *); -extern void elevator_exit(struct elevator_queue *); +extern void elevator_exit(struct request_queue *, struct elevator_queue *); extern int elevator_change(struct request_queue *, const char *); extern bool elv_bio_merge_ok(struct request *, struct bio *); extern struct elevator_queue 
*elevator_alloc(struct request_queue *, -- cgit v1.2.3 From dbde775cdbf5e401b8739f30c87d1af12c0028db Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 7 Apr 2017 11:10:44 +1000 Subject: block: simple improvements for bio->flags The comment for the 'flags' field of 'bio' mentions "command" which is no longer stored there, and doesn't mention the bvec pool number, which is. BIO_RESET_BITS is set in such a way that it would need to be updated if new bits were added, which is easy to miss. BVEC_POOL_BITS is larger than needed. The BVEC_POOL_IDX() ranges from 0 to 6, so 3 bits are sufficient. This patch makes improvements in each of these areas. Signed-off-by: NeilBrown Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 67bcf8a5326e..1ebbc289b642 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -33,7 +33,7 @@ struct bio { * top bits REQ_OP. Use * accessors. */ - unsigned short bi_flags; /* status, command, etc */ + unsigned short bi_flags; /* status, etc and bvec pool number */ unsigned short bi_ioprio; struct bvec_iter bi_iter; @@ -110,12 +110,7 @@ struct bio { #define BIO_REFFED 8 /* bio has elevated ->bi_cnt */ #define BIO_THROTTLED 9 /* This bio has already been subjected to * throttling rules. Don't do it again. */ - -/* - * Flags starting here get preserved by bio_reset() - this includes - * BVEC_POOL_IDX() - */ -#define BIO_RESET_BITS 10 +/* See BVEC_POOL_OFFSET below before adding new flags */ /* * We support 6 different bvec pools, the last one is magic in that it @@ -125,13 +120,22 @@ struct bio { #define BVEC_POOL_MAX (BVEC_POOL_NR - 1) /* - * Top 4 bits of bio flags indicate the pool the bvecs came from. We add + * Top 3 bits of bio flags indicate the pool the bvecs came from. We add * 1 to the actual index so that 0 indicates that there are no bvecs to be * freed. */ -#define BVEC_POOL_BITS (4) +#define BVEC_POOL_BITS (3) #define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS) #define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET) +#if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1) +# error "BVEC_POOL_BITS is too small" +#endif + +/* + * Flags starting here get preserved by bio_reset() - this includes + * only BVEC_POOL_IDX() + */ +#define BIO_RESET_BITS BVEC_POOL_OFFSET /* * Operations and flags common to the bio and request structures. -- cgit v1.2.3 From fbbaf700e7b163a0f1704b2d542ee28be11fce21 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 7 Apr 2017 09:40:52 -0600 Subject: block: trace completion of all bios. Currently only dm and md/raid5 bios trigger trace_block_bio_complete(). Now that we have bio_chain() and bio_inc_remaining(), it is not possible, in general, for a driver to know when the bio is really complete. Only bio_endio() knows that. So move the trace_block_bio_complete() call to bio_endio(). Now trace_block_bio_complete() pairs with trace_block_bio_queue(). Any bio for which a 'queue' event is traced, will subsequently generate a 'complete' event. There are a few cases where completion tracing is not wanted. 1/ If blk_update_request() has already generated a completion trace event at the 'request' level, there is no point generating one at the bio level too.
In this case the bi_sector and bi_size will have changed, so the bio level event would be wrong. 2/ If the bio hasn't actually been queued yet, but is being aborted early, then a trace event could be confusing. Some filesystems call bio_endio() but do not want tracing. 3/ The bio_integrity code interposes itself by replacing bi_end_io, then restoring it and calling bio_endio() again. This would produce two identical trace events if left like that. To handle these, we introduce a flag BIO_TRACE_COMPLETION and only produce the trace event when this is set. We address point 1 above by clearing the flag in blk_update_request(). We address point 2 above by only setting the flag when generic_make_request() is called. We address point 3 above by clearing the flag after generating a completion event. When bio_split() is used on a bio, particularly in blk_queue_split(), there is an extra complication. A new bio is split off the front, and may be handled directly without going through generic_make_request(). The old bio, which has been advanced, is passed to generic_make_request(), so it will trigger a trace event a second time. Probably the best result when a split happens is to see a single 'queue' event for the whole bio, then multiple 'complete' events - one for each component. To achieve this we can: - copy the BIO_TRACE_COMPLETION flag to the new bio in bio_split() - avoid generating a 'queue' event if BIO_TRACE_COMPLETION is already set. This way, the split-off bio won't create a queue event, and the original won't either, even if it is re-submitted to generic_make_request(), but both will produce completion events, each for their own range. So if generic_make_request() is called (which generates a QUEUED event), then bio_endio() will create a single COMPLETE event for each range that the bio is split into, unless the driver has explicitly requested it not to. Signed-off-by: NeilBrown Signed-off-by: Jens Axboe --- block/bio.c | 14 ++++++++++++++ block/blk-core.c | 10 +++++++++- drivers/md/dm.c | 1 - drivers/md/raid5.c | 2 -- include/linux/blk_types.h | 2 ++ 5 files changed, 25 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index f1857c0f0826..f4d207180266 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1826,6 +1826,11 @@ static inline bool bio_remaining_done(struct bio *bio) * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred * way to end I/O on a bio. No one should call bi_end_io() directly on a * bio unless they own it and thus know that it has an end_io function. + * + * bio_endio() can be called several times on a bio that has been chained + * using bio_chain(). The ->bi_end_io() function will only be called the + * last time. At this point the BLK_TA_COMPLETE tracing event will be + * generated if BIO_TRACE_COMPLETION is set.
**/ void bio_endio(struct bio *bio) { @@ -1846,6 +1851,12 @@ again: goto again; } + if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), + bio, bio->bi_error); + bio_clear_flag(bio, BIO_TRACE_COMPLETION); + } + blk_throtl_bio_endio(bio); if (bio->bi_end_io) bio->bi_end_io(bio); @@ -1885,6 +1896,9 @@ struct bio *bio_split(struct bio *bio, int sectors, bio_advance(bio, split->bi_iter.bi_size); + if (bio_flagged(bio, BIO_TRACE_COMPLETION)) + bio_set_flag(split, BIO_TRACE_COMPLETION); + return split; } EXPORT_SYMBOL(bio_split); diff --git a/block/blk-core.c b/block/blk-core.c index 316a5399fb15..8654aa0cef6d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1957,7 +1957,13 @@ generic_make_request_checks(struct bio *bio) if (!blkcg_bio_issue_check(q, bio)) return false; - trace_block_bio_queue(q, bio); + if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_queue(q, bio); + /* Now that enqueuing has been traced, we need to trace + * completion as well. + */ + bio_set_flag(bio, BIO_TRACE_COMPLETION); + } return true; not_supported: @@ -2622,6 +2628,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) if (bio_bytes == bio->bi_iter.bi_size) req->bio = bio->bi_next; + /* Completion has already been traced */ + bio_clear_flag(bio, BIO_TRACE_COMPLETION); req_bio_endio(req, bio, bio_bytes, error); total_bytes += bio_bytes; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index dfb75979e455..cd93a3b9ceca 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -810,7 +810,6 @@ static void dec_pending(struct dm_io *io, int error) queue_io(md, bio); } else { /* done with normal IO or empty flush */ - trace_block_bio_complete(md->queue, bio, io_error); bio->bi_error = io_error; bio_endio(bio); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ed5cd705b985..7aeb9691c2e1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5031,8 +5031,6 @@ static void raid5_align_endio(struct bio *bi) rdev_dec_pending(rdev, conf->mddev); if (!error) { - trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), - raid_bi, 0); bio_endio(raid_bi); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_quiescent); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1ebbc289b642..72aa9519167e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -110,6 +110,8 @@ struct bio { #define BIO_REFFED 8 /* bio has elevated ->bi_cnt */ #define BIO_THROTTLED 9 /* This bio has already been subjected to * throttling rules. Don't do it again. */ +#define BIO_TRACE_COMPLETION 10 /* bio_endio() should trace the final completion * of this bio. */ /* See BVEC_POOL_OFFSET below before adding new flags */ /* -- cgit v1.2.3 From 7587a5ae7eef0439f7be31f1b5959af062bbc5ec Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 7 Apr 2017 11:16:52 -0700 Subject: blk-mq: Introduce blk_mq_delay_run_hw_queue() Introduce a function that runs a hardware queue unconditionally after a delay. Note: there is already a function that stops and restarts a hardware queue after a delay, namely blk_mq_delay_queue(). This function will be used in the next patch in this series. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Long Li Cc: K. Y.
Srinivasan Signed-off-by: Jens Axboe --- block/blk-mq.c | 32 ++++++++++++++++++++++++++++++-- include/linux/blk-mq.h | 2 ++ 2 files changed, 32 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index f1be5c3eb600..ad45ae743186 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1135,7 +1135,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) return hctx->next_cpu; } -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, + unsigned long msecs) { if (unlikely(blk_mq_hctx_stopped(hctx) || !blk_mq_hw_queue_mapped(hctx))) @@ -1152,7 +1153,24 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) put_cpu(); } - kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work); + if (msecs == 0) + kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), + &hctx->run_work); + else + kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), + &hctx->delayed_run_work, + msecs_to_jiffies(msecs)); +} + +void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) +{ + __blk_mq_delay_run_hw_queue(hctx, true, msecs); +} +EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); + +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +{ + __blk_mq_delay_run_hw_queue(hctx, async, 0); } void blk_mq_run_hw_queues(struct request_queue *q, bool async) @@ -1255,6 +1273,15 @@ static void blk_mq_run_work_fn(struct work_struct *work) __blk_mq_run_hw_queue(hctx); } +static void blk_mq_delayed_run_work_fn(struct work_struct *work) +{ + struct blk_mq_hw_ctx *hctx; + + hctx = container_of(work, struct blk_mq_hw_ctx, delayed_run_work.work); + + __blk_mq_run_hw_queue(hctx); +} + static void blk_mq_delay_work_fn(struct work_struct *work) { struct blk_mq_hw_ctx *hctx; @@ -1962,6 +1989,7 @@ static int blk_mq_init_hctx(struct request_queue *q, node = hctx->numa_node = set->numa_node; INIT_WORK(&hctx->run_work, blk_mq_run_work_fn); + INIT_DELAYED_WORK(&hctx->delayed_run_work, blk_mq_delayed_run_work_fn); INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b296a9006117..9382c5da7a2e 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -51,6 +51,7 @@ struct blk_mq_hw_ctx { atomic_t nr_active; + struct delayed_work delayed_run_work; struct delayed_work delay_work; struct hlist_node cpuhp_dead; @@ -238,6 +239,7 @@ void blk_mq_stop_hw_queues(struct request_queue *q); void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); +void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, -- cgit v1.2.3 From 6d8c6c0f97ad8a3517c42b179c1dc8e77397d0e2 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 7 Apr 2017 12:40:09 -0600 Subject: blk-mq: Restart a single queue if tag sets are shared To improve scalability, if hardware queues are shared, restart a single hardware queue in round-robin fashion. Rename blk_mq_sched_restart_queues() to reflect the new semantics. 
Remove blk_mq_sched_mark_restart_queue() because this function has no callers. Remove flag QUEUE_FLAG_RESTART because this patch removes the code that uses this flag. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 63 ++++++++++++++++++++++++++++++++++++++++++-------- block/blk-mq-sched.h | 16 +------------ block/blk-mq.c | 2 +- include/linux/blkdev.h | 1 - 4 files changed, 55 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index e8c2ed654ef0..c974a1bbf4cb 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -318,25 +318,68 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return true; } -static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) { if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); - if (blk_mq_hctx_has_pending(hctx)) + if (blk_mq_hctx_has_pending(hctx)) { blk_mq_run_hw_queue(hctx, true); + return true; + } } + return false; } -void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx) -{ - struct request_queue *q = hctx->queue; - unsigned int i; +/** + * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list + * @pos: loop cursor. + * @skip: the list element that will not be examined. Iteration starts at + * @skip->next. + * @head: head of the list to examine. This list must have at least one + * element, namely @skip. + * @member: name of the list_head structure within typeof(*pos). + */ +#define list_for_each_entry_rcu_rr(pos, skip, head, member) \ + for ((pos) = (skip); \ + (pos = (pos)->member.next != (head) ? list_entry_rcu( \ + (pos)->member.next, typeof(*pos), member) : \ + list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \ + (pos) != (skip); ) - if (test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) { - if (test_and_clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) { - queue_for_each_hw_ctx(q, hctx, i) - blk_mq_sched_restart_hctx(hctx); +/* + * Called after a driver tag has been freed to check whether a hctx needs to + * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware + * queues in a round-robin fashion if the tag set of @hctx is shared with other + * hardware queues. 
+ */ +void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) +{ + struct blk_mq_tags *const tags = hctx->tags; + struct blk_mq_tag_set *const set = hctx->queue->tag_set; + struct request_queue *const queue = hctx->queue, *q; + struct blk_mq_hw_ctx *hctx2; + unsigned int i, j; + + if (set->flags & BLK_MQ_F_TAG_SHARED) { + rcu_read_lock(); + list_for_each_entry_rcu_rr(q, queue, &set->tag_list, + tag_set_list) { + queue_for_each_hw_ctx(q, hctx2, i) + if (hctx2->tags == tags && + blk_mq_sched_restart_hctx(hctx2)) + goto done; + } + j = hctx->queue_num + 1; + for (i = 0; i < queue->nr_hw_queues; i++, j++) { + if (j == queue->nr_hw_queues) + j = 0; + hctx2 = queue->queue_hw_ctx[j]; + if (hctx2->tags == tags && + blk_mq_sched_restart_hctx(hctx2)) + break; } +done: + rcu_read_unlock(); } else { blk_mq_sched_restart_hctx(hctx); } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index e704956e0862..3a9e6e40558b 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -19,7 +19,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, struct request **merged_request); bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); -void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx); +void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async, bool can_block); @@ -136,20 +136,6 @@ static inline void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } -/* - * Mark a hardware queue and the request queue it belongs to as needing a - * restart. - */ -static inline void blk_mq_sched_mark_restart_queue(struct blk_mq_hw_ctx *hctx) -{ - struct request_queue *q = hctx->queue; - - if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); - if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) - set_bit(QUEUE_FLAG_RESTART, &q->queue_flags); -} - static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) { return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); diff --git a/block/blk-mq.c b/block/blk-mq.c index ad45ae743186..572966f49596 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -348,7 +348,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); if (sched_tag != -1) blk_mq_sched_completed_request(hctx, rq); - blk_mq_sched_restart_queues(hctx); + blk_mq_sched_restart(hctx); blk_queue_exit(q); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5a7da607ca04..7548f332121a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -610,7 +610,6 @@ struct request_queue { #define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ #define QUEUE_FLAG_DAX 26 /* device supports DAX */ #define QUEUE_FLAG_STATS 27 /* track rq completion times */ -#define QUEUE_FLAG_RESTART 28 /* queue needs restart at completion */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- cgit v1.2.3 From ee056f98126170ca8b16b9a4a6e20aae7c5c184e Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 5 Apr 2017 12:01:34 -0700 Subject: blk-mq-sched: provide hooks for initializing hardware queue data Schedulers need to be informed when a hardware queue is added or removed at runtime so they can allocate/free per-hardware queue data. 
So, replace the blk_mq_sched_init_hctx_data() helper, which only makes sense at init time, with .init_hctx() and .exit_hctx() hooks. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 81 +++++++++++++++++++++++++----------------------- block/blk-mq-sched.h | 4 --- include/linux/elevator.h | 2 ++ 3 files changed, 45 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index c974a1bbf4cb..9e3c0f92851b 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -30,43 +30,6 @@ void blk_mq_sched_free_hctx_data(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); -int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size, - int (*init)(struct blk_mq_hw_ctx *), - void (*exit)(struct blk_mq_hw_ctx *)) -{ - struct blk_mq_hw_ctx *hctx; - int ret; - int i; - - queue_for_each_hw_ctx(q, hctx, i) { - hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node); - if (!hctx->sched_data) { - ret = -ENOMEM; - goto error; - } - - if (init) { - ret = init(hctx); - if (ret) { - /* - * We don't want to give exit() a partially - * initialized sched_data. init() must clean up - * if it fails. - */ - kfree(hctx->sched_data); - hctx->sched_data = NULL; - goto error; - } - } - } - - return 0; -error: - blk_mq_sched_free_hctx_data(q, exit); - return ret; -} -EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data); - static void __blk_mq_sched_assign_ioc(struct request_queue *q, struct request *rq, struct bio *bio, @@ -508,11 +471,24 @@ int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { struct elevator_queue *e = q->elevator; + int ret; if (!e) return 0; - return blk_mq_sched_alloc_tags(q, hctx, hctx_idx); + ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx); + if (ret) + return ret; + + if (e->type->ops.mq.init_hctx) { + ret = e->type->ops.mq.init_hctx(hctx, hctx_idx); + if (ret) { + blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx); + return ret; + } + } + + return 0; } void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, @@ -523,12 +499,18 @@ void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, if (!e) return; + if (e->type->ops.mq.exit_hctx && hctx->sched_data) { + e->type->ops.mq.exit_hctx(hctx, hctx_idx); + hctx->sched_data = NULL; + } + blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx); } int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { struct blk_mq_hw_ctx *hctx; + struct elevator_queue *eq; unsigned int i; int ret; @@ -553,6 +535,18 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) if (ret) goto err; + if (e->ops.mq.init_hctx) { + queue_for_each_hw_ctx(q, hctx, i) { + ret = e->ops.mq.init_hctx(hctx, i); + if (ret) { + eq = q->elevator; + blk_mq_exit_sched(q, eq); + kobject_put(&eq->kobj); + return ret; + } + } + } + return 0; err: @@ -563,6 +557,17 @@ err: void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) { + struct blk_mq_hw_ctx *hctx; + unsigned int i; + + if (e->type->ops.mq.exit_hctx) { + queue_for_each_hw_ctx(q, hctx, i) { + if (hctx->sched_data) { + e->type->ops.mq.exit_hctx(hctx, i); + hctx->sched_data = NULL; + } + } + } if (e->type->ops.mq.exit_sched) e->type->ops.mq.exit_sched(e); blk_mq_sched_tags_teardown(q); diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 3a9e6e40558b..f4bc186c3440 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -4,10 +4,6 @@ #include 
"blk-mq.h" #include "blk-mq-tag.h" -int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size, - int (*init)(struct blk_mq_hw_ctx *), - void (*exit)(struct blk_mq_hw_ctx *)); - void blk_mq_sched_free_hctx_data(struct request_queue *q, void (*exit)(struct blk_mq_hw_ctx *)); diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 22d39e8d4de1..b7ec315ee7e7 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -93,6 +93,8 @@ struct blk_mq_hw_ctx; struct elevator_mq_ops { int (*init_sched)(struct request_queue *, struct elevator_type *); void (*exit_sched)(struct elevator_queue *); + int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); + void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *); -- cgit v1.2.3 From 5d9854eaea776441b38a9a45b4e6879524c4f48c Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 7 Apr 2017 17:13:17 -0700 Subject: iio: hid-sensor: Store restore poll and hysteresis on S3 This change undo the change done by 'commit 3bec24747446 ("iio: hid-sensor-trigger: Change get poll value function order to avoid sensor properties losing after resume from S3")' as this breaks some USB/i2c sensor hubs. Instead of relying on HW for restoring poll and hysteresis, driver stores and restores on resume (S3). In this way user space modified settings are not lost for any kind of sensor hub behavior. In this change, whenever user space modifies sampling frequency or hysteresis driver will get the feature value from the hub and store in the per device hid_sensor_common data structure. On resume callback from S3, system will set the feature to sensor hub, if user space ever modified the feature value. 
Fixes: 3bec24747446 ("iio: hid-sensor-trigger: Change get poll value function order to avoid sensor properties losing after resume from S3") Reported-by: Ritesh Raj Sarraf Tested-by: Ritesh Raj Sarraf Tested-by: Song, Hongyan Cc: stable@vger.kernel.org Signed-off-by: Srinivas Pandruvada Signed-off-by: Jonathan Cameron --- .../iio/common/hid-sensors/hid-sensor-attributes.c | 26 ++++++++++++++++++++-- .../iio/common/hid-sensors/hid-sensor-trigger.c | 20 ++++++++++++++--- include/linux/hid-sensor-hub.h | 2 ++ 3 files changed, 43 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/common/hid-sensors/hid-sensor-attributes.c b/drivers/iio/common/hid-sensors/hid-sensor-attributes.c index 01e02b9926d4..ca742ac8f128 100644 --- a/drivers/iio/common/hid-sensors/hid-sensor-attributes.c +++ b/drivers/iio/common/hid-sensors/hid-sensor-attributes.c @@ -221,7 +221,15 @@ int hid_sensor_write_samp_freq_value(struct hid_sensor_common *st, if (ret < 0 || value < 0) ret = -EINVAL; - return ret; + ret = sensor_hub_get_feature(st->hsdev, + st->poll.report_id, + st->poll.index, sizeof(value), &value); + if (ret < 0 || value < 0) + return -EINVAL; + + st->poll_interval = value; + + return 0; } EXPORT_SYMBOL(hid_sensor_write_samp_freq_value); @@ -266,7 +274,16 @@ int hid_sensor_write_raw_hyst_value(struct hid_sensor_common *st, if (ret < 0 || value < 0) ret = -EINVAL; - return ret; + ret = sensor_hub_get_feature(st->hsdev, + st->sensitivity.report_id, + st->sensitivity.index, sizeof(value), + &value); + if (ret < 0 || value < 0) + return -EINVAL; + + st->raw_hystersis = value; + + return 0; } EXPORT_SYMBOL(hid_sensor_write_raw_hyst_value); @@ -369,6 +386,9 @@ int hid_sensor_get_reporting_interval(struct hid_sensor_hub_device *hsdev, /* Default unit of measure is milliseconds */ if (st->poll.units == 0) st->poll.units = HID_USAGE_SENSOR_UNITS_MILLISECOND; + + st->poll_interval = -1; + return 0; } @@ -399,6 +419,8 @@ int hid_sensor_parse_common_attributes(struct hid_sensor_hub_device *hsdev, HID_USAGE_SENSOR_PROP_SENSITIVITY_ABS, &st->sensitivity); + st->raw_hystersis = -1; + sensor_hub_input_get_attribute_info(hsdev, HID_INPUT_REPORT, usage_id, HID_USAGE_SENSOR_TIME_TIMESTAMP, diff --git a/drivers/iio/common/hid-sensors/hid-sensor-trigger.c b/drivers/iio/common/hid-sensors/hid-sensor-trigger.c index ecf592d69043..60829340a82e 100644 --- a/drivers/iio/common/hid-sensors/hid-sensor-trigger.c +++ b/drivers/iio/common/hid-sensors/hid-sensor-trigger.c @@ -51,6 +51,8 @@ static int _hid_sensor_power_state(struct hid_sensor_common *st, bool state) st->report_state.report_id, st->report_state.index, HID_USAGE_SENSOR_PROP_REPORTING_STATE_ALL_EVENTS_ENUM); + + poll_value = hid_sensor_read_poll_value(st); } else { int val; @@ -87,9 +89,7 @@ static int _hid_sensor_power_state(struct hid_sensor_common *st, bool state) sensor_hub_get_feature(st->hsdev, st->power_state.report_id, st->power_state.index, sizeof(state_val), &state_val); - if (state) - poll_value = hid_sensor_read_poll_value(st); - if (poll_value > 0) + if (state && poll_value) msleep_interruptible(poll_value * 2); return 0; @@ -127,6 +127,20 @@ static void hid_sensor_set_power_work(struct work_struct *work) struct hid_sensor_common *attrb = container_of(work, struct hid_sensor_common, work); + + if (attrb->poll_interval >= 0) + sensor_hub_set_feature(attrb->hsdev, attrb->poll.report_id, + attrb->poll.index, + sizeof(attrb->poll_interval), + &attrb->poll_interval); + + if (attrb->raw_hystersis >= 0) + sensor_hub_set_feature(attrb->hsdev, 
+ attrb->sensitivity.report_id, + attrb->sensitivity.index, + sizeof(attrb->raw_hystersis), + &attrb->raw_hystersis); + _hid_sensor_power_state(attrb, true); } diff --git a/include/linux/hid-sensor-hub.h b/include/linux/hid-sensor-hub.h index 7ef111d3ecc5..f32d7c392c1e 100644 --- a/include/linux/hid-sensor-hub.h +++ b/include/linux/hid-sensor-hub.h @@ -231,6 +231,8 @@ struct hid_sensor_common { unsigned usage_id; atomic_t data_ready; atomic_t user_requested_state; + int poll_interval; + int raw_hystersis; struct iio_trigger *trigger; int timestamp_ns_scale; struct hid_sensor_hub_attribute_info poll; -- cgit v1.2.3 From 42d5ec954719917e2b7a9160fe05d2316eece5bf Mon Sep 17 00:00:00 2001 From: Alan Tull Date: Thu, 23 Mar 2017 19:34:27 -0500 Subject: fpga: add config complete timeout Add a timeout for the maximum time allowed for the FPGA to go to operating mode after an FPGA region has been programmed. Signed-off-by: Alan Tull Signed-off-by: Greg Kroah-Hartman --- drivers/fpga/fpga-region.c | 3 +++ include/linux/fpga/fpga-mgr.h | 3 +++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/drivers/fpga/fpga-region.c b/drivers/fpga/fpga-region.c index 2fe2a52c66ca..ae4c61aada99 100644 --- a/drivers/fpga/fpga-region.c +++ b/drivers/fpga/fpga-region.c @@ -385,6 +385,9 @@ static int fpga_region_notify_pre_apply(struct fpga_region *region, of_property_read_u32(nd->overlay, "region-freeze-timeout-us", &info->disable_timeout_us); + of_property_read_u32(nd->overlay, "config-complete-timeout-us", + &info->config_complete_timeout_us); + /* If FPGA was externally programmed, don't specify firmware */ if ((info->flags & FPGA_MGR_EXTERNAL_CONFIG) && firmware_name) { pr_err("error: specified firmware and external-fpga-config"); diff --git a/include/linux/fpga/fpga-mgr.h b/include/linux/fpga/fpga-mgr.h index e2ef94fd37af..b4ac24c4411d 100644 --- a/include/linux/fpga/fpga-mgr.h +++ b/include/linux/fpga/fpga-mgr.h @@ -77,11 +77,14 @@ enum fpga_mgr_states { * @flags: boolean flags as defined above * @enable_timeout_us: maximum time to enable traffic through bridge (uSec) * @disable_timeout_us: maximum time to disable traffic through bridge (uSec) + * @config_complete_timeout_us: maximum time for FPGA to switch to operating + * status in the write_complete op. */ struct fpga_image_info { u32 flags; u32 enable_timeout_us; u32 disable_timeout_us; + u32 config_complete_timeout_us; }; /** -- cgit v1.2.3 From d201cc17a8a31cc6c4f3944988fe9e2f04b021fb Mon Sep 17 00:00:00 2001 From: Matthew Gerlach Date: Thu, 23 Mar 2017 19:34:28 -0500 Subject: fpga pr ip: Core driver support for Altera Partial Reconfiguration IP. Add the core functions necessary for an fpga-mgr driver for the Altera Partial Reconfiguration IP component. These functions are intended to be used by the various bus implementations, such as the platform bus or the PCIe bus.
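For instance, platform-bus glue might look roughly like the following hedged sketch; the driver name, compatible string, and DT details are assumptions for illustration, and only alt_pr_register()/alt_pr_unregister() come from the code added here:

#include <linux/fpga/altera-pr-ip-core.h>
#include <linux/io.h>
#include <linux/module.h>
#include <linux/of.h>
#include <linux/platform_device.h>

static int alt_pr_platform_probe(struct platform_device *pdev)
{
	struct resource *res;
	void __iomem *reg_base;

	/* Map the PR IP register window and hand it to the core driver. */
	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	reg_base = devm_ioremap_resource(&pdev->dev, res);
	if (IS_ERR(reg_base))
		return PTR_ERR(reg_base);

	return alt_pr_register(&pdev->dev, reg_base);
}

static int alt_pr_platform_remove(struct platform_device *pdev)
{
	return alt_pr_unregister(&pdev->dev);
}

static const struct of_device_id alt_pr_platform_of_match[] = {
	{ .compatible = "altr,a10-pr-ip", },	/* assumed binding */
	{},
};

static struct platform_driver alt_pr_platform_driver = {
	.probe = alt_pr_platform_probe,
	.remove = alt_pr_platform_remove,
	.driver = {
		.name = "alt-pr-platform",
		.of_match_table = alt_pr_platform_of_match,
	},
};
module_platform_driver(alt_pr_platform_driver);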
Signed-off-by: Matthew Gerlach Acked-by: Alan Tull Signed-off-by: Greg Kroah-Hartman --- drivers/fpga/Kconfig | 5 + drivers/fpga/Makefile | 1 + drivers/fpga/altera-pr-ip-core.c | 220 +++++++++++++++++++++++++++++++++ include/linux/fpga/altera-pr-ip-core.h | 29 +++++ 4 files changed, 255 insertions(+) create mode 100644 drivers/fpga/altera-pr-ip-core.c create mode 100644 include/linux/fpga/altera-pr-ip-core.h (limited to 'include/linux') diff --git a/drivers/fpga/Kconfig b/drivers/fpga/Kconfig index c81cb7d9fbd5..e2cc0ad5656c 100644 --- a/drivers/fpga/Kconfig +++ b/drivers/fpga/Kconfig @@ -83,6 +83,11 @@ config ALTERA_FREEZE_BRIDGE isolate one region of the FPGA from the busses while that region is being reprogrammed. +config ALTERA_PR_IP_CORE + tristate "Altera Partial Reconfiguration IP Core" + help + Core driver support for Altera Partial Reconfiguration IP component + endif # FPGA endmenu diff --git a/drivers/fpga/Makefile b/drivers/fpga/Makefile index c6f5d740b946..968fd51cb619 100644 --- a/drivers/fpga/Makefile +++ b/drivers/fpga/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_FPGA_MGR_SOCFPGA_A10) += socfpga-a10.o obj-$(CONFIG_FPGA_MGR_TS73XX) += ts73xx-fpga.o obj-$(CONFIG_FPGA_MGR_XILINX_SPI) += xilinx-spi.o obj-$(CONFIG_FPGA_MGR_ZYNQ_FPGA) += zynq-fpga.o +obj-$(CONFIG_ALTERA_PR_IP_CORE) += altera-pr-ip-core.o # FPGA Bridge Drivers obj-$(CONFIG_FPGA_BRIDGE) += fpga-bridge.o diff --git a/drivers/fpga/altera-pr-ip-core.c b/drivers/fpga/altera-pr-ip-core.c new file mode 100644 index 000000000000..a7b31f9797ce --- /dev/null +++ b/drivers/fpga/altera-pr-ip-core.c @@ -0,0 +1,220 @@ +/* + * Driver for Altera Partial Reconfiguration IP Core + * + * Copyright (C) 2016-2017 Intel Corporation + * + * Based on socfpga-a10.c Copyright (C) 2015-2016 Altera Corporation + * by Alan Tull + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . 
+ */ +#include +#include +#include +#include + +#define ALT_PR_DATA_OFST 0x00 +#define ALT_PR_CSR_OFST 0x04 + +#define ALT_PR_CSR_PR_START BIT(0) +#define ALT_PR_CSR_STATUS_SFT 2 +#define ALT_PR_CSR_STATUS_MSK (7 << ALT_PR_CSR_STATUS_SFT) +#define ALT_PR_CSR_STATUS_NRESET (0 << ALT_PR_CSR_STATUS_SFT) +#define ALT_PR_CSR_STATUS_PR_ERR (1 << ALT_PR_CSR_STATUS_SFT) +#define ALT_PR_CSR_STATUS_CRC_ERR (2 << ALT_PR_CSR_STATUS_SFT) +#define ALT_PR_CSR_STATUS_BAD_BITS (3 << ALT_PR_CSR_STATUS_SFT) +#define ALT_PR_CSR_STATUS_PR_IN_PROG (4 << ALT_PR_CSR_STATUS_SFT) +#define ALT_PR_CSR_STATUS_PR_SUCCESS (5 << ALT_PR_CSR_STATUS_SFT) + +struct alt_pr_priv { + void __iomem *reg_base; +}; + +static enum fpga_mgr_states alt_pr_fpga_state(struct fpga_manager *mgr) +{ + struct alt_pr_priv *priv = mgr->priv; + const char *err = "unknown"; + enum fpga_mgr_states ret = FPGA_MGR_STATE_UNKNOWN; + u32 val; + + val = readl(priv->reg_base + ALT_PR_CSR_OFST); + + val &= ALT_PR_CSR_STATUS_MSK; + + switch (val) { + case ALT_PR_CSR_STATUS_NRESET: + return FPGA_MGR_STATE_RESET; + + case ALT_PR_CSR_STATUS_PR_ERR: + err = "pr error"; + ret = FPGA_MGR_STATE_WRITE_ERR; + break; + + case ALT_PR_CSR_STATUS_CRC_ERR: + err = "crc error"; + ret = FPGA_MGR_STATE_WRITE_ERR; + break; + + case ALT_PR_CSR_STATUS_BAD_BITS: + err = "bad bits"; + ret = FPGA_MGR_STATE_WRITE_ERR; + break; + + case ALT_PR_CSR_STATUS_PR_IN_PROG: + return FPGA_MGR_STATE_WRITE; + + case ALT_PR_CSR_STATUS_PR_SUCCESS: + return FPGA_MGR_STATE_OPERATING; + + default: + break; + } + + dev_err(&mgr->dev, "encountered error code %d (%s) in %s()\n", + val, err, __func__); + return ret; +} + +static int alt_pr_fpga_write_init(struct fpga_manager *mgr, + struct fpga_image_info *info, + const char *buf, size_t count) +{ + struct alt_pr_priv *priv = mgr->priv; + u32 val; + + if (!(info->flags & FPGA_MGR_PARTIAL_RECONFIG)) { + dev_err(&mgr->dev, "%s Partial Reconfiguration flag not set\n", + __func__); + return -EINVAL; + } + + val = readl(priv->reg_base + ALT_PR_CSR_OFST); + + if (val & ALT_PR_CSR_PR_START) { + dev_err(&mgr->dev, + "%s Partial Reconfiguration already started\n", + __func__); + return -EINVAL; + } + + writel(val | ALT_PR_CSR_PR_START, priv->reg_base + ALT_PR_CSR_OFST); + + return 0; +} + +static int alt_pr_fpga_write(struct fpga_manager *mgr, const char *buf, + size_t count) +{ + struct alt_pr_priv *priv = mgr->priv; + u32 *buffer_32 = (u32 *)buf; + size_t i = 0; + + if (count <= 0) + return -EINVAL; + + /* Write out the complete 32-bit chunks */ + while (count >= sizeof(u32)) { + writel(buffer_32[i++], priv->reg_base); + count -= sizeof(u32); + } + + /* Write out remaining non 32-bit chunks */ + switch (count) { + case 3: + writel(buffer_32[i++] & 0x00ffffff, priv->reg_base); + break; + case 2: + writel(buffer_32[i++] & 0x0000ffff, priv->reg_base); + break; + case 1: + writel(buffer_32[i++] & 0x000000ff, priv->reg_base); + break; + case 0: + break; + default: + /* This will never happen */ + return -EFAULT; + } + + if (alt_pr_fpga_state(mgr) == FPGA_MGR_STATE_WRITE_ERR) + return -EIO; + + return 0; +} + +static int alt_pr_fpga_write_complete(struct fpga_manager *mgr, + struct fpga_image_info *info) +{ + u32 i = 0; + + do { + switch (alt_pr_fpga_state(mgr)) { + case FPGA_MGR_STATE_WRITE_ERR: + return -EIO; + + case FPGA_MGR_STATE_OPERATING: + dev_info(&mgr->dev, + "successful partial reconfiguration\n"); + return 0; + + default: + break; + } + udelay(1); + } while (info->config_complete_timeout_us > i++); + + dev_err(&mgr->dev, "timed out waiting for write 
to complete\n"); + return -ETIMEDOUT; +} + +static const struct fpga_manager_ops alt_pr_ops = { + .state = alt_pr_fpga_state, + .write_init = alt_pr_fpga_write_init, + .write = alt_pr_fpga_write, + .write_complete = alt_pr_fpga_write_complete, +}; + +int alt_pr_register(struct device *dev, void __iomem *reg_base) +{ + struct alt_pr_priv *priv; + u32 val; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->reg_base = reg_base; + + val = readl(priv->reg_base + ALT_PR_CSR_OFST); + + dev_dbg(dev, "%s status=%d start=%d\n", __func__, + (val & ALT_PR_CSR_STATUS_MSK) >> ALT_PR_CSR_STATUS_SFT, + (int)(val & ALT_PR_CSR_PR_START)); + + return fpga_mgr_register(dev, dev_name(dev), &alt_pr_ops, priv); +} +EXPORT_SYMBOL_GPL(alt_pr_register); + +int alt_pr_unregister(struct device *dev) +{ + dev_dbg(dev, "%s\n", __func__); + + fpga_mgr_unregister(dev); + + return 0; +} +EXPORT_SYMBOL_GPL(alt_pr_unregister); + +MODULE_AUTHOR("Matthew Gerlach "); +MODULE_DESCRIPTION("Altera Partial Reconfiguration IP Core"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/fpga/altera-pr-ip-core.h b/include/linux/fpga/altera-pr-ip-core.h new file mode 100644 index 000000000000..3810a9033f49 --- /dev/null +++ b/include/linux/fpga/altera-pr-ip-core.h @@ -0,0 +1,29 @@ +/* + * Driver for Altera Partial Reconfiguration IP Core + * + * Copyright (C) 2016 Intel Corporation + * + * Based on socfpga-a10.c Copyright (C) 2015-2016 Altera Corporation + * by Alan Tull + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef _ALT_PR_IP_CORE_H +#define _ALT_PR_IP_CORE_H +#include + +int alt_pr_register(struct device *dev, void __iomem *reg_base); +int alt_pr_unregister(struct device *dev); + +#endif /* _ALT_PR_IP_CORE_H */ -- cgit v1.2.3 From 895ce6c877cb0282a7dc2178f3643e6635716a2a Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sat, 11 Mar 2017 08:44:57 +0800 Subject: debugfs: set no_llseek in DEFINE_DEBUGFS_ATTRIBUTE In DEFINE_DEBUGFS_ATTRIBUTE() macro, since we use nonseekable_open() to open, we should use no_llseek() to seek, not generic_file_llseek(). 
Signed-off-by: Geliang Tang Signed-off-by: Greg Kroah-Hartman --- include/linux/debugfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 7dff776e6d16..9174b0d28582 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -74,7 +74,7 @@ static const struct file_operations __fops = { \ .release = simple_attr_release, \ .read = debugfs_attr_read, \ .write = debugfs_attr_write, \ - .llseek = generic_file_llseek, \ + .llseek = no_llseek, \ } #if defined(CONFIG_DEBUG_FS) -- cgit v1.2.3 From 171058fb0883247b3a484a542b5dc89753c57cb5 Mon Sep 17 00:00:00 2001 From: Michal Sojka Date: Thu, 16 Mar 2017 14:50:08 +0100 Subject: uio: Allow handling of non page-aligned memory regions Since commit b65502879556 ("uio: we cannot mmap unaligned page contents") addresses and sizes of UIO memory regions must be page-aligned. If the address in the BAR register is not page-aligned (which is the case of the mf264 card), the mentioned commit forces the UIO driver to round the address down to a page boundary. Then, there is no easy way for user-space to learn the offset of the actual memory region within the page, because the offset seen in /sys/class/uio/uio?/maps/map?/offset is calculated from the rounded address and thus it is always zero. Fix that problem by including the offset in struct uio_mem. UIO drivers can set this field and userspace can read its value from /sys/class/uio/uio?/maps/map?/offset. The following commits update the uio_mf264 driver to set this new offs field. Drivers for hardware with page-aligned BARs need not be modified, provided that they initialize struct uio_info (which contains uio_mem) with zeros. Signed-off-by: Michal Sojka Signed-off-by: Greg Kroah-Hartman --- drivers/uio/uio.c | 2 +- include/linux/uio_driver.h | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 60ce7fd54e89..1c196f87e9d9 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -66,7 +66,7 @@ static ssize_t map_size_show(struct uio_mem *mem, char *buf) static ssize_t map_offset_show(struct uio_mem *mem, char *buf) { - return sprintf(buf, "0x%llx\n", (unsigned long long)mem->addr & ~PAGE_MASK); + return sprintf(buf, "0x%llx\n", (unsigned long long)mem->offs); } struct map_sysfs_entry { diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index 32c0e83d6239..3c85c81b0027 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -23,11 +23,13 @@ struct uio_map; /** * struct uio_mem - description of a UIO memory region * @name: name of the memory region for identification - * @addr: address of the device's memory (phys_addr is used since - * addr can be logical, virtual, or physical & phys_addr_t - * should always be large enough to handle any of the - * address types) - * @size: size of IO + * @addr: address of the device's memory rounded to page + * size (phys_addr is used since addr can be + * logical, virtual, or physical & phys_addr_t + * should always be large enough to handle any of + * the address types) + * @offs: offset of device memory within the page + * @size: size of IO (multiple of page size) * @memtype: type of memory addr points to * @internal_addr: ioremap-ped version of addr, for driver internal use * @map: for use by the UIO core only.
@@ -35,6 +37,7 @@ struct uio_map; struct uio_mem { const char *name; phys_addr_t addr; + unsigned long offs; resource_size_t size; int memtype; void __iomem *internal_addr; -- cgit v1.2.3 From 808a8b73772c6ac7d999c0508d2f757831cd83ca Mon Sep 17 00:00:00 2001 From: Quentin Schulz Date: Wed, 5 Apr 2017 11:06:30 +0200 Subject: iio: adc: sun4i-gpadc-iio: add support for A33 thermal sensor This adds support for the Allwinner A33 thermal sensor. Unlike the A10, A13 and A31, the Allwinner A33 only has one channel, which is dedicated to the thermal sensor. Moreover, its thermal sensor does not generate interrupts, so we only need to read the register storing the temperature value directly. The MFD used by the A10, A13 and A31 was created to avoid breaking the DT binding, but since the ADC nodes weren't there for the A33, it is not needed. Though the A33 does not have an internal ADC, it has a thermal sensor which shares the same registers with the GPADC of the already supported SoCs, and almost the same bits, for the same purpose (thermal sensing). The thermal sensor behaves exactly the same on the different SoCs, apart from the presence or absence of interrupts. Signed-off-by: Quentin Schulz Acked-by: Lee Jones Signed-off-by: Jonathan Cameron --- drivers/iio/adc/Kconfig | 2 +- drivers/iio/adc/sun4i-gpadc-iio.c | 100 ++++++++++++++++++++++++++++++++++++-- include/linux/mfd/sun4i-gpadc.h | 4 ++ 3 files changed, 102 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/adc/Kconfig b/drivers/iio/adc/Kconfig index 1e25ef009130..f924fa0d4fd6 100644 --- a/drivers/iio/adc/Kconfig +++ b/drivers/iio/adc/Kconfig @@ -604,7 +604,7 @@ config STX104 config SUN4I_GPADC tristate "Support for the Allwinner SoCs GPADC" depends on IIO - depends on MFD_SUN4I_GPADC + depends on MFD_SUN4I_GPADC || MACH_SUN8I depends on THERMAL || !THERMAL_OF help Say yes here to build support for Allwinner (A10, A13 and A31) SoCs diff --git a/drivers/iio/adc/sun4i-gpadc-iio.c b/drivers/iio/adc/sun4i-gpadc-iio.c index 4c0167562965..b23527309088 100644 --- a/drivers/iio/adc/sun4i-gpadc-iio.c +++ b/drivers/iio/adc/sun4i-gpadc-iio.c @@ -85,6 +85,12 @@ static const struct gpadc_data sun6i_gpadc_data = { .adc_chan_mask = SUN6I_GPADC_CTRL1_ADC_CHAN_MASK, }; +static const struct gpadc_data sun8i_a33_gpadc_data = { + .temp_offset = -1662, + .temp_scale = 162, + .tp_mode_en = SUN8I_GPADC_CTRL1_CHOP_TEMP_EN, +}; + struct sun4i_gpadc_iio { struct iio_dev *indio_dev; struct completion completion; @@ -96,6 +102,7 @@ struct sun4i_gpadc_iio { unsigned int temp_data_irq; atomic_t ignore_temp_data_irq; const struct gpadc_data *data; + bool no_irq; /* prevents concurrent reads of temperature and ADC */ struct mutex mutex; }; @@ -138,6 +145,23 @@ static const struct iio_chan_spec sun4i_gpadc_channels_no_temp[] = { SUN4I_GPADC_ADC_CHANNEL(3, "adc_chan3"), }; +static const struct iio_chan_spec sun8i_a33_gpadc_channels[] = { + { + .type = IIO_TEMP, + .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | + BIT(IIO_CHAN_INFO_SCALE) | + BIT(IIO_CHAN_INFO_OFFSET), + .datasheet_name = "temp_adc", + }, +}; + +static const struct regmap_config sun4i_gpadc_regmap_config = { + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, + .fast_io = true, +}; + static int sun4i_prepare_for_irq(struct iio_dev *indio_dev, int channel, unsigned int irq) { @@ -247,6 +271,17 @@ static int sun4i_gpadc_temp_read(struct iio_dev *indio_dev, int *val) { struct sun4i_gpadc_iio *info = iio_priv(indio_dev); + if (info->no_irq) { +
pm_runtime_get_sync(indio_dev->dev.parent); + + regmap_read(info->regmap, SUN4I_GPADC_TEMP_DATA, val); + + pm_runtime_mark_last_busy(indio_dev->dev.parent); + pm_runtime_put_autosuspend(indio_dev->dev.parent); + + return 0; + } + return sun4i_gpadc_read(indio_dev, 0, val, info->temp_data_irq); } @@ -454,6 +489,58 @@ static int sun4i_irq_init(struct platform_device *pdev, const char *name, return 0; } +static const struct of_device_id sun4i_gpadc_of_id[] = { + { + .compatible = "allwinner,sun8i-a33-ths", + .data = &sun8i_a33_gpadc_data, + }, + { /* sentinel */ } +}; + +static int sun4i_gpadc_probe_dt(struct platform_device *pdev, + struct iio_dev *indio_dev) +{ + struct sun4i_gpadc_iio *info = iio_priv(indio_dev); + const struct of_device_id *of_dev; + struct thermal_zone_device *tzd; + struct resource *mem; + void __iomem *base; + int ret; + + of_dev = of_match_device(sun4i_gpadc_of_id, &pdev->dev); + if (!of_dev) + return -ENODEV; + + info->no_irq = true; + info->data = (struct gpadc_data *)of_dev->data; + indio_dev->num_channels = ARRAY_SIZE(sun8i_a33_gpadc_channels); + indio_dev->channels = sun8i_a33_gpadc_channels; + + mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); + base = devm_ioremap_resource(&pdev->dev, mem); + if (IS_ERR(base)) + return PTR_ERR(base); + + info->regmap = devm_regmap_init_mmio(&pdev->dev, base, + &sun4i_gpadc_regmap_config); + if (IS_ERR(info->regmap)) { + ret = PTR_ERR(info->regmap); + dev_err(&pdev->dev, "failed to init regmap: %d\n", ret); + return ret; + } + + if (!IS_ENABLED(CONFIG_THERMAL_OF)) + return 0; + + tzd = devm_thermal_zone_of_sensor_register(&pdev->dev, 0, info, + &sun4i_ts_tz_ops); + if (IS_ERR(tzd)) + dev_err(&pdev->dev, "could not register thermal sensor: %ld\n", + PTR_ERR(tzd)); + + return PTR_ERR_OR_ZERO(tzd); +} + static int sun4i_gpadc_probe_mfd(struct platform_device *pdev, struct iio_dev *indio_dev) { @@ -462,6 +549,7 @@ static int sun4i_gpadc_probe_mfd(struct platform_device *pdev, dev_get_drvdata(pdev->dev.parent); int ret; + info->no_irq = false; info->regmap = sun4i_gpadc_dev->regmap; indio_dev->num_channels = ARRAY_SIZE(sun4i_gpadc_channels); @@ -561,7 +649,11 @@ static int sun4i_gpadc_probe(struct platform_device *pdev) indio_dev->info = &sun4i_gpadc_iio_info; indio_dev->modes = INDIO_DIRECT_MODE; - ret = sun4i_gpadc_probe_mfd(pdev, indio_dev); + if (pdev->dev.of_node) + ret = sun4i_gpadc_probe_dt(pdev, indio_dev); + else + ret = sun4i_gpadc_probe_mfd(pdev, indio_dev); + if (ret) return ret; @@ -580,7 +672,7 @@ static int sun4i_gpadc_probe(struct platform_device *pdev) return 0; err_map: - if (IS_ENABLED(CONFIG_THERMAL_OF)) + if (!info->no_irq && IS_ENABLED(CONFIG_THERMAL_OF)) iio_map_array_unregister(indio_dev); pm_runtime_put(&pdev->dev); @@ -592,10 +684,11 @@ err_map: static int sun4i_gpadc_remove(struct platform_device *pdev) { struct iio_dev *indio_dev = platform_get_drvdata(pdev); + struct sun4i_gpadc_iio *info = iio_priv(indio_dev); pm_runtime_put(&pdev->dev); pm_runtime_disable(&pdev->dev); - if (IS_ENABLED(CONFIG_THERMAL_OF)) + if (!info->no_irq && IS_ENABLED(CONFIG_THERMAL_OF)) iio_map_array_unregister(indio_dev); return 0; @@ -611,6 +704,7 @@ static const struct platform_device_id sun4i_gpadc_id[] = { static struct platform_driver sun4i_gpadc_driver = { .driver = { .name = "sun4i-gpadc-iio", + .of_match_table = sun4i_gpadc_of_id, .pm = &sun4i_gpadc_pm_ops, }, .id_table = sun4i_gpadc_id, diff --git a/include/linux/mfd/sun4i-gpadc.h b/include/linux/mfd/sun4i-gpadc.h index 509e736d27fb..139872c2e0fe 100644 --- 
a/include/linux/mfd/sun4i-gpadc.h +++ b/include/linux/mfd/sun4i-gpadc.h @@ -38,6 +38,10 @@ #define SUN6I_GPADC_CTRL1_ADC_CHAN_SELECT(x) (GENMASK(3, 0) & BIT(x)) #define SUN6I_GPADC_CTRL1_ADC_CHAN_MASK GENMASK(3, 0) +/* TP_CTRL1 bits for sun8i SoCs */ +#define SUN8I_GPADC_CTRL1_CHOP_TEMP_EN BIT(8) +#define SUN8I_GPADC_CTRL1_GPADC_CALI_EN BIT(7) + #define SUN4I_GPADC_CTRL2 0x08 #define SUN4I_GPADC_CTRL2_TP_SENSITIVE_ADJUST(x) ((GENMASK(3, 0) & (x)) << 28) -- cgit v1.2.3 From 6fe729c4bdae41b4c5a5ff21312f021a48c69399 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Tue, 4 Apr 2017 07:22:33 -0700 Subject: serdev: Add serdev_device_write subroutine Add serdev_device_write(), a blocking call that allows transferring an arbitrary amount of data (potentially exceeding the amount that serdev_device_write_buf() can process in a single call). To support that, also add serdev_device_write_wakeup(). Drivers wanting to use the full extent of serdev_device_write()'s functionality are expected to provide serdev_device_write_wakeup() as the sole handler of the .write_wakeup event, or to call it as part of the driver's custom .write_wakeup code. Because serdev_device_write() is a superset of serdev_device_write_buf(), the patch re-implements the latter in terms of the former. For drivers wanting to use just serdev_device_write_buf(), the .write_wakeup handler is optional. Cc: cphealy@gmail.com Cc: Guenter Roeck Cc: linux-serial@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Rob Herring Reviewed-by: Andy Shevchenko Signed-off-by: Andrey Smirnov Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serdev/core.c | 36 +++++++++++++++++++++++++++++++----- include/linux/serdev.h | 17 +++++++++++++++-- 2 files changed, 46 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serdev/core.c b/drivers/tty/serdev/core.c index f4c6c90add78..6701d1011fce 100644 --- a/drivers/tty/serdev/core.c +++ b/drivers/tty/serdev/core.c @@ -116,17 +116,41 @@ void serdev_device_close(struct serdev_device *serdev) } EXPORT_SYMBOL_GPL(serdev_device_close); -int serdev_device_write_buf(struct serdev_device *serdev, - const unsigned char *buf, size_t count) +void serdev_device_write_wakeup(struct serdev_device *serdev) +{ + complete(&serdev->write_comp); +} +EXPORT_SYMBOL_GPL(serdev_device_write_wakeup); + +int serdev_device_write(struct serdev_device *serdev, + const unsigned char *buf, size_t count, + unsigned long timeout) { struct serdev_controller *ctrl = serdev->ctrl; + int ret; - if (!ctrl || !ctrl->ops->write_buf) + if (!ctrl || !ctrl->ops->write_buf || + (timeout && !serdev->ops->write_wakeup)) return -EINVAL; - return ctrl->ops->write_buf(ctrl, buf, count); + mutex_lock(&serdev->write_lock); + do { + reinit_completion(&serdev->write_comp); + + ret = ctrl->ops->write_buf(ctrl, buf, count); + if (ret < 0) + break; + + buf += ret; + count -= ret; + + } while (count && + (timeout = wait_for_completion_timeout(&serdev->write_comp, + timeout))); + mutex_unlock(&serdev->write_lock); + return ret < 0 ? ret : (count ?
-ETIMEDOUT : 0); } -EXPORT_SYMBOL_GPL(serdev_device_write_buf); +EXPORT_SYMBOL_GPL(serdev_device_write); void serdev_device_write_flush(struct serdev_device *serdev) { @@ -232,6 +256,8 @@ struct serdev_device *serdev_device_alloc(struct serdev_controller *ctrl) serdev->dev.parent = &ctrl->dev; serdev->dev.bus = &serdev_bus_type; serdev->dev.type = &serdev_device_type; + init_completion(&serdev->write_comp); + mutex_init(&serdev->write_lock); return serdev; } EXPORT_SYMBOL_GPL(serdev_device_alloc); diff --git a/include/linux/serdev.h b/include/linux/serdev.h index 5176cdc2057f..0beaff886992 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -39,12 +39,16 @@ struct serdev_device_ops { * @nr: Device number on serdev bus. * @ctrl: serdev controller managing this device. * @ops: Device operations. + * @write_comp Completion used by serdev_device_write() internally + * @write_lock Lock to serialize access when writing data */ struct serdev_device { struct device dev; int nr; struct serdev_controller *ctrl; const struct serdev_device_ops *ops; + struct completion write_comp; + struct mutex write_lock; }; static inline struct serdev_device *to_serdev_device(struct device *d) @@ -186,7 +190,8 @@ int serdev_device_open(struct serdev_device *); void serdev_device_close(struct serdev_device *); unsigned int serdev_device_set_baudrate(struct serdev_device *, unsigned int); void serdev_device_set_flow_control(struct serdev_device *, bool); -int serdev_device_write_buf(struct serdev_device *, const unsigned char *, size_t); +void serdev_device_write_wakeup(struct serdev_device *); +int serdev_device_write(struct serdev_device *, const unsigned char *, size_t, unsigned long); void serdev_device_write_flush(struct serdev_device *); int serdev_device_write_room(struct serdev_device *); @@ -223,7 +228,8 @@ static inline unsigned int serdev_device_set_baudrate(struct serdev_device *sdev return 0; } static inline void serdev_device_set_flow_control(struct serdev_device *sdev, bool enable) {} -static inline int serdev_device_write_buf(struct serdev_device *sdev, const unsigned char *buf, size_t count) +static inline int serdev_device_write(struct serdev_device *sdev, const unsigned char *buf, + size_t count, unsigned long timeout) { return -ENODEV; } @@ -259,4 +265,11 @@ static inline struct device *serdev_tty_port_register(struct tty_port *port, static inline void serdev_tty_port_unregister(struct tty_port *port) {} #endif /* CONFIG_SERIAL_DEV_CTRL_TTYPORT */ +static inline int serdev_device_write_buf(struct serdev_device *serdev, + const unsigned char *data, + size_t count) +{ + return serdev_device_write(serdev, data, count, 0); +} + #endif /*_LINUX_SERDEV_H */ -- cgit v1.2.3 From 2e94d5ae5da1d2e798045a53b5e234a42b090908 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 31 Mar 2017 21:35:17 +0300 Subject: serial: core: constify struct uart_port {name} field Don't allow modifications of port name. It's serial core's business only. 
Signed-off-by: Andy Shevchenko Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 60530678c633..64d892f1e5cd 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -247,7 +247,7 @@ struct uart_port { unsigned char suspended; unsigned char irq_wake; unsigned char unused[2]; - char *name; /* port name */ + const char *name; /* port name */ struct attribute_group *attr_group; /* port specific attributes */ const struct attribute_group **tty_groups; /* all attributes (serial core use only) */ struct serial_rs485 rs485; -- cgit v1.2.3 From 1d62ac13634840e02f9b20df9d8e21204f9ab8b8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:21:00 +0200 Subject: block: renumber REQ_OP_WRITE_ZEROES Make life easy for implementations that need to send a data buffer to the device (e.g. SCSI) by numbering it as a data out command. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 72aa9519167e..c5bae0a669d1 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -174,7 +174,7 @@ enum req_opf { /* write the same sector many times */ REQ_OP_WRITE_SAME = 7, /* write the zero filled sector many times */ - REQ_OP_WRITE_ZEROES = 8, + REQ_OP_WRITE_ZEROES = 9, /* SCSI passthrough using struct scsi_request */ REQ_OP_SCSI_IN = 32, -- cgit v1.2.3 From ac62d6208a7977107a47be4eb8566d6e5034b5f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:21:05 +0200 Subject: dm: support REQ_OP_WRITE_ZEROES Copy & paste from the REQ_OP_WRITE_SAME code.
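For a target author the opt-in mirrors WRITE SAME: declare in the constructor how many WRITE ZEROES bios the target wants per invocation. A hedged sketch with a hypothetical target (the dm-linear hunk below does the same for real):

#include <linux/device-mapper.h>

static int example_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	/* Target-specific argument parsing and setup elided. */

	ti->num_flush_bios = 1;
	ti->num_discard_bios = 1;
	ti->num_write_same_bios = 1;
	/* New with this patch: route REQ_OP_WRITE_ZEROES bios here too. */
	ti->num_write_zeroes_bios = 1;

	return 0;
}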
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/md/dm-core.h | 1 + drivers/md/dm-io.c | 8 ++++++-- drivers/md/dm-linear.c | 1 + drivers/md/dm-mpath.c | 1 + drivers/md/dm-rq.c | 11 ++++++++--- drivers/md/dm-stripe.c | 2 ++ drivers/md/dm-table.c | 30 ++++++++++++++++++++++++++++++ drivers/md/dm.c | 31 ++++++++++++++++++++++++++++--- include/linux/device-mapper.h | 6 ++++++ 9 files changed, 83 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 136fda3ff9e5..fea5bd52ada8 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -132,6 +132,7 @@ void dm_init_md_queue(struct mapped_device *md); void dm_init_normal_md_queue(struct mapped_device *md); int md_in_flight(struct mapped_device *md); void disable_write_same(struct mapped_device *md); +void disable_write_zeroes(struct mapped_device *md); static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) { diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index b808cbe22678..3702e502466d 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -312,9 +312,12 @@ static void do_region(int op, int op_flags, unsigned region, */ if (op == REQ_OP_DISCARD) special_cmd_max_sectors = q->limits.max_discard_sectors; + else if (op == REQ_OP_WRITE_ZEROES) + special_cmd_max_sectors = q->limits.max_write_zeroes_sectors; else if (op == REQ_OP_WRITE_SAME) special_cmd_max_sectors = q->limits.max_write_same_sectors; - if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_SAME) && + if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES || + op == REQ_OP_WRITE_SAME) && special_cmd_max_sectors == 0) { dec_count(io, region, -EOPNOTSUPP); return; @@ -330,6 +333,7 @@ static void do_region(int op, int op_flags, unsigned region, */ switch (op) { case REQ_OP_DISCARD: + case REQ_OP_WRITE_ZEROES: num_bvecs = 0; break; case REQ_OP_WRITE_SAME: @@ -347,7 +351,7 @@ static void do_region(int op, int op_flags, unsigned region, bio_set_op_attrs(bio, op, op_flags); store_io_and_region_in_bio(bio, io, region); - if (op == REQ_OP_DISCARD) { + if (op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES) { num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining); bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT; remaining -= num_sectors; diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 4788b0b989a9..e17fd44ceef5 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -59,6 +59,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->num_write_same_bios = 1; + ti->num_write_zeroes_bios = 1; ti->private = lc; return 0; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 7f223dbed49f..ab55955ed704 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1103,6 +1103,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->num_write_same_bios = 1; + ti->num_write_zeroes_bios = 1; if (m->queue_mode == DM_TYPE_BIO_BASED) ti->per_io_data_size = multipath_per_bio_data_size(); else diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index d19af1d21f4c..e60f1b6845be 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -298,9 +298,14 @@ static void dm_done(struct request *clone, int error, bool mapped) r = rq_end_io(tio->ti, clone, error, &tio->info); } - if (unlikely(r == -EREMOTEIO && (req_op(clone) == REQ_OP_WRITE_SAME) && - 
!clone->q->limits.max_write_same_sectors)) - disable_write_same(tio->md); + if (unlikely(r == -EREMOTEIO)) { + if (req_op(clone) == REQ_OP_WRITE_SAME && + !clone->q->limits.max_write_same_sectors) + disable_write_same(tio->md); + if (req_op(clone) == REQ_OP_WRITE_ZEROES && + !clone->q->limits.max_write_zeroes_sectors) + disable_write_zeroes(tio->md); + } if (r <= 0) /* The target wants to complete the I/O */ diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 28193a57bf47..5ef49c121d99 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -169,6 +169,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_bios = stripes; ti->num_discard_bios = stripes; ti->num_write_same_bios = stripes; + ti->num_write_zeroes_bios = stripes; sc->chunk_size = chunk_size; if (chunk_size & (chunk_size - 1)) @@ -293,6 +294,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } if (unlikely(bio_op(bio) == REQ_OP_DISCARD) || + unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES) || unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) { target_bio_nr = dm_bio_get_target_bio_nr(bio); BUG_ON(target_bio_nr >= sc->stripes); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 3ad16d9c9d5a..5cd665c91ead 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1533,6 +1533,34 @@ static bool dm_table_supports_write_same(struct dm_table *t) return true; } +static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + + return q && !q->limits.max_write_zeroes_sectors; +} + +static bool dm_table_supports_write_zeroes(struct dm_table *t) +{ + struct dm_target *ti; + unsigned i = 0; + + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + + if (!ti->num_write_zeroes_bios) + return false; + + if (!ti->type->iterate_devices || + ti->type->iterate_devices(ti, device_not_write_zeroes_capable, NULL)) + return false; + } + + return true; +} + + static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1603,6 +1631,8 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (!dm_table_supports_write_same(t)) q->limits.max_write_same_sectors = 0; + if (!dm_table_supports_write_zeroes(t)) + q->limits.max_write_zeroes_sectors = 0; if (dm_table_all_devices_attribute(t, queue_supports_sg_merge)) queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index cd93a3b9ceca..8bf397729bbd 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -824,6 +824,14 @@ void disable_write_same(struct mapped_device *md) limits->max_write_same_sectors = 0; } +void disable_write_zeroes(struct mapped_device *md) +{ + struct queue_limits *limits = dm_get_queue_limits(md); + + /* device doesn't really support WRITE ZEROES, disable it */ + limits->max_write_zeroes_sectors = 0; +} + static void clone_endio(struct bio *bio) { int error = bio->bi_error; @@ -850,9 +858,14 @@ static void clone_endio(struct bio *bio) } } - if (unlikely(r == -EREMOTEIO && (bio_op(bio) == REQ_OP_WRITE_SAME) && - !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)) - disable_write_same(md); + if (unlikely(r == -EREMOTEIO)) { + if (bio_op(bio) == REQ_OP_WRITE_SAME && + !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) + disable_write_same(md); + if (bio_op(bio) == 
REQ_OP_WRITE_ZEROES && + !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors) + disable_write_zeroes(md); + } free_tio(tio); dec_pending(io, error); @@ -1201,6 +1214,11 @@ static unsigned get_num_write_same_bios(struct dm_target *ti) return ti->num_write_same_bios; } +static unsigned get_num_write_zeroes_bios(struct dm_target *ti) +{ + return ti->num_write_zeroes_bios; +} + typedef bool (*is_split_required_fn)(struct dm_target *ti); static bool is_split_required_for_discard(struct dm_target *ti) @@ -1255,6 +1273,11 @@ static int __send_write_same(struct clone_info *ci) return __send_changing_extent_only(ci, get_num_write_same_bios, NULL); } +static int __send_write_zeroes(struct clone_info *ci) +{ + return __send_changing_extent_only(ci, get_num_write_zeroes_bios, NULL); +} + /* * Select the correct strategy for processing a non-flush bio. */ @@ -1269,6 +1292,8 @@ static int __split_and_process_non_flush(struct clone_info *ci) return __send_discard(ci); else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) return __send_write_same(ci); + else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES)) + return __send_write_zeroes(ci); ti = dm_table_find_target(ci->map, ci->sector); if (!dm_target_is_valid(ti)) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index a7e6903866fd..3829bee2302a 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -254,6 +254,12 @@ struct dm_target { */ unsigned num_write_same_bios; + /* + * The number of WRITE ZEROES bios that will be submitted to the target. + * The bio number can be accessed with dm_bio_get_target_bio_nr. + */ + unsigned num_write_zeroes_bios; + /* * The minimum number of extra bytes allocated in each io for the * target to use. -- cgit v1.2.3 From ee472d835c264a4cb77f8cf878603e1e40f3559e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:21:08 +0200 Subject: block: add a flags argument to (__)blkdev_issue_zeroout Turn the existing discard flag into a new BLKDEV_ZERO_UNMAP flag with similar semantics, but without referring to discard. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-lib.c | 31 ++++++++++++++----------------- block/ioctl.c | 2 +- drivers/block/drbd/drbd_receiver.c | 9 ++++++--- drivers/nvme/target/io-cmd.c | 2 +- fs/block_dev.c | 2 +- fs/dax.c | 2 +- fs/xfs/xfs_bmap_util.c | 2 +- include/linux/blkdev.h | 16 ++++++++++------ 8 files changed, 35 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/block/blk-lib.c b/block/blk-lib.c index 2a8d638544a7..f9f24ec69c27 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -282,14 +282,18 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) * @biop: pointer to anchor bio - * @discard: discard flag + * @flags: controls detailed behavior * * Description: - * Generate and issue number of bios with zerofiled pages. + * Zero-fill a block range, either using hardware offload or by explicitly + * writing zeroes to the device. + * + * If a device is using logical block provisioning, the underlying space will + * not be released if %flags contains BLKDEV_ZERO_NOUNMAP.
*/ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, - bool discard) + unsigned flags) { int ret; int bi_size = 0; @@ -337,28 +341,21 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout); * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) - * @discard: whether to discard the block range + * @flags: controls detailed behavior * * Description: - * Zero-fill a block range. If the discard flag is set and the block - * device guarantees that subsequent READ operations to the block range - * in question will return zeroes, the blocks will be discarded. Should - * the discard request fail, if the discard flag is not set, or if - * discard_zeroes_data is not supported, this function will resort to - * zeroing the blocks manually, thus provisioning (allocating, - * anchoring) them. If the block device supports WRITE ZEROES or WRITE SAME - * command(s), blkdev_issue_zeroout() will use it to optimize the process of - * clearing the block range. Otherwise the zeroing will be performed - * using regular WRITE calls. + * Zero-fill a block range, either using hardware offload or by explicitly + * writing zeroes to the device. See __blkdev_issue_zeroout() for the + * valid values for %flags. */ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, bool discard) + sector_t nr_sects, gfp_t gfp_mask, unsigned flags) { int ret; struct bio *bio = NULL; struct blk_plug plug; - if (discard) { + if (!(flags & BLKDEV_ZERO_NOUNMAP)) { if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, BLKDEV_DISCARD_ZERO)) return 0; @@ -366,7 +363,7 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, blk_start_plug(&plug); ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, - &bio, discard); + &bio, flags); if (ret == 0 && bio) { ret = submit_bio_wait(bio); bio_put(bio); diff --git a/block/ioctl.c b/block/ioctl.c index 7b88820b93d9..8ea00a41be01 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -255,7 +255,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, truncate_inode_pages_range(mapping, start, end); return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, - false); + BLKDEV_ZERO_NOUNMAP); } static int put_ushort(unsigned long arg, unsigned short val) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index aa6bf9692eff..dc9a6dcd431c 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1499,19 +1499,22 @@ int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, u tmp = start + granularity - sector_div(tmp, granularity); nr = tmp - start; - err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0); + err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, + BLKDEV_ZERO_NOUNMAP); nr_sectors -= nr; start = tmp; } while (nr_sectors >= granularity) { nr = min_t(sector_t, nr_sectors, max_discard_sectors); - err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0); + err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, + BLKDEV_ZERO_NOUNMAP); nr_sectors -= nr; start += nr; } zero_out: if (nr_sectors) { - err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO, 0); + err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO, + BLKDEV_ZERO_NOUNMAP); } return err != 0; } diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index a4b455156bb5..c77940d80fc8 
100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -184,7 +184,7 @@ static void nvmet_execute_write_zeroes(struct nvmet_req *req) (req->ns->blksize_shift - 9)) + 1; if (__blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector, - GFP_KERNEL, &bio, true)) + GFP_KERNEL, &bio, 0)) status = NVME_SC_INTERNAL | NVME_SC_DNR; if (bio) { diff --git a/fs/block_dev.c b/fs/block_dev.c index f2d59f143ef4..2f704c3a816f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -2105,7 +2105,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, - GFP_KERNEL, false); + GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: /* Only punch if the device can do zeroing discard. */ diff --git a/fs/dax.c b/fs/dax.c index de622d4282a6..2bfbcd726047 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -982,7 +982,7 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector, sector_t start_sector = dax.sector + (offset >> 9); return blkdev_issue_zeroout(bdev, start_sector, - length >> 9, GFP_NOFS, true); + length >> 9, GFP_NOFS, 0); } else { if (dax_map_atomic(bdev, &dax) < 0) return PTR_ERR(dax.addr); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 8b75dcea5966..142bbbe06114 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -81,7 +81,7 @@ xfs_zero_extent( return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)), block << (mp->m_super->s_blocksize_bits - 9), count_fsb << (mp->m_super->s_blocksize_bits - 9), - GFP_NOFS, true); + GFP_NOFS, 0); } int diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d76bebbc632e..bd60f4401c9d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1336,23 +1336,27 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, return bqt->tag_index[tag]; } +extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); +extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct page *page); #define BLKDEV_DISCARD_SECURE (1 << 0) /* issue a secure erase */ #define BLKDEV_DISCARD_ZERO (1 << 1) /* must reliably zero data */ -extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, int flags, struct bio **biop); -extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, struct page *page); + +#define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */ + extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, - bool discard); + unsigned flags); extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, bool discard); + sector_t nr_sects, gfp_t gfp_mask, unsigned flags); + static inline int sb_issue_discard(struct super_block *sb, sector_t block, sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags) { @@ -1366,7 +1370,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, return blkdev_issue_zeroout(sb->s_bdev, block << (sb->s_blocksize_bits - 9), 
nr_blocks << (sb->s_blocksize_bits - 9), - gfp_mask, true); + gfp_mask, 0); } extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); -- cgit v1.2.3 From d928be9f853b9755692d7e9aed402c1809a88e56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:21:09 +0200 Subject: block: add a REQ_NOUNMAP flag for REQ_OP_WRITE_ZEROES If this flag is set, a logical-provisioning-capable device should release space for the zeroed blocks if possible; if it is not set, devices should keep the blocks anchored. Also remove an out-of-sync kerneldoc comment for a static function that would have become even more out of date with this change. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-lib.c | 19 +++++-------------- include/linux/blk_types.h | 6 ++++++ 2 files changed, 11 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/block/blk-lib.c b/block/blk-lib.c index f9f24ec69c27..2f6d2cb2e1a2 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -226,20 +226,9 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL(blkdev_issue_write_same); -/** - * __blkdev_issue_write_zeroes - generate number of bios with WRITE ZEROES - * @bdev: blockdev to issue - * @sector: start sector - * @nr_sects: number of sectors to write - * @gfp_mask: memory allocation flags (for bio_alloc) - * @biop: pointer to anchor bio - * - * Description: - * Generate and issue number of bios(REQ_OP_WRITE_ZEROES) with zerofiled pages. - */ static int __blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, - struct bio **biop) + struct bio **biop, unsigned flags) { struct bio *bio = *biop; unsigned int max_write_zeroes_sectors; @@ -258,7 +247,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, bio = next_bio(bio, 0, gfp_mask); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE_ZEROES, 0); + bio->bi_opf = REQ_OP_WRITE_ZEROES; + if (flags & BLKDEV_ZERO_NOUNMAP) + bio->bi_opf |= REQ_NOUNMAP; if (nr_sects > max_write_zeroes_sectors) { bio->bi_iter.bi_size = max_write_zeroes_sectors << 9; @@ -306,7 +297,7 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, return -EINVAL; ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, - biop); + biop, flags); if (ret == 0 || (ret && ret != -EOPNOTSUPP)) goto out; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c5bae0a669d1..61339bc44400 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -201,6 +201,10 @@ enum req_flag_bits { __REQ_PREFLUSH, /* request for cache flush */ __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ + + /* command specific flags for REQ_OP_WRITE_ZEROES: */ + __REQ_NOUNMAP, /* do not free blocks when zeroing */ + __REQ_NR_BITS, /* stops here */ }; @@ -218,6 +222,8 @@ enum req_flag_bits { #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) +#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) + #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -- cgit v1.2.3 From cb365b9675fda026caba4cb5df83292cb7c0811a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:21:10 +0200 Subject: block: add a new BLKDEV_ZERO_NOFALLBACK flag This avoids fallbacks to explicit zeroing in (__)blkdev_issue_zeroout if the
caller doesn't want them. Also clean up the convoluted check for the return condition that this new flag is added to. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-lib.c | 5 ++++- include/linux/blkdev.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/block/blk-lib.c b/block/blk-lib.c index 2f6d2cb2e1a2..2f882e22890b 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -281,6 +281,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, * * If a device is using logical block provisioning, the underlying space will * not be released if %flags contains BLKDEV_ZERO_NOUNMAP. + * + * If %flags contains BLKDEV_ZERO_NOFALLBACK, the function will return + * -EOPNOTSUPP if no explicit hardware offload for zeroing is provided. */ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, @@ -298,7 +301,7 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, biop, flags); - if (ret == 0 || (ret && ret != -EOPNOTSUPP)) + if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK)) goto out; ret = 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bd60f4401c9d..21a30f011674 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1350,6 +1350,7 @@ extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct bio **biop); #define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */ +#define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */ extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, -- cgit v1.2.3 From 48920ff2a5a940cd07d12cc79e4a2c75f1185aee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:21:23 +0200 Subject: block: remove the discard_zeroes_data flag Now that we use the proper REQ_OP_WRITE_ZEROES operation everywhere we can kill this hack. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block | 10 ++----- Documentation/block/queue-sysfs.txt | 5 ---- block/blk-lib.c | 7 +---- block/blk-settings.c | 3 --- block/blk-sysfs.c | 2 +- block/compat_ioctl.c | 2 +- block/ioctl.c | 2 +- drivers/block/drbd/drbd_main.c | 2 -- drivers/block/drbd/drbd_nl.c | 7 +---- drivers/block/loop.c | 2 -- drivers/block/mtip32xx/mtip32xx.c | 1 - drivers/block/nbd.c | 1 - drivers/md/dm-cache-target.c | 1 - drivers/md/dm-crypt.c | 1 - drivers/md/dm-raid.c | 6 ++--- drivers/md/dm-raid1.c | 1 - drivers/md/dm-table.c | 19 ------------- drivers/md/dm-thin.c | 2 -- drivers/md/raid5.c | 50 +++++++++++------------------------ drivers/scsi/sd.c | 5 ---- drivers/target/target_core_device.c | 2 +- include/linux/blkdev.h | 15 ----------- include/linux/device-mapper.h | 5 ---- 23 files changed, 27 insertions(+), 124 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index 2da04ce6aeef..dea212db9df3 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -213,14 +213,8 @@ What: /sys/block//queue/discard_zeroes_data Date: May 2011 Contact: Martin K. 
Petersen Description: - Devices that support discard functionality may return - stale or random data when a previously discarded block - is read back. This can cause problems if the filesystem - expects discarded blocks to be explicitly cleared. If a - device reports that it deterministically returns zeroes - when a discarded area is read the discard_zeroes_data - parameter will be set to one. Otherwise it will be 0 and - the result of reading a discarded area is undefined. + Will always return 0. Don't rely on any specific behavior + for discards, and don't read this file. What: /sys/block//queue/write_same_max_bytes Date: January 2012 diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index b7f6bdc96d73..2c1e67058fd3 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -43,11 +43,6 @@ large discards are issued, setting this value lower will make Linux issue smaller discards and potentially help reduce latencies induced by large discard operations. -discard_zeroes_data (RO) ------------------------- -When read, this file will show if the discarded block are zeroed by the -device or not. If its value is '1' the blocks are zeroed otherwise not. - hw_sector_size (RO) ------------------- This is the hardware sector size of the device, in bytes. diff --git a/block/blk-lib.c b/block/blk-lib.c index b0c6c4bcf441..e8caecd71688 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -37,17 +37,12 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, return -ENXIO; if (flags & BLKDEV_DISCARD_SECURE) { - if (flags & BLKDEV_DISCARD_ZERO) - return -EOPNOTSUPP; if (!blk_queue_secure_erase(q)) return -EOPNOTSUPP; op = REQ_OP_SECURE_ERASE; } else { if (!blk_queue_discard(q)) return -EOPNOTSUPP; - if ((flags & BLKDEV_DISCARD_ZERO) && - !q->limits.discard_zeroes_data) - return -EOPNOTSUPP; op = REQ_OP_DISCARD; } @@ -126,7 +121,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, &bio); if (!ret && bio) { ret = submit_bio_wait(bio); - if (ret == -EOPNOTSUPP && !(flags & BLKDEV_DISCARD_ZERO)) + if (ret == -EOPNOTSUPP) ret = 0; bio_put(bio); } diff --git a/block/blk-settings.c b/block/blk-settings.c index 1e7174ffc9d4..4fa81ed383ca 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -103,7 +103,6 @@ void blk_set_default_limits(struct queue_limits *lim) lim->discard_granularity = 0; lim->discard_alignment = 0; lim->discard_misaligned = 0; - lim->discard_zeroes_data = 0; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); lim->alignment_offset = 0; @@ -127,7 +126,6 @@ void blk_set_stacking_limits(struct queue_limits *lim) blk_set_default_limits(lim); /* Inherit limits from component devices */ - lim->discard_zeroes_data = 1; lim->max_segments = USHRT_MAX; lim->max_discard_segments = 1; lim->max_hw_sectors = UINT_MAX; @@ -609,7 +607,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->io_opt = lcm_not_zero(t->io_opt, b->io_opt); t->cluster &= b->cluster; - t->discard_zeroes_data &= b->discard_zeroes_data; /* Physical block size a multiple of the logical block size? 
*/ if (t->physical_block_size & (t->logical_block_size - 1)) { diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index c47db43a40cc..fc20489f0d2b 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -208,7 +208,7 @@ static ssize_t queue_discard_max_store(struct request_queue *q, static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) { - return queue_var_show(queue_discard_zeroes_data(q), page); + return queue_var_show(0, page); } static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 570021a0dc1c..04325b81c2b4 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -685,7 +685,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKALIGNOFF: return compat_put_int(arg, bdev_alignment_offset(bdev)); case BLKDISCARDZEROES: - return compat_put_uint(arg, bdev_discard_zeroes_data(bdev)); + return compat_put_uint(arg, 0); case BLKFLSBUF: case BLKROSET: case BLKDISCARD: diff --git a/block/ioctl.c b/block/ioctl.c index 8ea00a41be01..0de02ee67eed 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -547,7 +547,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKALIGNOFF: return put_int(arg, bdev_alignment_offset(bdev)); case BLKDISCARDZEROES: - return put_uint(arg, bdev_discard_zeroes_data(bdev)); + return put_uint(arg, 0); case BLKSECTGET: max_sectors = min_t(unsigned int, USHRT_MAX, queue_max_sectors(bdev_get_queue(bdev))); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 8e62d9f65510..84455c365f57 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -931,7 +931,6 @@ void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct r p->qlim->io_min = cpu_to_be32(queue_io_min(q)); p->qlim->io_opt = cpu_to_be32(queue_io_opt(q)); p->qlim->discard_enabled = blk_queue_discard(q); - p->qlim->discard_zeroes_data = queue_discard_zeroes_data(q); p->qlim->write_same_capable = !!q->limits.max_write_same_sectors; } else { q = device->rq_queue; @@ -941,7 +940,6 @@ void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct r p->qlim->io_min = cpu_to_be32(queue_io_min(q)); p->qlim->io_opt = cpu_to_be32(queue_io_opt(q)); p->qlim->discard_enabled = 0; - p->qlim->discard_zeroes_data = 0; p->qlim->write_same_capable = 0; } } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index e4516d3b971d..02255a0d68b9 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1199,10 +1199,6 @@ static void decide_on_discard_support(struct drbd_device *device, struct drbd_connection *connection = first_peer_device(device)->connection; bool can_do = b ? 
blk_queue_discard(b) : true; - if (can_do && b && !b->limits.discard_zeroes_data && !discard_zeroes_if_aligned) { - can_do = false; - drbd_info(device, "discard_zeroes_data=0 and discard_zeroes_if_aligned=no: disabling discards\n"); - } if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) { can_do = false; drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n"); @@ -1484,8 +1480,7 @@ static void sanitize_disk_conf(struct drbd_device *device, struct disk_conf *dis if (disk_conf->al_extents > drbd_al_extents_max(nbc)) disk_conf->al_extents = drbd_al_extents_max(nbc); - if (!blk_queue_discard(q) - || (!q->limits.discard_zeroes_data && !disk_conf->discard_zeroes_if_aligned)) { + if (!blk_queue_discard(q)) { if (disk_conf->rs_discard_granularity) { disk_conf->rs_discard_granularity = 0; /* disable feature */ drbd_info(device, "rs_discard_granularity feature disabled\n"); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 3bb04c1a4ba1..3081d83d2ea3 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -828,7 +828,6 @@ static void loop_config_discard(struct loop_device *lo) q->limits.discard_alignment = 0; blk_queue_max_discard_sectors(q, 0); blk_queue_max_write_zeroes_sectors(q, 0); - q->limits.discard_zeroes_data = 0; queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); return; } @@ -837,7 +836,6 @@ static void loop_config_discard(struct loop_device *lo) q->limits.discard_alignment = 0; blk_queue_max_discard_sectors(q, UINT_MAX >> 9); blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9); - q->limits.discard_zeroes_data = 1; queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); } diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 30076e7753bc..05e3e664ea1b 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -4025,7 +4025,6 @@ skip_create_disk: dd->queue->limits.discard_granularity = 4096; blk_queue_max_discard_sectors(dd->queue, MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES); - dd->queue->limits.discard_zeroes_data = 0; } /* Set the capacity of the device in 512 byte sectors. 
*/ diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 03ae72985c79..b02f2362fdf7 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1110,7 +1110,6 @@ static int nbd_dev_add(int index) queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue); disk->queue->limits.discard_granularity = 512; blk_queue_max_discard_sectors(disk->queue, UINT_MAX); - disk->queue->limits.discard_zeroes_data = 0; blk_queue_max_hw_sectors(disk->queue, 65536); disk->queue->limits.max_sectors = 256; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 9c689b34e6e7..975922c8f231 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2773,7 +2773,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) ti->num_discard_bios = 1; ti->discards_supported = true; - ti->discard_zeroes_data_unsupported = true; ti->split_discard_bios = false; cache->features = ca->features; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 389a3637ffcc..ef1d836bd81b 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2030,7 +2030,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) wake_up_process(cc->write_thread); ti->num_flush_bios = 1; - ti->discard_zeroes_data_unsupported = true; return 0; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index f8564d63982f..468f1380de1d 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2813,7 +2813,9 @@ static void configure_discard_support(struct raid_set *rs) /* Assume discards not supported until after checks below. */ ti->discards_supported = false; - /* RAID level 4,5,6 require discard_zeroes_data for data integrity! */ + /* + * XXX: RAID level 4,5,6 require zeroing for safety. + */ raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6); for (i = 0; i < rs->raid_disks; i++) { @@ -2827,8 +2829,6 @@ static void configure_discard_support(struct raid_set *rs) return; if (raid456) { - if (!q->limits.discard_zeroes_data) - return; if (!devices_handle_discard_safely) { DMERR("raid456 discard support disabled due to discard_zeroes_data uncertainty."); DMERR("Set dm-raid.devices_handle_discard_safely=Y to override."); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 2ddc2d20e62d..a95cbb80fb34 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1124,7 +1124,6 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->per_io_data_size = sizeof(struct dm_raid1_bio_record); - ti->discard_zeroes_data_unsupported = true; ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0); if (!ms->kmirrord_wq) { diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 5cd665c91ead..958275aca008 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1449,22 +1449,6 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush) return false; } -static bool dm_table_discard_zeroes_data(struct dm_table *t) -{ - struct dm_target *ti; - unsigned i = 0; - - /* Ensure that all targets supports discard_zeroes_data. 
*/ - while (i < dm_table_get_num_targets(t)) { - ti = dm_table_get_target(t, i++); - - if (ti->discard_zeroes_data_unsupported) - return false; - } - - return true; -} - static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1620,9 +1604,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } blk_queue_write_cache(q, wc, fua); - if (!dm_table_discard_zeroes_data(t)) - q->limits.discard_zeroes_data = 0; - /* Ensure that all underlying devices are non-rotational. */ if (dm_table_all_devices_attribute(t, device_is_nonrot)) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 2b266a2b5035..a5f1916f621a 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3263,7 +3263,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) * them down to the data device. The thin device's discard * processing will cause mappings to be removed from the btree. */ - ti->discard_zeroes_data_unsupported = true; if (pf.discard_enabled && pf.discard_passdown) { ti->num_discard_bios = 1; @@ -4119,7 +4118,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->per_io_data_size = sizeof(struct dm_thin_endio_hook); /* In case the pool supports discards, pass them on. */ - ti->discard_zeroes_data_unsupported = true; if (tc->pool->pf.discard_enabled) { ti->discards_supported = true; ti->num_discard_bios = 1; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1725a54042bb..2efdb0d67460 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7227,7 +7227,6 @@ static int raid5_run(struct mddev *mddev) if (mddev->queue) { int chunk_size; - bool discard_supported = true; /* read-ahead size must cover two whole stripes, which * is 2 * (datadisks) * chunksize where 'n' is the * number of raid devices @@ -7263,12 +7262,6 @@ static int raid5_run(struct mddev *mddev) blk_queue_max_discard_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS); - /* - * unaligned part of discard request will be ignored, so can't - * guarantee discard_zeroes_data - */ - mddev->queue->limits.discard_zeroes_data = 0; - blk_queue_max_write_same_sectors(mddev->queue, 0); blk_queue_max_write_zeroes_sectors(mddev->queue, 0); @@ -7277,35 +7270,24 @@ static int raid5_run(struct mddev *mddev) rdev->data_offset << 9); disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->new_data_offset << 9); - /* - * discard_zeroes_data is required, otherwise data - * could be lost. Consider a scenario: discard a stripe - * (the stripe could be inconsistent if - * discard_zeroes_data is 0); write one disk of the - * stripe (the stripe could be inconsistent again - * depending on which disks are used to calculate - * parity); the disk is broken; The stripe data of this - * disk is lost. - */ - if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || - !bdev_get_queue(rdev->bdev)-> - limits.discard_zeroes_data) - discard_supported = false; - /* Unfortunately, discard_zeroes_data is not currently - * a guarantee - just a hint. So we only allow DISCARD - * if the sysadmin has confirmed that only safe devices - * are in use by setting a module parameter. 
- */ - if (!devices_handle_discard_safely) { - if (discard_supported) { - pr_info("md/raid456: discard support disabled due to uncertainty.\n"); - pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n"); - } - discard_supported = false; - } } - if (discard_supported && + /* + * zeroing is required, otherwise data + * could be lost. Consider a scenario: discard a stripe + * (the stripe could be inconsistent if + * discard_zeroes_data is 0); write one disk of the + * stripe (the stripe could be inconsistent again + * depending on which disks are used to calculate + * parity); the disk is broken; The stripe data of this + * disk is lost. + * + * We only allow DISCARD if the sysadmin has confirmed that + * only safe devices are in use by setting a module parameter. + * A better idea might be to turn DISCARD into WRITE_ZEROES + * requests, as that is required to be safe. + */ + if (devices_handle_discard_safely && mddev->queue->limits.max_discard_sectors >= (stripe >> 9) && mddev->queue->limits.discard_granularity >= stripe) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 001593ed0444..bcb0cb020fd2 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -644,8 +644,6 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) unsigned int logical_block_size = sdkp->device->sector_size; unsigned int max_blocks = 0; - q->limits.discard_zeroes_data = 0; - /* * When LBPRZ is reported, discard alignment and granularity * must be fixed to the logical block size. Otherwise the block @@ -681,19 +679,16 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) case SD_LBP_WS16: max_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS16_BLOCKS); - q->limits.discard_zeroes_data = sdkp->lbprz; break; case SD_LBP_WS10: max_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS10_BLOCKS); - q->limits.discard_zeroes_data = sdkp->lbprz; break; case SD_LBP_ZERO: max_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS10_BLOCKS); - q->limits.discard_zeroes_data = 1; break; } diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index c754ae33bf7b..d2f089cfa9ae 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -851,7 +851,7 @@ bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib, attrib->unmap_granularity = q->limits.discard_granularity / block_size; attrib->unmap_granularity_alignment = q->limits.discard_alignment / block_size; - attrib->unmap_zeroes_data = q->limits.discard_zeroes_data; + attrib->unmap_zeroes_data = 0; return true; } EXPORT_SYMBOL(target_configure_unmap_from_queue); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 21a30f011674..ec993573e0a8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -339,7 +339,6 @@ struct queue_limits { unsigned char misaligned; unsigned char discard_misaligned; unsigned char cluster; - unsigned char discard_zeroes_data; unsigned char raid_partial_stripes_expensive; enum blk_zoned_model zoned; }; @@ -1341,7 +1340,6 @@ extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page); #define BLKDEV_DISCARD_SECURE (1 << 0) /* issue a secure erase */ -#define BLKDEV_DISCARD_ZERO (1 << 1) /* must reliably zero data */ extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); @@ -1541,19 +1539,6 
@@ static inline int bdev_discard_alignment(struct block_device *bdev) return q->limits.discard_alignment; } -static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) -{ - if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1) - return 1; - - return 0; -} - -static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev) -{ - return queue_discard_zeroes_data(bdev_get_queue(bdev)); -} - static inline unsigned int bdev_write_same(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 3829bee2302a..c7ea33e38fb9 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -296,11 +296,6 @@ struct dm_target { * on max_io_len boundary. */ bool split_discard_bios:1; - - /* - * Set if this target does not return zeroes on discarded blocks. - */ - bool discard_zeroes_data_unsupported:1; }; /* Each target can link one of these into the table */ -- cgit v1.2.3 From 7f564528a480084e2318cd48caba7aef4a54a77f Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Sat, 8 Apr 2017 20:36:24 +0200 Subject: skbuff: Extend gso_type to unsigned int. All available gso_type flags are currently in use, so extend gso_type from 'unsigned short' to 'unsigned int' to be able to add further flags. We reorder the struct skb_shared_info to use two bytes of the four byte hole before dataref. All fields before dataref are cleared, i.e. four bytes more than before the change. The remaining two byte hole is moved to the beginning of the structure; this protects us from immediate overwrites on out of bound writes to the sk_buff head. Structure layout on x86-64 before the change: struct skb_shared_info { unsigned char nr_frags; /* 0 1 */ __u8 tx_flags; /* 1 1 */ short unsigned int gso_size; /* 2 2 */ short unsigned int gso_segs; /* 4 2 */ short unsigned int gso_type; /* 6 2 */ struct sk_buff * frag_list; /* 8 8 */ struct skb_shared_hwtstamps hwtstamps; /* 16 8 */ u32 tskey; /* 24 4 */ __be32 ip6_frag_id; /* 28 4 */ atomic_t dataref; /* 32 4 */ /* XXX 4 bytes hole, try to pack */ void * destructor_arg; /* 40 8 */ skb_frag_t frags[17]; /* 48 272 */ /* --- cacheline 5 boundary (320 bytes) --- */ /* size: 320, cachelines: 5, members: 12 */ /* sum members: 316, holes: 1, sum holes: 4 */ }; Structure layout on x86-64 after the change: struct skb_shared_info { short unsigned int _unused; /* 0 2 */ unsigned char nr_frags; /* 2 1 */ __u8 tx_flags; /* 3 1 */ short unsigned int gso_size; /* 4 2 */ short unsigned int gso_segs; /* 6 2 */ struct sk_buff * frag_list; /* 8 8 */ struct skb_shared_hwtstamps hwtstamps; /* 16 8 */ unsigned int gso_type; /* 24 4 */ u32 tskey; /* 28 4 */ __be32 ip6_frag_id; /* 32 4 */ atomic_t dataref; /* 36 4 */ void * destructor_arg; /* 40 8 */ skb_frag_t frags[17]; /* 48 272 */ /* --- cacheline 5 boundary (320 bytes) --- */ /* size: 320, cachelines: 5, members: 13 */ }; Signed-off-by: Steffen Klassert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c776abd86937..741d75cfc686 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -413,14 +413,15 @@ struct ubuf_info { * the end of the header data, ie. at skb->end.
*/ struct skb_shared_info { + unsigned short _unused; unsigned char nr_frags; __u8 tx_flags; unsigned short gso_size; /* Warning: this field is not always filled in (UFO)! */ unsigned short gso_segs; - unsigned short gso_type; struct sk_buff *frag_list; struct skb_shared_hwtstamps hwtstamps; + unsigned int gso_type; u32 tskey; __be32 ip6_frag_id; -- cgit v1.2.3 From bf74b20d00b13919db7ae5d1015636e76f56f6ae Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 9 Apr 2017 14:45:21 -0700 Subject: Revert "rtnl: Add support for netdev event to link messages" This reverts commit def12888c161e6fec0702e5ec9c3962846e3a21d. As per discussion between Roopa Prabhu and David Ahern, it is advisable that we instead have the code collect the setlink triggered events into a bitmask emitted in the IFLA_EVENT netlink attribute. Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 3 +- include/uapi/linux/if_link.h | 21 ---------- net/core/dev.c | 2 +- net/core/rtnetlink.c | 92 +++++--------------------------------------- 4 files changed, 11 insertions(+), 107 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 0459018173cf..57e54847b0b9 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -18,8 +18,7 @@ extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, - unsigned change, unsigned long event, - gfp_t flags); + unsigned change, gfp_t flags); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 97f6d302f627..8b405afb2376 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -157,7 +157,6 @@ enum { IFLA_GSO_MAX_SIZE, IFLA_PAD, IFLA_XDP, - IFLA_EVENT, __IFLA_MAX }; @@ -900,24 +899,4 @@ enum { #define IFLA_XDP_MAX (__IFLA_XDP_MAX - 1) -enum { - IFLA_EVENT_UNSPEC, - IFLA_EVENT_REBOOT, - IFLA_EVENT_CHANGE_MTU, - IFLA_EVENT_CHANGE_ADDR, - IFLA_EVENT_CHANGE_NAME, - IFLA_EVENT_FEAT_CHANGE, - IFLA_EVENT_BONDING_FAILOVER, - IFLA_EVENT_POST_TYPE_CHANGE, - IFLA_EVENT_NOTIFY_PEERS, - IFLA_EVENT_CHANGE_UPPER, - IFLA_EVENT_RESEND_IGMP, - IFLA_EVENT_PRE_CHANGE_MTU, - IFLA_EVENT_CHANGE_INFO_DATA, - IFLA_EVENT_PRE_CHANGE_UPPER, - IFLA_EVENT_CHANGE_LOWER_STATE, - IFLA_EVENT_UDP_TUNNEL_PUSH_INFO, - IFLA_EVENT_CHANGE_TX_QUEUE_LEN, -}; - #endif /* _UAPI_LINUX_IF_LINK_H */ diff --git a/net/core/dev.c b/net/core/dev.c index 7efb4178ffef..ef9fe60ee294 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6840,7 +6840,7 @@ static void rollback_registered_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, GFP_KERNEL); /* diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b2bd4c9ee860..58419da7961b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -944,7 +944,6 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + rtnl_xdp_size(dev) /* IFLA_XDP */ - + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1277,70 +1276,9 @@ err_cancel: return err; } -static int 
rtnl_fill_link_event(struct sk_buff *skb, unsigned long event) -{ - u32 rtnl_event; - - switch (event) { - case NETDEV_REBOOT: - rtnl_event = IFLA_EVENT_REBOOT; - break; - case NETDEV_CHANGEMTU: - rtnl_event = IFLA_EVENT_CHANGE_MTU; - break; - case NETDEV_CHANGEADDR: - rtnl_event = IFLA_EVENT_CHANGE_ADDR; - break; - case NETDEV_CHANGENAME: - rtnl_event = IFLA_EVENT_CHANGE_NAME; - break; - case NETDEV_FEAT_CHANGE: - rtnl_event = IFLA_EVENT_FEAT_CHANGE; - break; - case NETDEV_BONDING_FAILOVER: - rtnl_event = IFLA_EVENT_BONDING_FAILOVER; - break; - case NETDEV_POST_TYPE_CHANGE: - rtnl_event = IFLA_EVENT_POST_TYPE_CHANGE; - break; - case NETDEV_NOTIFY_PEERS: - rtnl_event = IFLA_EVENT_NOTIFY_PEERS; - break; - case NETDEV_CHANGEUPPER: - rtnl_event = IFLA_EVENT_CHANGE_UPPER; - break; - case NETDEV_RESEND_IGMP: - rtnl_event = IFLA_EVENT_RESEND_IGMP; - break; - case NETDEV_PRECHANGEMTU: - rtnl_event = IFLA_EVENT_PRE_CHANGE_MTU; - break; - case NETDEV_CHANGEINFODATA: - rtnl_event = IFLA_EVENT_CHANGE_INFO_DATA; - break; - case NETDEV_PRECHANGEUPPER: - rtnl_event = IFLA_EVENT_PRE_CHANGE_UPPER; - break; - case NETDEV_CHANGELOWERSTATE: - rtnl_event = IFLA_EVENT_CHANGE_LOWER_STATE; - break; - case NETDEV_UDP_TUNNEL_PUSH_INFO: - rtnl_event = IFLA_EVENT_UDP_TUNNEL_PUSH_INFO; - break; - case NETDEV_CHANGE_TX_QUEUE_LEN: - rtnl_event = IFLA_EVENT_CHANGE_TX_QUEUE_LEN; - break; - default: - return 0; - } - - return nla_put_u32(skb, IFLA_EVENT, rtnl_event); -} - static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, - unsigned int flags, u32 ext_filter_mask, - unsigned long event) + unsigned int flags, u32 ext_filter_mask) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -1389,9 +1327,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) goto nla_put_failure; - if (rtnl_fill_link_event(skb, event)) - goto nla_put_failure; - if (rtnl_fill_link_ifmap(skb, dev)) goto nla_put_failure; @@ -1526,7 +1461,6 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, [IFLA_XDP] = { .type = NLA_NESTED }, - [IFLA_EVENT] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1685,7 +1619,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, - ext_filter_mask, 0); + ext_filter_mask); /* If we ran out of room on the first message, * we're in trouble */ @@ -2776,7 +2710,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh) return -ENOBUFS; err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0); + nlh->nlmsg_seq, 0, 0, ext_filter_mask); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -2848,8 +2782,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) } struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, - unsigned int change, - unsigned long event, gfp_t flags) + unsigned int change, gfp_t flags) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -2860,7 +2793,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, if (skb == NULL) goto errout; - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event); + err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0); if (err < 0) { /* 
-EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -2881,25 +2814,18 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); } -static void rtmsg_ifinfo_event(int type, struct net_device *dev, - unsigned int change, unsigned long event, - gfp_t flags) +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, + gfp_t flags) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; - skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags); + skb = rtmsg_ifinfo_build_skb(type, dev, change, flags); if (skb) rtmsg_ifinfo_send(skb, dev, flags); } - -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, - gfp_t flags) -{ - rtmsg_ifinfo_event(type, dev, change, 0, flags); -} EXPORT_SYMBOL(rtmsg_ifinfo); static int nlmsg_populate_fdb_fill(struct sk_buff *skb, @@ -4206,7 +4132,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_CHANGELOWERSTATE: case NETDEV_UDP_TUNNEL_PUSH_INFO: case NETDEV_CHANGE_TX_QUEUE_LEN: - rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, event, GFP_KERNEL); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); break; default: break; -- cgit v1.2.3 From 5812f0106c449533d0eea0b16a6244ec3d6d4abb Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 29 Mar 2017 13:55:57 +0530 Subject: phy: exynos4: Remove duplicated defines of PHY register defines Phy drivers access PMU region through regmap provided by exynos-pmu driver. However there is no need to duplicate defines for PMU registers. Instead just use whatever is defined in exynos-regs-pmu.h. Additionally MIPI PHY registers for Exynos5433 start from the same address as Exynos4 and Exynos5250 so re-use existing defines. This reduces number of defines and allows removal of one header file. 
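For reference, the single indexed define this patch settles on computes exactly the three addresses the removed per-instance Exynos5433 defines spelled out. A minimal sketch of the arithmetic (editor's illustration; values are taken from the hunks below, nothing new is introduced):

	/* one define covers every MIPI_PHYn_CONTROL instance, 4 bytes apart */
	#define EXYNOS4_MIPI_PHY_CONTROL(n)	(0x0710 + (n) * 4)

	/*
	 * EXYNOS4_MIPI_PHY_CONTROL(0) == 0x0710 (was EXYNOS5433_MIPI_PHY0_CONTROL)
	 * EXYNOS4_MIPI_PHY_CONTROL(1) == 0x0714 (was EXYNOS5433_MIPI_PHY1_CONTROL)
	 * EXYNOS4_MIPI_PHY_CONTROL(2) == 0x0718 (was EXYNOS5433_MIPI_PHY2_CONTROL)
	 */
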
Suggested-by: Marek Szyprowski Signed-off-by: Krzysztof Kozlowski Acked-by: Lee Jones Reviewed-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kishon Vijay Abraham I --- drivers/phy/phy-exynos-mipi-video.c | 12 ++++++------ include/linux/mfd/syscon/exynos5-pmu.h | 3 --- include/linux/soc/samsung/exynos-regs-pmu.h | 9 ++++++++- 3 files changed, 14 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/phy/phy-exynos-mipi-video.c b/drivers/phy/phy-exynos-mipi-video.c index 6bee04cc4d53..d7fe1f8c3ac8 100644 --- a/drivers/phy/phy-exynos-mipi-video.c +++ b/drivers/phy/phy-exynos-mipi-video.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -21,6 +20,7 @@ #include #include #include +#include #include enum exynos_mipi_phy_id { @@ -173,7 +173,7 @@ static const struct mipi_phy_device_desc exynos5433_mipi_phy = { /* EXYNOS_MIPI_PHY_ID_CSIS0 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_DSIM0, .enable_val = EXYNOS5_PHY_ENABLE, - .enable_reg = EXYNOS5433_MIPI_PHY0_CONTROL, + .enable_reg = EXYNOS4_MIPI_PHY_CONTROL(0), .enable_map = EXYNOS_MIPI_REGMAP_PMU, .resetn_val = BIT(0), .resetn_reg = EXYNOS5433_SYSREG_CAM0_MIPI_DPHY_CON, @@ -182,7 +182,7 @@ static const struct mipi_phy_device_desc exynos5433_mipi_phy = { /* EXYNOS_MIPI_PHY_ID_DSIM0 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_CSIS0, .enable_val = EXYNOS5_PHY_ENABLE, - .enable_reg = EXYNOS5433_MIPI_PHY0_CONTROL, + .enable_reg = EXYNOS4_MIPI_PHY_CONTROL(0), .enable_map = EXYNOS_MIPI_REGMAP_PMU, .resetn_val = BIT(0), .resetn_reg = EXYNOS5433_SYSREG_DISP_MIPI_PHY, @@ -191,7 +191,7 @@ static const struct mipi_phy_device_desc exynos5433_mipi_phy = { /* EXYNOS_MIPI_PHY_ID_CSIS1 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_NONE, .enable_val = EXYNOS5_PHY_ENABLE, - .enable_reg = EXYNOS5433_MIPI_PHY1_CONTROL, + .enable_reg = EXYNOS4_MIPI_PHY_CONTROL(1), .enable_map = EXYNOS_MIPI_REGMAP_PMU, .resetn_val = BIT(1), .resetn_reg = EXYNOS5433_SYSREG_CAM0_MIPI_DPHY_CON, @@ -200,7 +200,7 @@ static const struct mipi_phy_device_desc exynos5433_mipi_phy = { /* EXYNOS_MIPI_PHY_ID_DSIM1 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_NONE, .enable_val = EXYNOS5_PHY_ENABLE, - .enable_reg = EXYNOS5433_MIPI_PHY1_CONTROL, + .enable_reg = EXYNOS4_MIPI_PHY_CONTROL(1), .enable_map = EXYNOS_MIPI_REGMAP_PMU, .resetn_val = BIT(1), .resetn_reg = EXYNOS5433_SYSREG_DISP_MIPI_PHY, @@ -209,7 +209,7 @@ static const struct mipi_phy_device_desc exynos5433_mipi_phy = { /* EXYNOS_MIPI_PHY_ID_CSIS2 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_NONE, .enable_val = EXYNOS5_PHY_ENABLE, - .enable_reg = EXYNOS5433_MIPI_PHY2_CONTROL, + .enable_reg = EXYNOS4_MIPI_PHY_CONTROL(2), .enable_map = EXYNOS_MIPI_REGMAP_PMU, .resetn_val = BIT(0), .resetn_reg = EXYNOS5433_SYSREG_CAM1_MIPI_DPHY_CON, diff --git a/include/linux/mfd/syscon/exynos5-pmu.h b/include/linux/mfd/syscon/exynos5-pmu.h index c28ff21ca4d2..77c93551ee58 100644 --- a/include/linux/mfd/syscon/exynos5-pmu.h +++ b/include/linux/mfd/syscon/exynos5-pmu.h @@ -38,9 +38,6 @@ /* Exynos5433 specific register definitions */ #define EXYNOS5433_USBHOST30_PHY_CONTROL (0x728) -#define EXYNOS5433_MIPI_PHY0_CONTROL (0x710) -#define EXYNOS5433_MIPI_PHY1_CONTROL (0x714) -#define EXYNOS5433_MIPI_PHY2_CONTROL (0x718) #define EXYNOS5_PHY_ENABLE BIT(0) #define EXYNOS5_MIPI_PHY_S_RESETN BIT(1) diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h index 49df0a01a2cc..e57d75889a09 100644 --- a/include/linux/soc/samsung/exynos-regs-pmu.h +++ b/include/linux/soc/samsung/exynos-regs-pmu.h 
@@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2012 Samsung Electronics Co., Ltd. + * Copyright (c) 2010-2015 Samsung Electronics Co., Ltd. * http://www.samsung.com * * EXYNOS - Power management unit definition @@ -50,6 +50,13 @@ #define S5P_WAKEUP_MASK 0x0608 #define S5P_WAKEUP_MASK2 0x0614 +/* MIPI_PHYn_CONTROL, valid for Exynos3250, Exynos4, Exynos5250 and Exynos5433 */ +#define EXYNOS4_MIPI_PHY_CONTROL(n) (0x0710 + (n) * 4) +#define EXYNOS4_MIPI_PHY_ENABLE (1 << 0) +#define EXYNOS4_MIPI_PHY_SRESETN (1 << 1) +#define EXYNOS4_MIPI_PHY_MRESETN (1 << 2) +#define EXYNOS4_MIPI_PHY_RESET_MASK (3 << 1) + #define S5P_INFORM0 0x0800 #define S5P_INFORM1 0x0804 #define S5P_INFORM5 0x0814 -- cgit v1.2.3 From 424c9841480f1761285748b08aa85ac774a30db1 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 14 Mar 2017 18:46:50 +0200 Subject: phy: exynos5: Remove duplicated defines of PHY register defines Phy drivers access PMU region through regmap provided by exynos-pmu driver. However there is no need to duplicate defines for PMU registers. Instead just use whatever is defined in exynos-regs-pmu.h. This reduces number of defines. Suggested-by: Marek Szyprowski Signed-off-by: Krzysztof Kozlowski Acked-by: Lee Jones Reviewed-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kishon Vijay Abraham I --- drivers/phy/phy-exynos-dp-video.c | 1 + drivers/phy/phy-exynos5-usbdrd.c | 1 + include/linux/mfd/syscon/exynos5-pmu.h | 27 --------------------------- include/linux/soc/samsung/exynos-regs-pmu.h | 8 ++++++++ 4 files changed, 10 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/phy/phy-exynos-dp-video.c b/drivers/phy/phy-exynos-dp-video.c index 34b06154e5d9..d72193188980 100644 --- a/drivers/phy/phy-exynos-dp-video.c +++ b/drivers/phy/phy-exynos-dp-video.c @@ -20,6 +20,7 @@ #include #include #include +#include struct exynos_dp_video_phy_drvdata { u32 phy_ctrl_offset; diff --git a/drivers/phy/phy-exynos5-usbdrd.c b/drivers/phy/phy-exynos5-usbdrd.c index 07ed608905ac..7c896d0cda18 100644 --- a/drivers/phy/phy-exynos5-usbdrd.c +++ b/drivers/phy/phy-exynos5-usbdrd.c @@ -25,6 +25,7 @@ #include #include #include +#include /* Exynos USB PHY registers */ #define EXYNOS5_FSEL_9MHZ6 0x0 diff --git a/include/linux/mfd/syscon/exynos5-pmu.h b/include/linux/mfd/syscon/exynos5-pmu.h index 77c93551ee58..0a4ddabc395e 100644 --- a/include/linux/mfd/syscon/exynos5-pmu.h +++ b/include/linux/mfd/syscon/exynos5-pmu.h @@ -12,33 +12,6 @@ #ifndef _LINUX_MFD_SYSCON_PMU_EXYNOS5_H_ #define _LINUX_MFD_SYSCON_PMU_EXYNOS5_H_ -/* Exynos5 PMU register definitions */ -#define EXYNOS5_HDMI_PHY_CONTROL (0x700) -#define EXYNOS5_USBDRD_PHY_CONTROL (0x704) - -/* Exynos5250 specific register definitions */ -#define EXYNOS5_USBHOST_PHY_CONTROL (0x708) -#define EXYNOS5_EFNAND_PHY_CONTROL (0x70c) -#define EXYNOS5_MIPI_PHY0_CONTROL (0x710) -#define EXYNOS5_MIPI_PHY1_CONTROL (0x714) -#define EXYNOS5_ADC_PHY_CONTROL (0x718) -#define EXYNOS5_MTCADC_PHY_CONTROL (0x71c) -#define EXYNOS5_DPTX_PHY_CONTROL (0x720) -#define EXYNOS5_SATA_PHY_CONTROL (0x724) - -/* Exynos5420 specific register definitions */ -#define EXYNOS5420_USBDRD1_PHY_CONTROL (0x708) -#define EXYNOS5420_USBHOST_PHY_CONTROL (0x70c) -#define EXYNOS5420_MIPI_PHY0_CONTROL (0x714) -#define EXYNOS5420_MIPI_PHY1_CONTROL (0x718) -#define EXYNOS5420_MIPI_PHY2_CONTROL (0x71c) -#define EXYNOS5420_ADC_PHY_CONTROL (0x720) -#define EXYNOS5420_MTCADC_PHY_CONTROL (0x724) -#define EXYNOS5420_DPTX_PHY_CONTROL (0x728) - -/* Exynos5433 specific register definitions */ -#define 
EXYNOS5433_USBHOST30_PHY_CONTROL (0x728) - #define EXYNOS5_PHY_ENABLE BIT(0) #define EXYNOS5_MIPI_PHY_S_RESETN BIT(1) #define EXYNOS5_MIPI_PHY_M_RESETN BIT(2) diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h index e57d75889a09..4ee54b3fcd57 100644 --- a/include/linux/soc/samsung/exynos-regs-pmu.h +++ b/include/linux/soc/samsung/exynos-regs-pmu.h @@ -349,6 +349,8 @@ #define EXYNOS5_AUTO_WDTRESET_DISABLE 0x0408 #define EXYNOS5_MASK_WDTRESET_REQUEST 0x040C +#define EXYNOS5_USBDRD_PHY_CONTROL 0x0704 +#define EXYNOS5_DPTX_PHY_CONTROL 0x0720 #define EXYNOS5_USE_RETENTION BIT(4) #define EXYNOS5_SYS_WDTRESET (1 << 20) @@ -502,6 +504,11 @@ #define EXYNOS5420_KFC_CORE_RESET(_nr) \ ((EXYNOS5420_KFC_CORE_RESET0 | EXYNOS5420_KFC_ETM_RESET0) << (_nr)) +#define EXYNOS5420_USBDRD1_PHY_CONTROL 0x0708 +#define EXYNOS5420_MIPI_PHY0_CONTROL 0x0714 +#define EXYNOS5420_MIPI_PHY1_CONTROL 0x0718 +#define EXYNOS5420_MIPI_PHY2_CONTROL 0x071C +#define EXYNOS5420_DPTX_PHY_CONTROL 0x0728 #define EXYNOS5420_ARM_CORE2_SYS_PWR_REG 0x1020 #define EXYNOS5420_DIS_IRQ_ARM_CORE2_LOCAL_SYS_PWR_REG 0x1024 #define EXYNOS5420_DIS_IRQ_ARM_CORE2_CENTRAL_SYS_PWR_REG 0x1028 @@ -639,6 +646,7 @@ | EXYNOS5420_KFC_USE_STANDBY_WFI3) /* For EXYNOS5433 */ +#define EXYNOS5433_USBHOST30_PHY_CONTROL (0x0728) #define EXYNOS5433_PAD_RETENTION_AUD_OPTION (0x3028) #define EXYNOS5433_PAD_RETENTION_MMC2_OPTION (0x30C8) #define EXYNOS5433_PAD_RETENTION_TOP_OPTION (0x3108) -- cgit v1.2.3 From cf09ee599714e630ea610ff4c4fd8c71e2b1f616 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 14 Mar 2017 18:46:51 +0200 Subject: phy: exynos-mipi-video: Use consistent method to address phy registers Exynos4 MIPI phy registers are defined with macro calculating the offset for given phyN. Use the same method for Exynos5420 to be consistent. 
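To illustrate what the indexed addressing buys a caller, a minimal sketch of a hypothetical consumer (the function name and regmap handle are invented for illustration and are not part of the patch; the register and bit defines are the ones used by the descriptors below, and <linux/regmap.h> is assumed):

	/* hypothetical helper: power up MIPI PHY #id through the PMU regmap */
	static int example_mipi_phy_power_on(struct regmap *pmu, int id)
	{
		int ret;

		/* lift power isolation on the selected PHY instance */
		ret = regmap_update_bits(pmu, EXYNOS5420_MIPI_PHY_CONTROL(id),
					 EXYNOS5_PHY_ENABLE, EXYNOS5_PHY_ENABLE);
		if (ret)
			return ret;

		/* de-assert the DSIM-side (master) reset in the same register */
		return regmap_update_bits(pmu, EXYNOS5420_MIPI_PHY_CONTROL(id),
					  EXYNOS5_MIPI_PHY_M_RESETN,
					  EXYNOS5_MIPI_PHY_M_RESETN);
	}

The same index then selects CSIS0/DSIM0 (0), CSIS1/DSIM1 (1) or CSIS2 (2) without a per-instance register name.
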
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kishon Vijay Abraham I --- drivers/phy/phy-exynos-mipi-video.c | 20 ++++++++++---------- include/linux/soc/samsung/exynos-regs-pmu.h | 4 +--- 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/phy/phy-exynos-mipi-video.c b/drivers/phy/phy-exynos-mipi-video.c index d7fe1f8c3ac8..acef1d92691e 100644 --- a/drivers/phy/phy-exynos-mipi-video.c +++ b/drivers/phy/phy-exynos-mipi-video.c @@ -110,46 +110,46 @@ static const struct mipi_phy_device_desc exynos5420_mipi_phy = { /* EXYNOS_MIPI_PHY_ID_CSIS0 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_DSIM0, .enable_val = EXYNOS5_PHY_ENABLE, - .enable_reg = EXYNOS5420_MIPI_PHY0_CONTROL, + .enable_reg = EXYNOS5420_MIPI_PHY_CONTROL(0), .enable_map = EXYNOS_MIPI_REGMAP_PMU, .resetn_val = EXYNOS5_MIPI_PHY_S_RESETN, - .resetn_reg = EXYNOS5420_MIPI_PHY0_CONTROL, + .resetn_reg = EXYNOS5420_MIPI_PHY_CONTROL(0), .resetn_map = EXYNOS_MIPI_REGMAP_PMU, }, { /* EXYNOS_MIPI_PHY_ID_DSIM0 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_CSIS0, .enable_val = EXYNOS5_PHY_ENABLE, - .enable_reg = EXYNOS5420_MIPI_PHY0_CONTROL, + .enable_reg = EXYNOS5420_MIPI_PHY_CONTROL(0), .enable_map = EXYNOS_MIPI_REGMAP_PMU, .resetn_val = EXYNOS5_MIPI_PHY_M_RESETN, - .resetn_reg = EXYNOS5420_MIPI_PHY0_CONTROL, + .resetn_reg = EXYNOS5420_MIPI_PHY_CONTROL(0), .resetn_map = EXYNOS_MIPI_REGMAP_PMU, }, { /* EXYNOS_MIPI_PHY_ID_CSIS1 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_DSIM1, .enable_val = EXYNOS5_PHY_ENABLE, - .enable_reg = EXYNOS5420_MIPI_PHY1_CONTROL, + .enable_reg = EXYNOS5420_MIPI_PHY_CONTROL(1), .enable_map = EXYNOS_MIPI_REGMAP_PMU, .resetn_val = EXYNOS5_MIPI_PHY_S_RESETN, - .resetn_reg = EXYNOS5420_MIPI_PHY1_CONTROL, + .resetn_reg = EXYNOS5420_MIPI_PHY_CONTROL(1), .resetn_map = EXYNOS_MIPI_REGMAP_PMU, }, { /* EXYNOS_MIPI_PHY_ID_DSIM1 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_CSIS1, .enable_val = EXYNOS5_PHY_ENABLE, - .enable_reg = EXYNOS5420_MIPI_PHY1_CONTROL, + .enable_reg = EXYNOS5420_MIPI_PHY_CONTROL(1), .enable_map = EXYNOS_MIPI_REGMAP_PMU, .resetn_val = EXYNOS5_MIPI_PHY_M_RESETN, - .resetn_reg = EXYNOS5420_MIPI_PHY1_CONTROL, + .resetn_reg = EXYNOS5420_MIPI_PHY_CONTROL(1), .resetn_map = EXYNOS_MIPI_REGMAP_PMU, }, { /* EXYNOS_MIPI_PHY_ID_CSIS2 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_NONE, .enable_val = EXYNOS5_PHY_ENABLE, - .enable_reg = EXYNOS5420_MIPI_PHY2_CONTROL, + .enable_reg = EXYNOS5420_MIPI_PHY_CONTROL(2), .enable_map = EXYNOS_MIPI_REGMAP_PMU, .resetn_val = EXYNOS5_MIPI_PHY_S_RESETN, - .resetn_reg = EXYNOS5420_MIPI_PHY2_CONTROL, + .resetn_reg = EXYNOS5420_MIPI_PHY_CONTROL(2), .resetn_map = EXYNOS_MIPI_REGMAP_PMU, }, }, diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h index 4ee54b3fcd57..c261ed927e1e 100644 --- a/include/linux/soc/samsung/exynos-regs-pmu.h +++ b/include/linux/soc/samsung/exynos-regs-pmu.h @@ -505,9 +505,7 @@ ((EXYNOS5420_KFC_CORE_RESET0 | EXYNOS5420_KFC_ETM_RESET0) << (_nr)) #define EXYNOS5420_USBDRD1_PHY_CONTROL 0x0708 -#define EXYNOS5420_MIPI_PHY0_CONTROL 0x0714 -#define EXYNOS5420_MIPI_PHY1_CONTROL 0x0718 -#define EXYNOS5420_MIPI_PHY2_CONTROL 0x071C +#define EXYNOS5420_MIPI_PHY_CONTROL(n) (0x0714 + (n) * 4) #define EXYNOS5420_DPTX_PHY_CONTROL 0x0728 #define EXYNOS5420_ARM_CORE2_SYS_PWR_REG 0x1020 #define EXYNOS5420_DIS_IRQ_ARM_CORE2_LOCAL_SYS_PWR_REG 0x1024 -- cgit v1.2.3 From 7a66647b25b68c2a2da51bc9845fc91dd27529a9 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 29 
Mar 2017 13:59:57 +0530 Subject: phy: exynos: Use one define for enable bit There is no need for separate defines for Exynos4 and Exynos5 phy enable bit and MIPI phy reset bits. In both cases they are the same, so simplify it. This reduces number of defines and allows removal of one header file. Signed-off-by: Krzysztof Kozlowski Acked-by: Lee Jones Reviewed-by: Bartlomiej Zolnierkiewicz Signed-off-by: Kishon Vijay Abraham I --- drivers/phy/phy-exynos-dp-video.c | 5 ++-- drivers/phy/phy-exynos-mipi-video.c | 39 ++++++++++++++--------------- drivers/phy/phy-exynos5-usbdrd.c | 5 ++-- include/linux/soc/samsung/exynos-regs-pmu.h | 3 ++- 4 files changed, 25 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/phy/phy-exynos-dp-video.c b/drivers/phy/phy-exynos-dp-video.c index d72193188980..bb3279dbf88c 100644 --- a/drivers/phy/phy-exynos-dp-video.c +++ b/drivers/phy/phy-exynos-dp-video.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -37,7 +36,7 @@ static int exynos_dp_video_phy_power_on(struct phy *phy) /* Disable power isolation on DP-PHY */ return regmap_update_bits(state->regs, state->drvdata->phy_ctrl_offset, - EXYNOS5_PHY_ENABLE, EXYNOS5_PHY_ENABLE); + EXYNOS4_PHY_ENABLE, EXYNOS4_PHY_ENABLE); } static int exynos_dp_video_phy_power_off(struct phy *phy) @@ -46,7 +45,7 @@ static int exynos_dp_video_phy_power_off(struct phy *phy) /* Enable power isolation on DP-PHY */ return regmap_update_bits(state->regs, state->drvdata->phy_ctrl_offset, - EXYNOS5_PHY_ENABLE, 0); + EXYNOS4_PHY_ENABLE, 0); } static const struct phy_ops exynos_dp_video_phy_ops = { diff --git a/drivers/phy/phy-exynos-mipi-video.c b/drivers/phy/phy-exynos-mipi-video.c index acef1d92691e..c198886f80a3 100644 --- a/drivers/phy/phy-exynos-mipi-video.c +++ b/drivers/phy/phy-exynos-mipi-video.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -64,7 +63,7 @@ static const struct mipi_phy_device_desc s5pv210_mipi_phy = { { /* EXYNOS_MIPI_PHY_ID_CSIS0 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_DSIM0, - .enable_val = EXYNOS4_MIPI_PHY_ENABLE, + .enable_val = EXYNOS4_PHY_ENABLE, .enable_reg = EXYNOS4_MIPI_PHY_CONTROL(0), .enable_map = EXYNOS_MIPI_REGMAP_PMU, .resetn_val = EXYNOS4_MIPI_PHY_SRESETN, @@ -73,7 +72,7 @@ static const struct mipi_phy_device_desc s5pv210_mipi_phy = { }, { /* EXYNOS_MIPI_PHY_ID_DSIM0 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_CSIS0, - .enable_val = EXYNOS4_MIPI_PHY_ENABLE, + .enable_val = EXYNOS4_PHY_ENABLE, .enable_reg = EXYNOS4_MIPI_PHY_CONTROL(0), .enable_map = EXYNOS_MIPI_REGMAP_PMU, .resetn_val = EXYNOS4_MIPI_PHY_MRESETN, @@ -82,7 +81,7 @@ static const struct mipi_phy_device_desc s5pv210_mipi_phy = { }, { /* EXYNOS_MIPI_PHY_ID_CSIS1 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_DSIM1, - .enable_val = EXYNOS4_MIPI_PHY_ENABLE, + .enable_val = EXYNOS4_PHY_ENABLE, .enable_reg = EXYNOS4_MIPI_PHY_CONTROL(1), .enable_map = EXYNOS_MIPI_REGMAP_PMU, .resetn_val = EXYNOS4_MIPI_PHY_SRESETN, @@ -91,7 +90,7 @@ static const struct mipi_phy_device_desc s5pv210_mipi_phy = { }, { /* EXYNOS_MIPI_PHY_ID_DSIM1 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_CSIS1, - .enable_val = EXYNOS4_MIPI_PHY_ENABLE, + .enable_val = EXYNOS4_PHY_ENABLE, .enable_reg = EXYNOS4_MIPI_PHY_CONTROL(1), .enable_map = EXYNOS_MIPI_REGMAP_PMU, .resetn_val = EXYNOS4_MIPI_PHY_MRESETN, @@ -109,46 +108,46 @@ static const struct mipi_phy_device_desc exynos5420_mipi_phy = { { /* EXYNOS_MIPI_PHY_ID_CSIS0 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_DSIM0, - .enable_val =
EXYNOS5_PHY_ENABLE, + .enable_val = EXYNOS4_PHY_ENABLE, .enable_reg = EXYNOS5420_MIPI_PHY_CONTROL(0), .enable_map = EXYNOS_MIPI_REGMAP_PMU, - .resetn_val = EXYNOS5_MIPI_PHY_S_RESETN, + .resetn_val = EXYNOS4_MIPI_PHY_SRESETN, .resetn_reg = EXYNOS5420_MIPI_PHY_CONTROL(0), .resetn_map = EXYNOS_MIPI_REGMAP_PMU, }, { /* EXYNOS_MIPI_PHY_ID_DSIM0 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_CSIS0, - .enable_val = EXYNOS5_PHY_ENABLE, + .enable_val = EXYNOS4_PHY_ENABLE, .enable_reg = EXYNOS5420_MIPI_PHY_CONTROL(0), .enable_map = EXYNOS_MIPI_REGMAP_PMU, - .resetn_val = EXYNOS5_MIPI_PHY_M_RESETN, + .resetn_val = EXYNOS4_MIPI_PHY_MRESETN, .resetn_reg = EXYNOS5420_MIPI_PHY_CONTROL(0), .resetn_map = EXYNOS_MIPI_REGMAP_PMU, }, { /* EXYNOS_MIPI_PHY_ID_CSIS1 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_DSIM1, - .enable_val = EXYNOS5_PHY_ENABLE, + .enable_val = EXYNOS4_PHY_ENABLE, .enable_reg = EXYNOS5420_MIPI_PHY_CONTROL(1), .enable_map = EXYNOS_MIPI_REGMAP_PMU, - .resetn_val = EXYNOS5_MIPI_PHY_S_RESETN, + .resetn_val = EXYNOS4_MIPI_PHY_SRESETN, .resetn_reg = EXYNOS5420_MIPI_PHY_CONTROL(1), .resetn_map = EXYNOS_MIPI_REGMAP_PMU, }, { /* EXYNOS_MIPI_PHY_ID_DSIM1 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_CSIS1, - .enable_val = EXYNOS5_PHY_ENABLE, + .enable_val = EXYNOS4_PHY_ENABLE, .enable_reg = EXYNOS5420_MIPI_PHY_CONTROL(1), .enable_map = EXYNOS_MIPI_REGMAP_PMU, - .resetn_val = EXYNOS5_MIPI_PHY_M_RESETN, + .resetn_val = EXYNOS4_MIPI_PHY_MRESETN, .resetn_reg = EXYNOS5420_MIPI_PHY_CONTROL(1), .resetn_map = EXYNOS_MIPI_REGMAP_PMU, }, { /* EXYNOS_MIPI_PHY_ID_CSIS2 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_NONE, - .enable_val = EXYNOS5_PHY_ENABLE, + .enable_val = EXYNOS4_PHY_ENABLE, .enable_reg = EXYNOS5420_MIPI_PHY_CONTROL(2), .enable_map = EXYNOS_MIPI_REGMAP_PMU, - .resetn_val = EXYNOS5_MIPI_PHY_S_RESETN, + .resetn_val = EXYNOS4_MIPI_PHY_SRESETN, .resetn_reg = EXYNOS5420_MIPI_PHY_CONTROL(2), .resetn_map = EXYNOS_MIPI_REGMAP_PMU, }, @@ -172,7 +171,7 @@ static const struct mipi_phy_device_desc exynos5433_mipi_phy = { { /* EXYNOS_MIPI_PHY_ID_CSIS0 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_DSIM0, - .enable_val = EXYNOS5_PHY_ENABLE, + .enable_val = EXYNOS4_PHY_ENABLE, .enable_reg = EXYNOS4_MIPI_PHY_CONTROL(0), .enable_map = EXYNOS_MIPI_REGMAP_PMU, .resetn_val = BIT(0), @@ -181,7 +180,7 @@ static const struct mipi_phy_device_desc exynos5433_mipi_phy = { }, { /* EXYNOS_MIPI_PHY_ID_DSIM0 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_CSIS0, - .enable_val = EXYNOS5_PHY_ENABLE, + .enable_val = EXYNOS4_PHY_ENABLE, .enable_reg = EXYNOS4_MIPI_PHY_CONTROL(0), .enable_map = EXYNOS_MIPI_REGMAP_PMU, .resetn_val = BIT(0), @@ -190,7 +189,7 @@ static const struct mipi_phy_device_desc exynos5433_mipi_phy = { }, { /* EXYNOS_MIPI_PHY_ID_CSIS1 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_NONE, - .enable_val = EXYNOS5_PHY_ENABLE, + .enable_val = EXYNOS4_PHY_ENABLE, .enable_reg = EXYNOS4_MIPI_PHY_CONTROL(1), .enable_map = EXYNOS_MIPI_REGMAP_PMU, .resetn_val = BIT(1), @@ -199,7 +198,7 @@ static const struct mipi_phy_device_desc exynos5433_mipi_phy = { }, { /* EXYNOS_MIPI_PHY_ID_DSIM1 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_NONE, - .enable_val = EXYNOS5_PHY_ENABLE, + .enable_val = EXYNOS4_PHY_ENABLE, .enable_reg = EXYNOS4_MIPI_PHY_CONTROL(1), .enable_map = EXYNOS_MIPI_REGMAP_PMU, .resetn_val = BIT(1), @@ -208,7 +207,7 @@ static const struct mipi_phy_device_desc exynos5433_mipi_phy = { }, { /* EXYNOS_MIPI_PHY_ID_CSIS2 */ .coupled_phy_id = EXYNOS_MIPI_PHY_ID_NONE, - .enable_val = EXYNOS5_PHY_ENABLE, + .enable_val = EXYNOS4_PHY_ENABLE, .enable_reg = 
EXYNOS4_MIPI_PHY_CONTROL(2), .enable_map = EXYNOS_MIPI_REGMAP_PMU, .resetn_val = BIT(0), diff --git a/drivers/phy/phy-exynos5-usbdrd.c b/drivers/phy/phy-exynos5-usbdrd.c index 7c896d0cda18..7c41daa2c625 100644 --- a/drivers/phy/phy-exynos5-usbdrd.c +++ b/drivers/phy/phy-exynos5-usbdrd.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -236,10 +235,10 @@ static void exynos5_usbdrd_phy_isol(struct phy_usb_instance *inst, if (!inst->reg_pmu) return; - val = on ? 0 : EXYNOS5_PHY_ENABLE; + val = on ? 0 : EXYNOS4_PHY_ENABLE; regmap_update_bits(inst->reg_pmu, inst->pmu_offset, - EXYNOS5_PHY_ENABLE, val); + EXYNOS4_PHY_ENABLE, val); } /* diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h index c261ed927e1e..bebdde5dccd6 100644 --- a/include/linux/soc/samsung/exynos-regs-pmu.h +++ b/include/linux/soc/samsung/exynos-regs-pmu.h @@ -52,7 +52,8 @@ /* MIPI_PHYn_CONTROL, valid for Exynos3250, Exynos4, Exynos5250 and Exynos5433 */ #define EXYNOS4_MIPI_PHY_CONTROL(n) (0x0710 + (n) * 4) -#define EXYNOS4_MIPI_PHY_ENABLE (1 << 0) +/* Phy enable bit, common for all phy registers, not only MIPI */ +#define EXYNOS4_PHY_ENABLE (1 << 0) #define EXYNOS4_MIPI_PHY_SRESETN (1 << 1) #define EXYNOS4_MIPI_PHY_MRESETN (1 << 2) #define EXYNOS4_MIPI_PHY_RESET_MASK (3 << 1) -- cgit v1.2.3 From f437a3f477cce402dbec6537b29e9e33962c9f73 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 6 Apr 2017 16:16:11 +0800 Subject: crypto: api - Extend algorithm name limit to 128 bytes With the new explicit IV generators, we may now exceed the 64-byte length limit on the algorithm name, e.g., with echainiv(authencesn(hmac(sha256-generic),cbc(des3_ede-generic))) This patch extends the length limit to 128 bytes. Reported-by: Alexander Sverdlin Signed-off-by: Herbert Xu Acked-by: Alexander Sverdlin Tested-by: Alexander Sverdlin --- include/linux/crypto.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/crypto.h b/include/linux/crypto.h index c0b0cf3d2d2f..84da9978e951 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -123,7 +123,7 @@ /* * Miscellaneous stuff. */ -#define CRYPTO_MAX_ALG_NAME 64 +#define CRYPTO_MAX_ALG_NAME 128 /* * The macro CRYPTO_MINALIGN_ATTR (along with the void * type in the actual -- cgit v1.2.3 From db47d5f856467ce0dd3af7e20a33df3d901266df Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 25 Jan 2017 20:30:29 +0100 Subject: x86/nmi, EDAC: Get rid of DRAM error reporting thru PCI SERR NMI Apparently, some machines used to report DRAM errors through a PCI SERR NMI. This is why we have a call into EDAC in the NMI handler. See c0d121720220 ("drivers/edac: add new nmi rescan"). From looking at the patch above, that's two drivers: e752x_edac.c and e7xxx_edac.c. Now, I wanna say those are old machines which are probably decommissioned already. Tony says that "[t]he newest CPU supported by either of those drivers is the Xeon E7520 (a.k.a. "Nehalem") released in Q1'2010. Possibly some folks are still using these ... but people that hold onto h/w for 7 years generally cling to old s/w too ... so I'd guess it unlikely that we will get complaints for breaking these in upstream."
So even if there is a small number still in use, we did load EDAC with edac_op_state == EDAC_OPSTATE_POLL by default (we still do, in fact) which means a default EDAC setup without any parameters supplied on the command line or otherwise would never even log the error in the NMI handler because we're polling by default: inline int edac_handler_set(void) { if (edac_op_state == EDAC_OPSTATE_POLL) return 0; return atomic_read(&edac_handlers); } So, long story short, I'd like to get rid of that nastiness called edac_stub.c and confine all the EDAC drivers solely to drivers/edac/. If we ever have to do stuff like that again, it should be notifiers we're using and not some insanity like this one. Signed-off-by: Borislav Petkov Acked-by: Thomas Gleixner Cc: Tony Luck --- arch/x86/kernel/nmi.c | 11 ----------- drivers/edac/edac_stub.c | 22 ---------------------- include/linux/edac.h | 2 -- 3 files changed, 35 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index f088ea4c66e7..f0c4c890f71b 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -224,17 +224,6 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", reason, smp_processor_id()); - /* - * On some machines, PCI SERR line is used to report memory - * errors. EDAC makes use of it. - */ -#if defined(CONFIG_EDAC) - if (edac_handler_set()) { - edac_atomic_assert_error(); - return; - } -#endif - if (panic_on_unrecovered_nmi) nmi_panic(regs, "NMI: Not continuing"); diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c index 952e411f01f2..d1389e4b7989 100644 --- a/drivers/edac/edac_stub.c +++ b/drivers/edac/edac_stub.c @@ -44,25 +44,3 @@ static int __init edac_report_setup(char *str) return 0; } __setup("edac_report=", edac_report_setup); - -/* - * called to determine if there is an EDAC driver interested in - * knowing an event (such as NMI) occurred - */ -int edac_handler_set(void) -{ - if (edac_op_state == EDAC_OPSTATE_POLL) - return 0; - - return atomic_read(&edac_handlers); -} -EXPORT_SYMBOL_GPL(edac_handler_set); - -/* - * handler for NMI type of interrupts to assert error - */ -void edac_atomic_assert_error(void) -{ - edac_err_assert++; -} -EXPORT_SYMBOL_GPL(edac_atomic_assert_error); diff --git a/include/linux/edac.h b/include/linux/edac.h index 5b6adf964248..bf8daabf3d51 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -31,8 +31,6 @@ extern int edac_op_state; extern int edac_err_assert; extern atomic_t edac_handlers; -extern int edac_handler_set(void); -extern void edac_atomic_assert_error(void); extern struct bus_type *edac_get_sysfs_subsys(void); enum { -- cgit v1.2.3 From 97bb6c17ad5a0892beb45070dfe8c7d6d0e5326e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 26 Jan 2017 16:49:59 +0100 Subject: EDAC: Get rid of edac_handlers Use mc_devices list instead to check whether we have EDAC driver instances successfully registered with EDAC core. 
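Condensed from the edac_mc.c hunks below (an editor's sketch with comments added; the error paths and surrounding context are elided), the global list itself becomes the source of truth:

	static int del_mc_from_global_list(struct mem_ctl_info *mci)
	{
		list_del_rcu(&mci->link);

		/* wait for readers still walking mc_devices under RCU */
		synchronize_rcu();
		INIT_LIST_HEAD(&mci->link);

		/* an empty list means no EDAC driver instances are left */
		return list_empty(&mc_devices);
	}

Note the inverted sense at the call site: edac_mc_del_mc() now clears edac_mc_owner when the function returns true (list empty), where it previously keyed off the decremented handler count reaching zero.
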
Signed-off-by: Borislav Petkov --- drivers/edac/edac_mc.c | 6 ++---- drivers/edac/edac_stub.c | 3 --- include/linux/edac.h | 1 - 3 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index e5573c56b15e..824d31193b69 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -601,7 +601,6 @@ static int add_mc_to_global_list(struct mem_ctl_info *mci) } list_add_tail_rcu(&mci->link, insert_before); - atomic_inc(&edac_handlers); return 0; fail0: @@ -619,7 +618,6 @@ fail1: static int del_mc_from_global_list(struct mem_ctl_info *mci) { - int handlers = atomic_dec_return(&edac_handlers); list_del_rcu(&mci->link); /* these are for safe removal of devices from global list while @@ -628,7 +626,7 @@ static int del_mc_from_global_list(struct mem_ctl_info *mci) synchronize_rcu(); INIT_LIST_HEAD(&mci->link); - return handlers; + return list_empty(&mc_devices); } struct mem_ctl_info *edac_mc_find(int idx) @@ -763,7 +761,7 @@ struct mem_ctl_info *edac_mc_del_mc(struct device *dev) /* mark MCI offline: */ mci->op_state = OP_OFFLINE; - if (!del_mc_from_global_list(mci)) + if (del_mc_from_global_list(mci)) edac_mc_owner = NULL; mutex_unlock(&mem_ctls_mutex); diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c index d1389e4b7989..cfb250fa38ce 100644 --- a/drivers/edac/edac_stub.c +++ b/drivers/edac/edac_stub.c @@ -20,9 +20,6 @@ int edac_op_state = EDAC_OPSTATE_INVAL; EXPORT_SYMBOL_GPL(edac_op_state); -atomic_t edac_handlers = ATOMIC_INIT(0); -EXPORT_SYMBOL_GPL(edac_handlers); - int edac_err_assert = 0; EXPORT_SYMBOL_GPL(edac_err_assert); diff --git a/include/linux/edac.h b/include/linux/edac.h index bf8daabf3d51..9fd6fe53ab2a 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -29,7 +29,6 @@ struct device; extern int edac_op_state; extern int edac_err_assert; -extern atomic_t edac_handlers; extern struct bus_type *edac_get_sysfs_subsys(void); -- cgit v1.2.3 From d3116a0837261405e0febb8043fe7040c8ebccb4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 26 Jan 2017 18:25:11 +0100 Subject: EDAC: Remove edac_err_assert ... and the glue around it. It is not needed anymore. 
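With the assert flag gone, the poll worker's gate collapses to a direct test of the operational state, condensed from the edac_mc.c hunk below (comment added):

	/* in edac_mc_workq_function(): run the driver's checker only in poll mode */
	if (edac_op_state == EDAC_OPSTATE_POLL)
		mci->edac_check(mci);
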
Signed-off-by: Borislav Petkov --- drivers/edac/edac_mc.c | 18 +----------------- drivers/edac/edac_stub.c | 3 --- include/linux/edac.h | 1 - 3 files changed, 1 insertion(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 824d31193b69..482b6aea1ce7 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -504,22 +504,6 @@ struct mem_ctl_info *find_mci_by_dev(struct device *dev) } EXPORT_SYMBOL_GPL(find_mci_by_dev); -/* - * handler for EDAC to check if NMI type handler has asserted interrupt - */ -static int edac_mc_assert_error_check_and_clear(void) -{ - int old_state; - - if (edac_op_state == EDAC_OPSTATE_POLL) - return 1; - - old_state = edac_err_assert; - edac_err_assert = 0; - - return old_state; -} - /* * edac_mc_workq_function * performs the operation scheduled by a workq request @@ -536,7 +520,7 @@ static void edac_mc_workq_function(struct work_struct *work_req) return; } - if (edac_mc_assert_error_check_and_clear()) + if (edac_op_state == EDAC_OPSTATE_POLL) mci->edac_check(mci); mutex_unlock(&mem_ctls_mutex); diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c index cfb250fa38ce..f02d21d8130f 100644 --- a/drivers/edac/edac_stub.c +++ b/drivers/edac/edac_stub.c @@ -20,9 +20,6 @@ int edac_op_state = EDAC_OPSTATE_INVAL; EXPORT_SYMBOL_GPL(edac_op_state); -int edac_err_assert = 0; -EXPORT_SYMBOL_GPL(edac_err_assert); - int edac_report_status = EDAC_REPORTING_ENABLED; EXPORT_SYMBOL_GPL(edac_report_status); diff --git a/include/linux/edac.h b/include/linux/edac.h index 9fd6fe53ab2a..c55e93975079 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -28,7 +28,6 @@ struct device; #define EDAC_OPSTATE_INT 2 extern int edac_op_state; -extern int edac_err_assert; extern struct bus_type *edac_get_sysfs_subsys(void); -- cgit v1.2.3 From fee27d7d97886515a60cce38b4152b7f5b5a21fc Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 4 Feb 2017 17:42:03 +0100 Subject: EDAC: Delete edac_stub.c Move the remaining functionality to edac_mc.c. Convert "edac_report=" to a module parameter. Signed-off-by: Borislav Petkov --- drivers/edac/Makefile | 2 +- drivers/edac/edac_mc.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++ drivers/edac/edac_stub.c | 37 ----------------------------- include/linux/edac.h | 26 +++------------------ 4 files changed, 65 insertions(+), 61 deletions(-) delete mode 100644 drivers/edac/edac_stub.c (limited to 'include/linux') diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index a8fb734cb28d..0fd9ffa63299 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -6,7 +6,7 @@ # GNU General Public License. 
# -obj-$(CONFIG_EDAC) := edac_stub.o edac_core.o +obj-$(CONFIG_EDAC) := edac_core.o edac_core-y := edac_mc.o edac_device.o edac_mc_sysfs.o edac_core-y += edac_module.o edac_device_sysfs.o wq.o diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 735546ea6ebe..536b65aa6fac 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -43,6 +43,8 @@ int edac_op_state = EDAC_OPSTATE_INVAL; EXPORT_SYMBOL_GPL(edac_op_state); +static int edac_report = EDAC_REPORTING_ENABLED; + /* lock to memory controller's control array */ static DEFINE_MUTEX(mem_ctls_mutex); static LIST_HEAD(mc_devices); @@ -55,6 +57,65 @@ static void const *edac_mc_owner; static struct bus_type mc_bus[EDAC_MAX_MCS]; +int get_edac_report_status(void) +{ + return edac_report; +} +EXPORT_SYMBOL_GPL(get_edac_report_status); + +void set_edac_report_status(int new) +{ + if (new == EDAC_REPORTING_ENABLED || + new == EDAC_REPORTING_DISABLED || + new == EDAC_REPORTING_FORCE) + edac_report = new; +} +EXPORT_SYMBOL_GPL(set_edac_report_status); + +static int edac_report_set(const char *str, const struct kernel_param *kp) +{ + if (!str) + return -EINVAL; + + if (!strncmp(str, "on", 2)) + edac_report = EDAC_REPORTING_ENABLED; + else if (!strncmp(str, "off", 3)) + edac_report = EDAC_REPORTING_DISABLED; + else if (!strncmp(str, "force", 5)) + edac_report = EDAC_REPORTING_FORCE; + + return 0; +} + +static int edac_report_get(char *buffer, const struct kernel_param *kp) +{ + int ret = 0; + + switch (edac_report) { + case EDAC_REPORTING_ENABLED: + ret = sprintf(buffer, "on"); + break; + case EDAC_REPORTING_DISABLED: + ret = sprintf(buffer, "off"); + break; + case EDAC_REPORTING_FORCE: + ret = sprintf(buffer, "force"); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static const struct kernel_param_ops edac_report_ops = { + .set = edac_report_set, + .get = edac_report_get, +}; + +module_param_cb(edac_report, &edac_report_ops, &edac_report, 0644); + unsigned edac_dimm_info_location(struct dimm_info *dimm, char *buf, unsigned len) { diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c deleted file mode 100644 index 6aacc569401e..000000000000 --- a/drivers/edac/edac_stub.c +++ /dev/null @@ -1,37 +0,0 @@ -/* - * common EDAC components that must be in kernel - * - * Author: Dave Jiang - * - * 2007 (c) MontaVista Software, Inc. - * 2010 (c) Advanced Micro Devices Inc. - * Borislav Petkov - * - * This file is licensed under the terms of the GNU General Public - * License version 2. This program is licensed "as is" without any - * warranty of any kind, whether express or implied. 
- * - */ -#include -#include -#include -#include - -int edac_report_status = EDAC_REPORTING_ENABLED; -EXPORT_SYMBOL_GPL(edac_report_status); - -static int __init __maybe_unused edac_report_setup(char *str) -{ - if (!str) - return -EINVAL; - - if (!strncmp(str, "on", 2)) - set_edac_report_status(EDAC_REPORTING_ENABLED); - else if (!strncmp(str, "off", 3)) - set_edac_report_status(EDAC_REPORTING_DISABLED); - else if (!strncmp(str, "force", 5)) - set_edac_report_status(EDAC_REPORTING_FORCE); - - return 0; -} -__setup("edac_report=", edac_report_setup); diff --git a/include/linux/edac.h b/include/linux/edac.h index c55e93975079..faf87e1eca21 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -29,7 +29,9 @@ struct device; extern int edac_op_state; -extern struct bus_type *edac_get_sysfs_subsys(void); +struct bus_type *edac_get_sysfs_subsys(void); +int get_edac_report_status(void); +void set_edac_report_status(int new); enum { EDAC_REPORTING_ENABLED, @@ -37,28 +39,6 @@ enum { EDAC_REPORTING_FORCE }; -extern int edac_report_status; -#ifdef CONFIG_EDAC -static inline int get_edac_report_status(void) -{ - return edac_report_status; -} - -static inline void set_edac_report_status(int new) -{ - edac_report_status = new; -} -#else -static inline int get_edac_report_status(void) -{ - return EDAC_REPORTING_DISABLED; -} - -static inline void set_edac_report_status(int new) -{ -} -#endif - static inline void opstate_init(void) { switch (edac_op_state) { -- cgit v1.2.3 From bffc7dece92edd0b6445b76a378e2fa9e324c7ed Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 4 Feb 2017 18:10:14 +0100 Subject: EDAC: Rename report status accessors Change them to have the edac_ prefix. No functionality change. Signed-off-by: Borislav Petkov --- drivers/acpi/acpi_extlog.c | 8 ++++---- drivers/edac/edac_mc.c | 8 ++++---- drivers/edac/pnd2_edac.c | 2 +- drivers/edac/sb_edac.c | 4 ++-- drivers/edac/skx_edac.c | 2 +- include/linux/edac.h | 4 ++-- 6 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c index a15270a806fc..502ea4dc2080 100644 --- a/drivers/acpi/acpi_extlog.c +++ b/drivers/acpi/acpi_extlog.c @@ -229,7 +229,7 @@ static int __init extlog_init(void) if (!(cap & MCG_ELOG_P) || !extlog_get_l1addr()) return -ENODEV; - if (get_edac_report_status() == EDAC_REPORTING_FORCE) { + if (edac_get_report_status() == EDAC_REPORTING_FORCE) { pr_warn("Not loading eMCA, error reporting force-enabled through EDAC.\n"); return -EPERM; } @@ -285,8 +285,8 @@ static int __init extlog_init(void) * eMCA event report method has higher priority than EDAC method, * unless EDAC event report method is mandatory. 
*/ - old_edac_report_status = get_edac_report_status(); - set_edac_report_status(EDAC_REPORTING_DISABLED); + old_edac_report_status = edac_get_report_status(); + edac_set_report_status(EDAC_REPORTING_DISABLED); mce_register_decode_chain(&extlog_mce_dec); /* enable OS to be involved to take over management from BIOS */ ((struct extlog_l1_head *)extlog_l1_addr)->flags |= FLAG_OS_OPTIN; @@ -308,7 +308,7 @@ err: static void __exit extlog_exit(void) { - set_edac_report_status(old_edac_report_status); + edac_set_report_status(old_edac_report_status); mce_unregister_decode_chain(&extlog_mce_dec); ((struct extlog_l1_head *)extlog_l1_addr)->flags &= ~FLAG_OS_OPTIN; if (extlog_l1_addr) diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 536b65aa6fac..480072139b7a 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -57,20 +57,20 @@ static void const *edac_mc_owner; static struct bus_type mc_bus[EDAC_MAX_MCS]; -int get_edac_report_status(void) +int edac_get_report_status(void) { return edac_report; } -EXPORT_SYMBOL_GPL(get_edac_report_status); +EXPORT_SYMBOL_GPL(edac_get_report_status); -void set_edac_report_status(int new) +void edac_set_report_status(int new) { if (new == EDAC_REPORTING_ENABLED || new == EDAC_REPORTING_DISABLED || new == EDAC_REPORTING_FORCE) edac_report = new; } -EXPORT_SYMBOL_GPL(set_edac_report_status); +EXPORT_SYMBOL_GPL(edac_set_report_status); static int edac_report_set(const char *str, const struct kernel_param *kp) { diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c index 928e0dba41fc..1cad5a9af8d0 100644 --- a/drivers/edac/pnd2_edac.c +++ b/drivers/edac/pnd2_edac.c @@ -1349,7 +1349,7 @@ static int pnd2_mce_check_error(struct notifier_block *nb, unsigned long val, vo struct dram_addr daddr; char *type; - if (get_edac_report_status() == EDAC_REPORTING_DISABLED) + if (edac_get_report_status() == EDAC_REPORTING_DISABLED) return NOTIFY_DONE; mci = pnd2_mci; diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index a65ea44e3b0b..ea21cb651b3c 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -3075,7 +3075,7 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val, struct sbridge_pvt *pvt; char *type; - if (get_edac_report_status() == EDAC_REPORTING_DISABLED) + if (edac_get_report_status() == EDAC_REPORTING_DISABLED) return NOTIFY_DONE; mci = get_mci_for_node_id(mce->socketid); @@ -3441,7 +3441,7 @@ static int __init sbridge_init(void) if (rc >= 0) { mce_register_decode_chain(&sbridge_mce_dec); - if (get_edac_report_status() == EDAC_REPORTING_DISABLED) + if (edac_get_report_status() == EDAC_REPORTING_DISABLED) sbridge_printk(KERN_WARNING, "Loading driver, error reporting disabled.\n"); return 0; } diff --git a/drivers/edac/skx_edac.c b/drivers/edac/skx_edac.c index 1159dba4671f..64bef6c9cfb4 100644 --- a/drivers/edac/skx_edac.c +++ b/drivers/edac/skx_edac.c @@ -971,7 +971,7 @@ static int skx_mce_check_error(struct notifier_block *nb, unsigned long val, struct mem_ctl_info *mci; char *type; - if (get_edac_report_status() == EDAC_REPORTING_DISABLED) + if (edac_get_report_status() == EDAC_REPORTING_DISABLED) return NOTIFY_DONE; /* ignore unless this is memory related with an address */ diff --git a/include/linux/edac.h b/include/linux/edac.h index faf87e1eca21..8ae0f45fafd6 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -30,8 +30,8 @@ struct device; extern int edac_op_state; struct bus_type *edac_get_sysfs_subsys(void); -int get_edac_report_status(void); -void 
set_edac_report_status(int new); +int edac_get_report_status(void); +void edac_set_report_status(int new); enum { EDAC_REPORTING_ENABLED, -- cgit v1.2.3 From 9dd813c15b2c101168808d4f5941a29985758973 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 14 Mar 2017 12:31:02 +0100 Subject: fsnotify: Move mark list head from object into dedicated structure Currently notification marks are attached to object (inode or vfsmnt) by a hlist_head in the object. The list is also protected by a spinlock in the object. So while there is any mark attached to the list of marks, the object must be pinned in memory (and thus e.g. last iput() deleting inode cannot happen). Also for list iteration in fsnotify() to work, we must hold fsnotify_mark_srcu lock so that mark itself and mark->obj_list.next cannot get freed. Thus we are required to wait for response to fanotify events from userspace process with fsnotify_mark_srcu lock held. That causes issues when userspace process is buggy and does not reply to some event - basically the whole notification subsystem gets eventually stuck. So to be able to drop fsnotify_mark_srcu lock while waiting for response, we have to pin the mark in memory and make sure it stays in the object list (as removing the mark waiting for response could lead to lost notification events for groups later in the list). However we don't want inode reclaim to block on such mark as that would lead to system just locking up elsewhere. This commit is the first in the series that paves way towards solving these conflicting lifetime needs. Instead of anchoring the list of marks directly in the object, we anchor it in a dedicated structure (fsnotify_mark_connector) and just point to that structure from the object. The following commits will also add spinlock protecting the list and object pointer to the structure. 
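The shape of the change, as a simplified sketch (inode_before and inode_after are illustrative stand-ins for struct inode, trimmed to the one relevant field; the connector field name is the one the patch introduces):

    #include <linux/list.h>

    /* Before: the object embeds the list head, so the object must stay
     * alive for as long as any mark hangs off it. */
    struct inode_before {
        struct hlist_head i_fsnotify_marks;
    };

    /* After: the object merely points at a separately allocated
     * connector that anchors the marks, paving the way for decoupling
     * the two lifetimes. */
    struct fsnotify_mark_connector {
        struct hlist_head list;
    };

    struct inode_after {
        struct fsnotify_mark_connector *i_fsnotify_marks;
    };

Readers then fetch the connector with lockless_dereference() (paired with the smp_wmb() issued when the connector is attached) before walking the list under SRCU, as the fsnotify() hunks below show.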
Reviewed-by: Miklos Szeredi Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/inode.c | 6 +-- fs/mount.h | 2 +- fs/namespace.c | 6 +-- fs/notify/fsnotify.c | 32 +++++++++++----- fs/notify/fsnotify.h | 16 ++++---- fs/notify/inode_mark.c | 8 ++-- fs/notify/mark.c | 80 +++++++++++++++++++++++++++++++++------- fs/notify/vfsmount_mark.c | 8 ++-- include/linux/fs.h | 4 +- include/linux/fsnotify_backend.h | 10 +++++ kernel/auditsc.c | 7 +++- 11 files changed, 132 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/fs/inode.c b/fs/inode.c index 88110fd0b282..750e952d2918 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -234,6 +234,9 @@ void __destroy_inode(struct inode *inode) inode_detach_wb(inode); security_inode_free(inode); fsnotify_inode_delete(inode); +#ifdef CONFIG_FSNOTIFY + fsnotify_connector_free(&inode->i_fsnotify_marks); +#endif locks_free_lock_context(inode); if (!inode->i_nlink) { WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); @@ -371,9 +374,6 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_lru); address_space_init_once(&inode->i_data); i_size_ordered_init(inode); -#ifdef CONFIG_FSNOTIFY - INIT_HLIST_HEAD(&inode->i_fsnotify_marks); -#endif } EXPORT_SYMBOL(inode_init_once); diff --git a/fs/mount.h b/fs/mount.h index 2826543a131d..bc409360a03b 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -59,7 +59,7 @@ struct mount { struct mountpoint *mnt_mp; /* where is it mounted */ struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ #ifdef CONFIG_FSNOTIFY - struct hlist_head mnt_fsnotify_marks; + struct fsnotify_mark_connector *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; #endif int mnt_id; /* mount identifier */ diff --git a/fs/namespace.c b/fs/namespace.c index cc1375eff88c..2625e1d97a3a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -236,9 +236,6 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); -#ifdef CONFIG_FSNOTIFY - INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); -#endif init_fs_pin(&mnt->mnt_umount, drop_mountpoint); } return mnt; @@ -1111,6 +1108,9 @@ static void cleanup_mnt(struct mount *mnt) if (unlikely(mnt->mnt_pins.first)) mnt_pin_kill(mnt); fsnotify_vfsmount_delete(&mnt->mnt); +#ifdef CONFIG_FSNOTIFY + fsnotify_connector_free(&mnt->mnt_fsnotify_marks); +#endif dput(mnt->mnt.mnt_root); deactivate_super(mnt->mnt.mnt_sb); mnt_free_id(mnt); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index b41515d3f081..eae621a18ac9 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -193,6 +193,7 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; struct fsnotify_group *inode_group, *vfsmount_group; + struct fsnotify_mark_connector *inode_conn, *vfsmount_conn; struct mount *mnt; int idx, ret = 0; /* global tests shouldn't care about events on child only the specific event */ @@ -210,8 +211,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, * SRCU because we have no references to any objects and do not * need SRCU to keep them "alive". 
*/ - if (hlist_empty(&to_tell->i_fsnotify_marks) && - (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks))) + if (!to_tell->i_fsnotify_marks && + (!mnt || !mnt->mnt_fsnotify_marks)) return 0; /* * if this is a modify event we may need to clear the ignored masks @@ -226,16 +227,24 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, idx = srcu_read_lock(&fsnotify_mark_srcu); if ((mask & FS_MODIFY) || - (test_mask & to_tell->i_fsnotify_mask)) - inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, - &fsnotify_mark_srcu); + (test_mask & to_tell->i_fsnotify_mask)) { + inode_conn = lockless_dereference(to_tell->i_fsnotify_marks); + if (inode_conn) + inode_node = srcu_dereference(inode_conn->list.first, + &fsnotify_mark_srcu); + } if (mnt && ((mask & FS_MODIFY) || (test_mask & mnt->mnt_fsnotify_mask))) { - vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first, - &fsnotify_mark_srcu); - inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, - &fsnotify_mark_srcu); + inode_conn = lockless_dereference(to_tell->i_fsnotify_marks); + if (inode_conn) + inode_node = srcu_dereference(inode_conn->list.first, + &fsnotify_mark_srcu); + vfsmount_conn = lockless_dereference(mnt->mnt_fsnotify_marks); + if (vfsmount_conn) + vfsmount_node = srcu_dereference( + vfsmount_conn->list.first, + &fsnotify_mark_srcu); } /* @@ -293,6 +302,8 @@ out: } EXPORT_SYMBOL_GPL(fsnotify); +extern struct kmem_cache *fsnotify_mark_connector_cachep; + static __init int fsnotify_init(void) { int ret; @@ -303,6 +314,9 @@ static __init int fsnotify_init(void) if (ret) panic("initializing fsnotify_mark_srcu"); + fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector, + SLAB_PANIC); + return 0; } core_initcall(fsnotify_init); diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 0a3bc2cf192c..eb64c59c9ad1 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -15,7 +15,7 @@ extern void fsnotify_flush_notify(struct fsnotify_group *group); extern struct srcu_struct fsnotify_mark_srcu; /* Calculate mask of events for a list of marks */ -extern u32 fsnotify_recalc_mask(struct hlist_head *head); +extern u32 fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); /* compare two groups for sorting of marks lists */ extern int fsnotify_compare_groups(struct fsnotify_group *a, @@ -24,7 +24,7 @@ extern int fsnotify_compare_groups(struct fsnotify_group *a, extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, __u32 mask); /* Add mark to a proper place in mark list */ -extern int fsnotify_add_mark_list(struct hlist_head *head, +extern int fsnotify_add_mark_list(struct fsnotify_mark_connector **connp, struct fsnotify_mark *mark, int allow_dups); /* add a mark to an inode */ @@ -41,19 +41,21 @@ extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); /* inode specific destruction of a mark */ extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); /* Find mark belonging to given group in the list of marks */ -extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, - struct fsnotify_group *group); +extern struct fsnotify_mark *fsnotify_find_mark( + struct fsnotify_mark_connector *conn, + struct fsnotify_group *group); /* Destroy all marks in the given list protected by 'lock' */ -extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock); +extern void fsnotify_destroy_marks(struct fsnotify_mark_connector *conn, + spinlock_t *lock); /* run the list of all marks associated with inode 
and destroy them */ static inline void fsnotify_clear_marks_by_inode(struct inode *inode) { - fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock); + fsnotify_destroy_marks(inode->i_fsnotify_marks, &inode->i_lock); } /* run the list of all marks associated with vfsmount and destroy them */ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { - fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks, + fsnotify_destroy_marks(real_mount(mnt)->mnt_fsnotify_marks, &mnt->mnt_root->d_lock); } /* prepare for freeing all marks associated with given group */ diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index a3645249f7ec..e8c6b822ff8d 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -37,7 +37,7 @@ void fsnotify_recalc_inode_mask(struct inode *inode) { spin_lock(&inode->i_lock); - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); + inode->i_fsnotify_mask = fsnotify_recalc_mask(inode->i_fsnotify_marks); spin_unlock(&inode->i_lock); __fsnotify_update_child_dentry_flags(inode); @@ -60,7 +60,7 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) * hold the inode->i_lock, so this is the perfect time to update the * inode->i_fsnotify_mask */ - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); + inode->i_fsnotify_mask = fsnotify_recalc_mask(inode->i_fsnotify_marks); spin_unlock(&inode->i_lock); } @@ -82,7 +82,7 @@ struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, struct fsnotify_mark *mark; spin_lock(&inode->i_lock); - mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); + mark = fsnotify_find_mark(inode->i_fsnotify_marks, group); spin_unlock(&inode->i_lock); return mark; @@ -135,7 +135,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, mark->inode = inode; ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark, allow_dups); - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); + inode->i_fsnotify_mask = fsnotify_recalc_mask(inode->i_fsnotify_marks); spin_unlock(&inode->i_lock); return ret; diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 44836e539169..24b6191bd6c6 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -83,6 +83,8 @@ #define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */ struct srcu_struct fsnotify_mark_srcu; +struct kmem_cache *fsnotify_mark_connector_cachep; + static DEFINE_SPINLOCK(destroy_lock); static LIST_HEAD(destroy_list); @@ -104,12 +106,15 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) } /* Calculate mask of events for a list of marks */ -u32 fsnotify_recalc_mask(struct hlist_head *head) +u32 fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { u32 new_mask = 0; struct fsnotify_mark *mark; - hlist_for_each_entry(mark, head, obj_list) + if (!conn) + return 0; + + hlist_for_each_entry(mark, &conn->list, obj_list) new_mask |= mark->mask; return new_mask; } @@ -220,10 +225,14 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, fsnotify_free_mark(mark); } -void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock) +void fsnotify_destroy_marks(struct fsnotify_mark_connector *conn, + spinlock_t *lock) { struct fsnotify_mark *mark; + if (!conn) + return; + while (1) { /* * We have to be careful since we can race with e.g. @@ -233,11 +242,12 @@ void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock) * calling fsnotify_destroy_mark() more than once is fine. 
*/ spin_lock(lock); - if (hlist_empty(head)) { + if (hlist_empty(&conn->list)) { spin_unlock(lock); break; } - mark = hlist_entry(head->first, struct fsnotify_mark, obj_list); + mark = hlist_entry(conn->list.first, struct fsnotify_mark, + obj_list); /* * We don't update i_fsnotify_mask / mnt_fsnotify_mask here * since inode / mount is going away anyway. So just remove @@ -251,6 +261,14 @@ void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock) } } +void fsnotify_connector_free(struct fsnotify_mark_connector **connp) +{ + if (*connp) { + kmem_cache_free(fsnotify_mark_connector_cachep, *connp); + *connp = NULL; + } +} + void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) { assert_spin_locked(&mark->lock); @@ -304,21 +322,54 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) return -1; } -/* Add mark into proper place in given list of marks */ -int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark, - int allow_dups) +static int fsnotify_attach_connector_to_object( + struct fsnotify_mark_connector **connp) +{ + struct fsnotify_mark_connector *conn; + + conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_ATOMIC); + if (!conn) + return -ENOMEM; + INIT_HLIST_HEAD(&conn->list); + /* + * Make sure 'conn' initialization is visible. Matches + * lockless_dereference() in fsnotify(). + */ + smp_wmb(); + *connp = conn; + + return 0; +} + +/* + * Add mark into proper place in given list of marks. These marks may be used + * for the fsnotify backend to determine which event types should be delivered + * to which group and for which inodes. These marks are ordered according to + * priority, highest number first, and then by the group's location in memory. + */ +int fsnotify_add_mark_list(struct fsnotify_mark_connector **connp, + struct fsnotify_mark *mark, int allow_dups) { struct fsnotify_mark *lmark, *last = NULL; + struct fsnotify_mark_connector *conn; int cmp; + int err; + + if (!*connp) { + err = fsnotify_attach_connector_to_object(connp); + if (err) + return err; + } + conn = *connp; /* is mark the first mark? */ - if (hlist_empty(head)) { - hlist_add_head_rcu(&mark->obj_list, head); + if (hlist_empty(&conn->list)) { + hlist_add_head_rcu(&mark->obj_list, &conn->list); return 0; } /* should mark be in the middle of the current list? */ - hlist_for_each_entry(lmark, head, obj_list) { + hlist_for_each_entry(lmark, &conn->list, obj_list) { last = lmark; if ((lmark->group == mark->group) && !allow_dups) @@ -419,12 +470,15 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, * Given a list of marks, find the mark associated with given group. If found * take a reference to that mark and return it, else return NULL. 
*/ -struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, +struct fsnotify_mark *fsnotify_find_mark(struct fsnotify_mark_connector *conn, struct fsnotify_group *group) { struct fsnotify_mark *mark; - hlist_for_each_entry(mark, head, obj_list) { + if (!conn) + return NULL; + + hlist_for_each_entry(mark, &conn->list, obj_list) { if (mark->group == group) { fsnotify_get_mark(mark); return mark; diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index a8fcab68faef..28815d5cba7c 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -43,7 +43,7 @@ void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt) struct mount *m = real_mount(mnt); spin_lock(&mnt->mnt_root->d_lock); - m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); + m->mnt_fsnotify_mask = fsnotify_recalc_mask(m->mnt_fsnotify_marks); spin_unlock(&mnt->mnt_root->d_lock); } @@ -60,7 +60,7 @@ void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) hlist_del_init_rcu(&mark->obj_list); mark->mnt = NULL; - m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); + m->mnt_fsnotify_mask = fsnotify_recalc_mask(m->mnt_fsnotify_marks); spin_unlock(&mnt->mnt_root->d_lock); } @@ -75,7 +75,7 @@ struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, struct fsnotify_mark *mark; spin_lock(&mnt->mnt_root->d_lock); - mark = fsnotify_find_mark(&m->mnt_fsnotify_marks, group); + mark = fsnotify_find_mark(m->mnt_fsnotify_marks, group); spin_unlock(&mnt->mnt_root->d_lock); return mark; @@ -101,7 +101,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, spin_lock(&mnt->mnt_root->d_lock); mark->mnt = mnt; ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, allow_dups); - m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); + m->mnt_fsnotify_mask = fsnotify_recalc_mask(m->mnt_fsnotify_marks); spin_unlock(&mnt->mnt_root->d_lock); return ret; diff --git a/include/linux/fs.h b/include/linux/fs.h index 7251f7bb45e8..66e52342be2d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -546,6 +546,8 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 +struct fsnotify_mark_connector; + /* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning @@ -645,7 +647,7 @@ struct inode { #ifdef CONFIG_FSNOTIFY __u32 i_fsnotify_mask; /* all events this inode cares about */ - struct hlist_head i_fsnotify_marks; + struct fsnotify_mark_connector *i_fsnotify_marks; #endif #if IS_ENABLED(CONFIG_FS_ENCRYPTION) diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index e6e689b5569e..8b63085f8855 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -194,6 +194,15 @@ struct fsnotify_group { #define FSNOTIFY_EVENT_PATH 1 #define FSNOTIFY_EVENT_INODE 2 +/* + * Inode / vfsmount point to this structure which tracks all marks attached to + * the inode / vfsmount. The structure is freed only when inode / vfsmount gets + * freed. 
+ */ +struct fsnotify_mark_connector { + struct hlist_head list; +}; + /* * A mark is simply an object attached to an in core inode which allows an * fsnotify listener to indicate they are either no longer interested in events @@ -346,6 +355,7 @@ extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group); /* run all the marks in a group, and clear all of the marks where mark->flags & flags is true*/ extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags); +extern void fsnotify_connector_free(struct fsnotify_mark_connector **connp); extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); extern void fsnotify_unmount_inodes(struct super_block *sb); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index d6a8de5f8fa3..bf7b7ca295d0 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include "audit.h" @@ -1596,7 +1597,8 @@ static inline void handle_one(const struct inode *inode) struct audit_tree_refs *p; struct audit_chunk *chunk; int count; - if (likely(hlist_empty(&inode->i_fsnotify_marks))) + if (likely(!inode->i_fsnotify_marks || + hlist_empty(&inode->i_fsnotify_marks->list))) return; context = current->audit_context; p = context->trees; @@ -1639,7 +1641,8 @@ retry: seq = read_seqbegin(&rename_lock); for(;;) { struct inode *inode = d_backing_inode(d); - if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) { + if (inode && unlikely(inode->i_fsnotify_marks && + !hlist_empty(&inode->i_fsnotify_marks->list))) { struct audit_chunk *chunk; chunk = audit_tree_lookup(inode); if (chunk) { -- cgit v1.2.3 From 86ffe245c430f07f95d5d28d3b694ea72f4492e7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 14 Mar 2017 14:29:35 +0100 Subject: fsnotify: Move object pointer to fsnotify_mark_connector Move pointer to inode / vfsmount from mark itself to the fsnotify_mark_connector structure. This is another step on the path towards decoupling inode / vfsmount lifetime from notification mark lifetime. 
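A minimal sketch of the pointer move (types trimmed to the relevant fields; mark_inode() is a hypothetical helper, not part of the patch):

    struct fsnotify_mark_connector {
        unsigned int flags;     /* FSNOTIFY_OBJ_TYPE_INODE / _VFSMOUNT */
        union {                 /* the object this connector hangs off */
            struct inode *inode;
            struct vfsmount *mnt;
        };
        struct hlist_head list;
    };

    /* All marks on one object now share a single object pointer through
     * their connector instead of each carrying its own. */
    static struct inode *mark_inode(struct fsnotify_mark *mark)
    {
        if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE)
            return mark->connector->inode;
        return NULL;
    }

One consequence visible in the hunks below: code that used to test mark->flags for the object type (fdinfo, audit_tree) now tests mark->connector->flags, and must be careful that the connector pointer cannot become NULL underneath it, hence the new locking comment in audit_tree.c.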
Reviewed-by: Miklos Szeredi Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/dnotify/dnotify.c | 4 ++-- fs/notify/fdinfo.c | 12 ++++++------ fs/notify/fsnotify.h | 3 ++- fs/notify/inode_mark.c | 18 +++++++----------- fs/notify/mark.c | 32 ++++++++++++++++++++++---------- fs/notify/vfsmount_mark.c | 12 +++++------- include/linux/fsnotify_backend.h | 17 ++++++++++------- kernel/audit_tree.c | 25 ++++++++++++++++++++----- 8 files changed, 74 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 5a4ec309e283..5024729dba23 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -69,8 +69,8 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) if (old_mask == new_mask) return; - if (fsn_mark->inode) - fsnotify_recalc_inode_mask(fsn_mark->inode); + if (fsn_mark->connector) + fsnotify_recalc_inode_mask(fsn_mark->connector->inode); } /* diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 601a59c8d87e..dd63aa9a6f9a 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -76,11 +76,11 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) struct inotify_inode_mark *inode_mark; struct inode *inode; - if (!(mark->flags & FSNOTIFY_MARK_FLAG_INODE)) + if (!(mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE)) return; inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); - inode = igrab(mark->inode); + inode = igrab(mark->connector->inode); if (inode) { /* * IN_ALL_EVENTS represents all of the mask bits @@ -115,8 +115,8 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) mflags |= FAN_MARK_IGNORED_SURV_MODIFY; - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { - inode = igrab(mark->inode); + if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE) { + inode = igrab(mark->connector->inode); if (!inode) return; seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", @@ -125,8 +125,8 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); - } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) { - struct mount *mnt = real_mount(mark->mnt); + } else if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) { + struct mount *mnt = real_mount(mark->connector->mnt); seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index eb64c59c9ad1..dd1a6798c9cd 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -26,6 +26,7 @@ extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, /* Add mark to a proper place in mark list */ extern int fsnotify_add_mark_list(struct fsnotify_mark_connector **connp, struct fsnotify_mark *mark, + struct inode *inode, struct vfsmount *mnt, int allow_dups); /* add a mark to an inode */ extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark, @@ -44,7 +45,7 @@ extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); extern struct fsnotify_mark *fsnotify_find_mark( struct fsnotify_mark_connector *conn, struct fsnotify_group *group); -/* Destroy all marks in the given list protected by 'lock' */ +/* Destroy all marks connected via given connector protected by 'lock' */ extern void fsnotify_destroy_marks(struct fsnotify_mark_connector *conn, 
spinlock_t *lock); /* run the list of all marks associated with inode and destroy them */ diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index e8c6b822ff8d..1644ba09efd4 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -45,7 +45,7 @@ void fsnotify_recalc_inode_mask(struct inode *inode) void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) { - struct inode *inode = mark->inode; + struct inode *inode = mark->connector->inode; BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); assert_spin_locked(&mark->lock); @@ -53,7 +53,7 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) spin_lock(&inode->i_lock); hlist_del_init_rcu(&mark->obj_list); - mark->inode = NULL; + mark->connector = NULL; /* * this mark is now off the inode->i_fsnotify_marks list and we @@ -69,7 +69,7 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) */ void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) { - fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE); + fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_OBJ_TYPE_INODE); } /* @@ -99,11 +99,10 @@ void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark, assert_spin_locked(&mark->lock); - if (mask && - mark->inode && + if (mask && mark->connector && !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) { mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED; - inode = igrab(mark->inode); + inode = igrab(mark->connector->inode); /* * we shouldn't be able to get here if the inode wasn't * already safely held in memory. But bug in case it @@ -126,15 +125,12 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, { int ret; - mark->flags |= FSNOTIFY_MARK_FLAG_INODE; - BUG_ON(!mutex_is_locked(&group->mark_mutex)); assert_spin_locked(&mark->lock); spin_lock(&inode->i_lock); - mark->inode = inode; - ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark, - allow_dups); + ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark, inode, + NULL, allow_dups); inode->i_fsnotify_mask = fsnotify_recalc_mask(inode->i_fsnotify_marks); spin_unlock(&inode->i_lock); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 24b6191bd6c6..3d6e7a8e58be 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -142,10 +142,10 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { - inode = mark->inode; + if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE) { + inode = mark->connector->inode; fsnotify_destroy_inode_mark(mark); - } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) + } else if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) fsnotify_destroy_vfsmount_mark(mark); else BUG(); @@ -275,7 +275,7 @@ void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) mark->mask = mask; - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) + if (mark->connector && mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE) fsnotify_set_inode_mark_mask_locked(mark, mask); } @@ -323,7 +323,9 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) } static int fsnotify_attach_connector_to_object( - struct fsnotify_mark_connector **connp) + struct fsnotify_mark_connector **connp, + struct inode *inode, + struct vfsmount *mnt) { struct fsnotify_mark_connector *conn; @@ -331,6 +333,13 @@ static int fsnotify_attach_connector_to_object( if (!conn) return -ENOMEM; INIT_HLIST_HEAD(&conn->list); + if (inode) { + conn->flags = FSNOTIFY_OBJ_TYPE_INODE; + conn->inode = inode; + 
} else { + conn->flags = FSNOTIFY_OBJ_TYPE_VFSMOUNT; + conn->mnt = mnt; + } /* * Make sure 'conn' initialization is visible. Matches * lockless_dereference() in fsnotify(). @@ -348,7 +357,8 @@ static int fsnotify_attach_connector_to_object( * priority, highest number first, and then by the group's location in memory. */ int fsnotify_add_mark_list(struct fsnotify_mark_connector **connp, - struct fsnotify_mark *mark, int allow_dups) + struct fsnotify_mark *mark, struct inode *inode, + struct vfsmount *mnt, int allow_dups) { struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; @@ -356,7 +366,7 @@ int fsnotify_add_mark_list(struct fsnotify_mark_connector **connp, int err; if (!*connp) { - err = fsnotify_attach_connector_to_object(connp); + err = fsnotify_attach_connector_to_object(connp, inode, mnt); if (err) return err; } @@ -365,7 +375,7 @@ int fsnotify_add_mark_list(struct fsnotify_mark_connector **connp, /* is mark the first mark? */ if (hlist_empty(&conn->list)) { hlist_add_head_rcu(&mark->obj_list, &conn->list); - return 0; + goto added; } /* should mark be in the middle of the current list? */ @@ -378,13 +388,15 @@ int fsnotify_add_mark_list(struct fsnotify_mark_connector **connp, cmp = fsnotify_compare_groups(lmark->group, mark->group); if (cmp >= 0) { hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list); - return 0; + goto added; } } BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); +added: + mark->connector = conn; return 0; } @@ -507,7 +519,7 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, */ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { - if (mark->flags & flags) + if (mark->connector->flags & flags) list_move(&mark->g_list, &to_free); } mutex_unlock(&group->mark_mutex); diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 28815d5cba7c..e04e33ef02d4 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -31,7 +31,7 @@ void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) { - fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT); + fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT); } /* @@ -49,7 +49,7 @@ void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt) void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) { - struct vfsmount *mnt = mark->mnt; + struct vfsmount *mnt = mark->connector->mnt; struct mount *m = real_mount(mnt); BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); @@ -58,7 +58,7 @@ void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) spin_lock(&mnt->mnt_root->d_lock); hlist_del_init_rcu(&mark->obj_list); - mark->mnt = NULL; + mark->connector = NULL; m->mnt_fsnotify_mask = fsnotify_recalc_mask(m->mnt_fsnotify_marks); spin_unlock(&mnt->mnt_root->d_lock); @@ -93,14 +93,12 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, struct mount *m = real_mount(mnt); int ret; - mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; - BUG_ON(!mutex_is_locked(&group->mark_mutex)); assert_spin_locked(&mark->lock); spin_lock(&mnt->mnt_root->d_lock); - mark->mnt = mnt; - ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, allow_dups); + ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, NULL, mnt, + allow_dups); m->mnt_fsnotify_mask = fsnotify_recalc_mask(m->mnt_fsnotify_marks); spin_unlock(&mnt->mnt_root->d_lock); diff 
--git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 8b63085f8855..06f9a2cc1463 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -200,6 +200,13 @@ struct fsnotify_group { * freed. */ struct fsnotify_mark_connector { +#define FSNOTIFY_OBJ_TYPE_INODE 0x01 +#define FSNOTIFY_OBJ_TYPE_VFSMOUNT 0x02 + unsigned int flags; /* Type of object [lock] */ + union { /* Object pointer [lock] */ + struct inode *inode; + struct vfsmount *mnt; + }; struct hlist_head list; }; @@ -234,14 +241,10 @@ struct fsnotify_mark { spinlock_t lock; /* List of marks for inode / vfsmount [obj_lock] */ struct hlist_node obj_list; - union { /* Object pointer [mark->lock, group->mark_mutex] */ - struct inode *inode; /* inode this mark is associated with */ - struct vfsmount *mnt; /* vfsmount this mark is associated with */ - }; + /* Head of list of marks for an object [mark->lock, group->mark_mutex] */ + struct fsnotify_mark_connector *connector; /* Events types to ignore [mark->lock, group->mark_mutex] */ __u32 ignored_mask; -#define FSNOTIFY_MARK_FLAG_INODE 0x01 -#define FSNOTIFY_MARK_FLAG_VFSMOUNT 0x02 #define FSNOTIFY_MARK_FLAG_OBJECT_PINNED 0x04 #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08 #define FSNOTIFY_MARK_FLAG_ALIVE 0x10 @@ -353,7 +356,7 @@ extern void fsnotify_free_mark(struct fsnotify_mark *mark); extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group); /* run all the marks in a group, and clear all of the inode marks */ extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group); -/* run all the marks in a group, and clear all of the marks where mark->flags & flags is true*/ +/* run all the marks in a group, and clear all of the marks attached to given object type */ extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags); extern void fsnotify_connector_free(struct fsnotify_mark_connector **connp); extern void fsnotify_get_mark(struct fsnotify_mark *mark); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 51451245936a..c3b5fcb8eca4 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -172,10 +172,25 @@ static unsigned long inode_to_key(const struct inode *inode) /* * Function to return search key in our hash from chunk. Key 0 is special and * should never be present in the hash. + * + * Must be called with chunk->mark.lock held to protect from connector + * becoming NULL. 
*/ +static unsigned long __chunk_to_key(struct audit_chunk *chunk) +{ + if (!chunk->mark.connector) + return 0; + return (unsigned long)chunk->mark.connector->inode; +} + static unsigned long chunk_to_key(struct audit_chunk *chunk) { - return (unsigned long)chunk->mark.inode; + unsigned long key; + + spin_lock(&chunk->mark.lock); + key = __chunk_to_key(chunk); + spin_unlock(&chunk->mark.lock); + return key; } static inline struct list_head *chunk_hash(unsigned long key) @@ -187,7 +202,7 @@ static inline struct list_head *chunk_hash(unsigned long key) /* hash_lock & entry->lock is held by caller */ static void insert_hash(struct audit_chunk *chunk) { - unsigned long key = chunk_to_key(chunk); + unsigned long key = __chunk_to_key(chunk); struct list_head *list; if (!(chunk->mark.flags & FSNOTIFY_MARK_FLAG_ATTACHED)) @@ -276,8 +291,8 @@ static void untag_chunk(struct node *p) if (!new) goto Fallback; - if (fsnotify_add_mark_locked(&new->mark, entry->group, entry->inode, - NULL, 1)) { + if (fsnotify_add_mark_locked(&new->mark, entry->group, + entry->connector->inode, NULL, 1)) { fsnotify_put_mark(&new->mark); goto Fallback; } @@ -418,7 +433,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) } if (fsnotify_add_mark_locked(chunk_entry, old_entry->group, - old_entry->inode, NULL, 1)) { + old_entry->connector->inode, NULL, 1)) { spin_unlock(&old_entry->lock); mutex_unlock(&old_entry->group->mark_mutex); fsnotify_put_mark(chunk_entry); -- cgit v1.2.3 From e911d8af87dba7642138f4320ca3db80629989f2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 14 Mar 2017 14:48:00 +0100 Subject: fsnotify: Make fsnotify_mark_connector hold inode reference Currently inode reference is held by fsnotify marks. Change the rules so that inode reference is held by fsnotify_mark_connector structure whenever the list is non-empty. This simplifies the code and is more logical. 
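Condensed from the hunks below, the new reference rule reads roughly like this (surrounding list manipulation and error handling elided):

    /* fsnotify_add_mark_list(): the first mark attached to an inode
     * makes the connector pin it. */
    if (hlist_empty(&conn->list)) {
        hlist_add_head_rcu(&mark->obj_list, &conn->list);
        if (inode)
            __iget(inode);      /* reference now owned by the connector */
    }

    /* fsnotify_detach_mark(): fsnotify_destroy_inode_mark() returns the
     * inode only when the removal emptied the list, and only then is
     * the pinning reference dropped. */
    inode = fsnotify_destroy_inode_mark(mark); /* NULL unless list emptied */
    if (inode)
        iput(inode);

Since a pinned object now simply means "the list is non-empty", the per-mark FSNOTIFY_MARK_FLAG_OBJECT_PINNED bookkeeping becomes redundant and is deleted.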
Reviewed-by: Miklos Szeredi Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fsnotify.h | 4 +--- fs/notify/inode_mark.c | 30 +++++------------------------- fs/notify/mark.c | 17 ++++++----------- include/linux/fsnotify_backend.h | 12 ++++++------ 4 files changed, 18 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index dd1a6798c9cd..1a2aec65ebd8 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -21,8 +21,6 @@ extern u32 fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); extern int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b); -extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, - __u32 mask); /* Add mark to a proper place in mark list */ extern int fsnotify_add_mark_list(struct fsnotify_mark_connector **connp, struct fsnotify_mark *mark, @@ -40,7 +38,7 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, /* vfsmount specific destruction of a mark */ extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); /* inode specific destruction of a mark */ -extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); +extern struct inode *fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); /* Find mark belonging to given group in the list of marks */ extern struct fsnotify_mark *fsnotify_find_mark( struct fsnotify_mark_connector *conn, diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 1644ba09efd4..c3873b6920e7 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -43,9 +43,10 @@ void fsnotify_recalc_inode_mask(struct inode *inode) __fsnotify_update_child_dentry_flags(inode); } -void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) +struct inode *fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) { struct inode *inode = mark->connector->inode; + bool empty; BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); assert_spin_locked(&mark->lock); @@ -53,6 +54,7 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) spin_lock(&inode->i_lock); hlist_del_init_rcu(&mark->obj_list); + empty = hlist_empty(&mark->connector->list); mark->connector = NULL; /* @@ -62,6 +64,8 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) */ inode->i_fsnotify_mask = fsnotify_recalc_mask(inode->i_fsnotify_marks); spin_unlock(&inode->i_lock); + + return empty ? inode : NULL; } /* @@ -88,30 +92,6 @@ struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, return mark; } -/* - * If we are setting a mark mask on an inode mark we should pin the inode - * in memory. - */ -void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark, - __u32 mask) -{ - struct inode *inode; - - assert_spin_locked(&mark->lock); - - if (mask && mark->connector && - !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) { - mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED; - inode = igrab(mark->connector->inode); - /* - * we shouldn't be able to get here if the inode wasn't - * already safely held in memory. But bug in case it - * ever is wrong. - */ - BUG_ON(!inode); - } -} - /* * Attach an initialized mark to a given inode. 
* These marks may be used for the fsnotify backend to determine which diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 3d6e7a8e58be..8a15c64fbe80 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -142,10 +142,9 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; - if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE) { - inode = mark->connector->inode; - fsnotify_destroy_inode_mark(mark); - } else if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) + if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE) + inode = fsnotify_destroy_inode_mark(mark); + else if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) fsnotify_destroy_vfsmount_mark(mark); else BUG(); @@ -160,7 +159,7 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) spin_unlock(&mark->lock); - if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) + if (inode) iput(inode); atomic_dec(&group->num_marks); @@ -274,9 +273,6 @@ void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) assert_spin_locked(&mark->lock); mark->mask = mask; - - if (mark->connector && mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE) - fsnotify_set_inode_mark_mask_locked(mark, mask); } void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask) @@ -375,6 +371,8 @@ int fsnotify_add_mark_list(struct fsnotify_mark_connector **connp, /* is mark the first mark? */ if (hlist_empty(&conn->list)) { hlist_add_head_rcu(&mark->obj_list, &conn->list); + if (inode) + __iget(inode); goto added; } @@ -441,9 +439,6 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, } else { BUG(); } - - /* this will pin the object if appropriate */ - fsnotify_set_mark_mask_locked(mark, mark->mask); spin_unlock(&mark->lock); if (inode) diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 06f9a2cc1463..96333fb09309 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -196,8 +196,9 @@ struct fsnotify_group { /* * Inode / vfsmount point to this structure which tracks all marks attached to - * the inode / vfsmount. The structure is freed only when inode / vfsmount gets - * freed. + * the inode / vfsmount. The reference to inode / vfsmount is held by this + * structure whenever the list is non-empty. The structure is freed only when + * inode / vfsmount gets freed. */ struct fsnotify_mark_connector { #define FSNOTIFY_OBJ_TYPE_INODE 0x01 @@ -245,10 +246,9 @@ struct fsnotify_mark { struct fsnotify_mark_connector *connector; /* Events types to ignore [mark->lock, group->mark_mutex] */ __u32 ignored_mask; -#define FSNOTIFY_MARK_FLAG_OBJECT_PINNED 0x04 -#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08 -#define FSNOTIFY_MARK_FLAG_ALIVE 0x10 -#define FSNOTIFY_MARK_FLAG_ATTACHED 0x20 +#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x01 +#define FSNOTIFY_MARK_FLAG_ALIVE 0x02 +#define FSNOTIFY_MARK_FLAG_ATTACHED 0x04 unsigned int flags; /* flags [mark->lock] */ void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */ }; -- cgit v1.2.3 From a242677bb1e6faa9bd82bd33afb2621071258231 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Mar 2017 09:16:27 +0100 Subject: fsnotify: Move locking into fsnotify_recalc_mask() Move locking of locks protecting a list of marks into fsnotify_recalc_mask(). This reduces code churn in the following patch which changes the lock protecting the list of marks. 
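The caller-side effect, sketched for the inode variant (the vfsmount variant collapses the same way; the "before" form is shown as a comment):

    /* Before: every caller picked and took the object's lock itself:
     *
     *     spin_lock(&inode->i_lock);
     *     inode->i_fsnotify_mask =
     *         fsnotify_recalc_mask(inode->i_fsnotify_marks);
     *     spin_unlock(&inode->i_lock);
     *     __fsnotify_update_child_dentry_flags(inode);
     */

    /* After: the helper locks internally and writes the recalculated
     * mask back into the object itself, so callers shrink to: */
    fsnotify_recalc_mask(inode->i_fsnotify_marks);

When the following patch swaps the object lock for a connector-internal spinlock, only fsnotify_recalc_mask() has to change, not its callers.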
Reviewed-by: Miklos Szeredi Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/dnotify/dnotify.c | 3 +-- fs/notify/fsnotify.h | 3 --- fs/notify/inode_mark.c | 18 +++--------------- fs/notify/mark.c | 40 ++++++++++++++++++++++++++++++---------- fs/notify/vfsmount_mark.c | 13 +++---------- include/linux/fsnotify_backend.h | 2 ++ 6 files changed, 39 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 5024729dba23..41b2a070761c 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -69,8 +69,7 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) if (old_mask == new_mask) return; - if (fsn_mark->connector) - fsnotify_recalc_inode_mask(fsn_mark->connector->inode); + fsnotify_recalc_mask(fsn_mark->connector); } /* diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 0354338aad78..96051780d50e 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -14,9 +14,6 @@ extern void fsnotify_flush_notify(struct fsnotify_group *group); /* protects reads of inode and vfsmount marks list */ extern struct srcu_struct fsnotify_mark_srcu; -/* Calculate mask of events for a list of marks */ -extern u32 fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); - /* compare two groups for sorting of marks lists */ extern int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 87bef7d802db..9b2f4e6eb8eb 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -30,17 +30,9 @@ #include "../internal.h" -/* - * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types - * any notifier is interested in hearing for this inode. - */ void fsnotify_recalc_inode_mask(struct inode *inode) { - spin_lock(&inode->i_lock); - inode->i_fsnotify_mask = fsnotify_recalc_mask(inode->i_fsnotify_marks); - spin_unlock(&inode->i_lock); - - __fsnotify_update_child_dentry_flags(inode); + fsnotify_recalc_mask(inode->i_fsnotify_marks); } struct inode *fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) @@ -57,14 +49,10 @@ struct inode *fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) empty = hlist_empty(&mark->connector->list); mark->connector = NULL; - /* - * this mark is now off the inode->i_fsnotify_marks list and we - * hold the inode->i_lock, so this is the perfect time to update the - * inode->i_fsnotify_mask - */ - inode->i_fsnotify_mask = fsnotify_recalc_mask(inode->i_fsnotify_marks); spin_unlock(&inode->i_lock); + fsnotify_recalc_mask(inode->i_fsnotify_marks); + return empty ? inode : NULL; } diff --git a/fs/notify/mark.c b/fs/notify/mark.c index b3f83ed6e8be..06faf166c7ae 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -105,18 +105,40 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) } } -/* Calculate mask of events for a list of marks */ -u32 fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) +static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { u32 new_mask = 0; struct fsnotify_mark *mark; - if (!conn) - return 0; - hlist_for_each_entry(mark, &conn->list, obj_list) new_mask |= mark->mask; - return new_mask; + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) + conn->inode->i_fsnotify_mask = new_mask; + else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) + real_mount(conn->mnt)->mnt_fsnotify_mask = new_mask; +} + +/* + * Calculate mask of events for a list of marks. 
The caller must make sure + * connector cannot disappear under us (usually by holding a mark->lock or + * mark->group->mark_mutex for a mark on this list). + */ +void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) +{ + if (!conn) + return; + + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) + spin_lock(&conn->inode->i_lock); + else + spin_lock(&conn->mnt->mnt_root->d_lock); + __fsnotify_recalc_mask(conn); + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) { + spin_unlock(&conn->inode->i_lock); + __fsnotify_update_child_dentry_flags(conn->inode); + } else { + spin_unlock(&conn->mnt->mnt_root->d_lock); + } } /* @@ -423,10 +445,8 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, if (ret) goto err; - if (inode) - fsnotify_recalc_inode_mask(inode); - else - fsnotify_recalc_vfsmount_mask(mnt); + if (mark->mask) + fsnotify_recalc_mask(mark->connector); return ret; err: diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 49ccbdb74f82..ffe0d7098cba 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -34,17 +34,9 @@ void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT); } -/* - * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types - * any notifier is interested in hearing for this mount point - */ void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt) { - struct mount *m = real_mount(mnt); - - spin_lock(&mnt->mnt_root->d_lock); - m->mnt_fsnotify_mask = fsnotify_recalc_mask(m->mnt_fsnotify_marks); - spin_unlock(&mnt->mnt_root->d_lock); + fsnotify_recalc_mask(real_mount(mnt)->mnt_fsnotify_marks); } void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) @@ -60,8 +52,9 @@ void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) hlist_del_init_rcu(&mark->obj_list); mark->connector = NULL; - m->mnt_fsnotify_mask = fsnotify_recalc_mask(m->mnt_fsnotify_marks); spin_unlock(&mnt->mnt_root->d_lock); + + fsnotify_recalc_mask(m->mnt_fsnotify_marks); } /* diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 96333fb09309..b954f1b2571c 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -327,6 +327,8 @@ extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group /* functions used to manipulate the marks attached to inodes */ +/* Calculate mask of events for a list of marks */ +extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); /* run all marks associated with a vfsmount and update mnt->mnt_fsnotify_mask */ extern void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt); /* run all marks associated with an inode and update inode->i_fsnotify_mask */ -- cgit v1.2.3 From 04662cab59fc3e8421fd7a0539d304d51d2750a4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Feb 2017 08:19:43 +0100 Subject: fsnotify: Lock object list with connector lock So far list of marks attached to an object (inode / vfsmount) was protected by i_lock or mnt_root->d_lock. This dictates that the list must be empty before the object can be destroyed although the list is now anchored in the fsnotify_mark_connector structure. Protect the list by a spinlock in the fsnotify_mark_connector structure to decouple lifetime of a list of marks from a lifetime of the object. This also simplifies the code quite a bit since we don't have to differentiate between inode and vfsmount lists in quite a few places anymore. 
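As a rough sketch of the attach path this enables (see the cmpxchg() hunk in the diff below), here is a userspace model using C11 atomics in place of cmpxchg(), with simplified names: a fully initialised connector is published with a single compare-and-swap, so a concurrent reader either sees NULL or a complete structure, and the loser of the race simply frees its copy:

	#include <stdatomic.h>
	#include <stdlib.h>
	#include <pthread.h>

	struct connector {
		pthread_mutex_t lock;	/* protects the list of marks */
		void *list;
	};

	/* connp plays the role of &inode->i_fsnotify_marks */
	static int attach_connector(struct connector *_Atomic *connp)
	{
		struct connector *conn, *expected = NULL;

		conn = calloc(1, sizeof(*conn));
		if (!conn)
			return -1;
		pthread_mutex_init(&conn->lock, NULL);
		/*
		 * The CAS orders the publication: whoever sees the pointer
		 * also sees the initialisation done above.
		 */
		if (!atomic_compare_exchange_strong(connp, &expected, conn)) {
			/* somebody else attached a connector first */
			pthread_mutex_destroy(&conn->lock);
			free(conn);
		}
		return 0;
	}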
Reviewed-by: Miklos Szeredi Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/mark.c | 90 ++++++++++++++-------------------------- include/linux/fsnotify_backend.h | 3 +- 2 files changed, 34 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/mark.c b/fs/notify/mark.c index b5b641a2b557..bfb415d0d757 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -33,7 +33,7 @@ * * group->mark_mutex * mark->lock - * inode->i_lock + * mark->connector->lock * * group->mark_mutex protects the marks_list anchored inside a given group and * each mark is hooked via the g_list. It also protects the groups private @@ -44,10 +44,12 @@ * is assigned to as well as the access to a reference of the inode/vfsmount * that is being watched by the mark. * - * inode->i_lock protects the i_fsnotify_marks list anchored inside a - * given inode and each mark is hooked via the i_list. (and sorta the - * free_i_list) + * mark->connector->lock protects the list of marks anchored inside an + * inode / vfsmount and each mark is hooked via the i_list. * + * A list of notification marks relating to inode / mnt is contained in + * fsnotify_mark_connector. That structure is alive as long as there are any + * marks in the list and is also protected by fsnotify_mark_srcu. * * LIFETIME: * Inode marks survive between when they are added to an inode and when their @@ -110,8 +112,10 @@ static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) u32 new_mask = 0; struct fsnotify_mark *mark; + assert_spin_locked(&conn->lock); hlist_for_each_entry(mark, &conn->list, obj_list) new_mask |= mark->mask; + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) conn->inode->i_fsnotify_mask = new_mask; else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) @@ -128,31 +132,20 @@ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) if (!conn) return; - if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) - spin_lock(&conn->inode->i_lock); - else - spin_lock(&conn->mnt->mnt_root->d_lock); + spin_lock(&conn->lock); __fsnotify_recalc_mask(conn); - if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) { - spin_unlock(&conn->inode->i_lock); + spin_unlock(&conn->lock); + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) __fsnotify_update_child_dentry_flags(conn->inode); - } else { - spin_unlock(&conn->mnt->mnt_root->d_lock); - } } static struct inode *fsnotify_detach_from_object(struct fsnotify_mark *mark) { struct fsnotify_mark_connector *conn; struct inode *inode = NULL; - spinlock_t *lock; conn = mark->connector; - if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) - lock = &conn->inode->i_lock; - else - lock = &conn->mnt->mnt_root->d_lock; - spin_lock(lock); + spin_lock(&conn->lock); hlist_del_init_rcu(&mark->obj_list); if (hlist_empty(&conn->list)) { if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) @@ -160,7 +153,7 @@ static struct inode *fsnotify_detach_from_object(struct fsnotify_mark *mark) } __fsnotify_recalc_mask(conn); mark->connector = NULL; - spin_unlock(lock); + spin_unlock(&conn->lock); return inode; } @@ -326,7 +319,6 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) static int fsnotify_attach_connector_to_object( struct fsnotify_mark_connector **connp, - spinlock_t *lock, struct inode *inode, struct vfsmount *mnt) { @@ -335,6 +327,7 @@ static int fsnotify_attach_connector_to_object( conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL); if (!conn) return -ENOMEM; + spin_lock_init(&conn->lock); INIT_HLIST_HEAD(&conn->list); if (inode) { conn->flags = FSNOTIFY_OBJ_TYPE_INODE; 
@@ -344,16 +337,13 @@ static int fsnotify_attach_connector_to_object( conn->mnt = mnt; } /* - * Make sure 'conn' initialization is visible. Matches - * lockless_dereference() in fsnotify(). + * cmpxchg() provides the barrier so that readers of *connp can see + * only initialized structure */ - smp_wmb(); - spin_lock(lock); - if (!*connp) - *connp = conn; - else + if (cmpxchg(connp, NULL, conn)) { + /* Someone else created list structure for us */ kmem_cache_free(fsnotify_mark_connector_cachep, conn); - spin_unlock(lock); + } return 0; } @@ -371,35 +361,30 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; struct fsnotify_mark_connector **connp; - spinlock_t *lock; int cmp; int err = 0; if (WARN_ON(!inode && !mnt)) return -EINVAL; - if (inode) { + if (inode) connp = &inode->i_fsnotify_marks; - lock = &inode->i_lock; - } else { + else connp = &real_mount(mnt)->mnt_fsnotify_marks; - lock = &mnt->mnt_root->d_lock; - } if (!*connp) { - err = fsnotify_attach_connector_to_object(connp, lock, - inode, mnt); + err = fsnotify_attach_connector_to_object(connp, inode, mnt); if (err) return err; } spin_lock(&mark->lock); - spin_lock(lock); conn = *connp; + spin_lock(&conn->lock); /* is mark the first mark? */ if (hlist_empty(&conn->list)) { hlist_add_head_rcu(&mark->obj_list, &conn->list); if (inode) - __iget(inode); + igrab(inode); goto added; } @@ -425,7 +410,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, added: mark->connector = conn; out_err: - spin_unlock(lock); + spin_unlock(&conn->lock); spin_unlock(&mark->lock); return err; } @@ -449,7 +434,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, * LOCKING ORDER!!!! * group->mark_mutex * mark->lock - * inode->i_lock + * mark->connector->lock */ spin_lock(&mark->lock); mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; @@ -505,24 +490,19 @@ struct fsnotify_mark *fsnotify_find_mark(struct fsnotify_mark_connector *conn, struct fsnotify_group *group) { struct fsnotify_mark *mark; - spinlock_t *lock; if (!conn) return NULL; - if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) - lock = &conn->inode->i_lock; - else - lock = &conn->mnt->mnt_root->d_lock; - spin_lock(lock); + spin_lock(&conn->lock); hlist_for_each_entry(mark, &conn->list, obj_list) { if (mark->group == group) { fsnotify_get_mark(mark); - spin_unlock(lock); + spin_unlock(&conn->lock); return mark; } } - spin_unlock(lock); + spin_unlock(&conn->lock); return NULL; } @@ -595,16 +575,10 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group) void fsnotify_destroy_marks(struct fsnotify_mark_connector *conn) { struct fsnotify_mark *mark; - spinlock_t *lock; if (!conn) return; - if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) - lock = &conn->inode->i_lock; - else - lock = &conn->mnt->mnt_root->d_lock; - while (1) { /* * We have to be careful since we can race with e.g. @@ -613,15 +587,15 @@ void fsnotify_destroy_marks(struct fsnotify_mark_connector *conn) * we are holding mark reference so mark cannot be freed and * calling fsnotify_destroy_mark() more than once is fine. 
*/ - spin_lock(lock); + spin_lock(&conn->lock); if (hlist_empty(&conn->list)) { - spin_unlock(lock); + spin_unlock(&conn->lock); break; } mark = hlist_entry(conn->list.first, struct fsnotify_mark, obj_list); fsnotify_get_mark(mark); - spin_unlock(lock); + spin_unlock(&conn->lock); fsnotify_destroy_mark(mark, mark->group); fsnotify_put_mark(mark); } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index b954f1b2571c..02c6fac652a4 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -201,6 +201,7 @@ struct fsnotify_group { * inode / vfsmount gets freed. */ struct fsnotify_mark_connector { + spinlock_t lock; #define FSNOTIFY_OBJ_TYPE_INODE 0x01 #define FSNOTIFY_OBJ_TYPE_VFSMOUNT 0x02 unsigned int flags; /* Type of object [lock] */ @@ -240,7 +241,7 @@ struct fsnotify_mark { struct list_head g_list; /* Protects inode / mnt pointers, flags, masks */ spinlock_t lock; - /* List of marks for inode / vfsmount [obj_lock] */ + /* List of marks for inode / vfsmount [connector->lock] */ struct hlist_node obj_list; /* Head of list of marks for an object [mark->lock, group->mark_mutex] */ struct fsnotify_mark_connector *connector; -- cgit v1.2.3 From 08991e83b7286635167bab40927665a90fb00d81 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Feb 2017 09:21:58 +0100 Subject: fsnotify: Free fsnotify_mark_connector when there is no mark attached Currently we free fsnotify_mark_connector structure only when inode / vfsmount is getting freed. This can however impose noticeable memory overhead when marks get attached to inodes only temporarily. So free the connector structure once the last mark is detached from the object. Since notification infrastructure can be working with the connector under the protection of fsnotify_mark_srcu, we have to be careful and free the fsnotify_mark_connector only after SRCU period passes. 
Reviewed-by: Miklos Szeredi Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/inode.c | 3 - fs/mount.h | 2 +- fs/namespace.c | 3 - fs/notify/fsnotify.c | 9 ++- fs/notify/fsnotify.h | 10 +-- fs/notify/inode_mark.c | 2 +- fs/notify/mark.c | 152 ++++++++++++++++++++++++++++----------- fs/notify/vfsmount_mark.c | 2 +- include/linux/fs.h | 2 +- include/linux/fsnotify_backend.h | 11 +-- kernel/auditsc.c | 6 +- 11 files changed, 136 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/fs/inode.c b/fs/inode.c index 750e952d2918..131b2bcebc48 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -234,9 +234,6 @@ void __destroy_inode(struct inode *inode) inode_detach_wb(inode); security_inode_free(inode); fsnotify_inode_delete(inode); -#ifdef CONFIG_FSNOTIFY - fsnotify_connector_free(&inode->i_fsnotify_marks); -#endif locks_free_lock_context(inode); if (!inode->i_nlink) { WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); diff --git a/fs/mount.h b/fs/mount.h index bc409360a03b..bf1fda6eed8f 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -59,7 +59,7 @@ struct mount { struct mountpoint *mnt_mp; /* where is it mounted */ struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ #ifdef CONFIG_FSNOTIFY - struct fsnotify_mark_connector *mnt_fsnotify_marks; + struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; #endif int mnt_id; /* mount identifier */ diff --git a/fs/namespace.c b/fs/namespace.c index 2625e1d97a3a..b3b115bd4e1e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1108,9 +1108,6 @@ static void cleanup_mnt(struct mount *mnt) if (unlikely(mnt->mnt_pins.first)) mnt_pin_kill(mnt); fsnotify_vfsmount_delete(&mnt->mnt); -#ifdef CONFIG_FSNOTIFY - fsnotify_connector_free(&mnt->mnt_fsnotify_marks); -#endif dput(mnt->mnt.mnt_root); deactivate_super(mnt->mnt.mnt_sb); mnt_free_id(mnt); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index eae621a18ac9..d512ef9f75fc 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -228,7 +228,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, if ((mask & FS_MODIFY) || (test_mask & to_tell->i_fsnotify_mask)) { - inode_conn = lockless_dereference(to_tell->i_fsnotify_marks); + inode_conn = srcu_dereference(to_tell->i_fsnotify_marks, + &fsnotify_mark_srcu); if (inode_conn) inode_node = srcu_dereference(inode_conn->list.first, &fsnotify_mark_srcu); @@ -236,11 +237,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, if (mnt && ((mask & FS_MODIFY) || (test_mask & mnt->mnt_fsnotify_mask))) { - inode_conn = lockless_dereference(to_tell->i_fsnotify_marks); + inode_conn = srcu_dereference(to_tell->i_fsnotify_marks, + &fsnotify_mark_srcu); if (inode_conn) inode_node = srcu_dereference(inode_conn->list.first, &fsnotify_mark_srcu); - vfsmount_conn = lockless_dereference(mnt->mnt_fsnotify_marks); + vfsmount_conn = srcu_dereference(mnt->mnt_fsnotify_marks, + &fsnotify_mark_srcu); if (vfsmount_conn) vfsmount_node = srcu_dereference( vfsmount_conn->list.first, diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 510f027bdf0f..72050b75ca8c 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -20,19 +20,19 @@ extern int fsnotify_compare_groups(struct fsnotify_group *a, /* Find mark belonging to given group in the list of marks */ extern struct fsnotify_mark *fsnotify_find_mark( - struct fsnotify_mark_connector *conn, - struct fsnotify_group *group); + struct fsnotify_mark_connector __rcu **connp, + struct 
fsnotify_group *group); /* Destroy all marks connected via given connector */ -extern void fsnotify_destroy_marks(struct fsnotify_mark_connector *conn); +extern void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp); /* run the list of all marks associated with inode and destroy them */ static inline void fsnotify_clear_marks_by_inode(struct inode *inode) { - fsnotify_destroy_marks(inode->i_fsnotify_marks); + fsnotify_destroy_marks(&inode->i_fsnotify_marks); } /* run the list of all marks associated with vfsmount and destroy them */ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { - fsnotify_destroy_marks(real_mount(mnt)->mnt_fsnotify_marks); + fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks); } /* prepare for freeing all marks associated with given group */ extern void fsnotify_detach_group_marks(struct fsnotify_group *group); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 080b6d8b9973..b9370316727e 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -50,7 +50,7 @@ void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, struct inode *inode) { - return fsnotify_find_mark(inode->i_fsnotify_marks, group); + return fsnotify_find_mark(&inode->i_fsnotify_marks, group); } /** diff --git a/fs/notify/mark.c b/fs/notify/mark.c index bfb415d0d757..824095db5a3b 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -89,10 +89,14 @@ struct kmem_cache *fsnotify_mark_connector_cachep; static DEFINE_SPINLOCK(destroy_lock); static LIST_HEAD(destroy_list); +static struct fsnotify_mark_connector *connector_destroy_list; static void fsnotify_mark_destroy_workfn(struct work_struct *work); static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); +static void fsnotify_connector_destroy_workfn(struct work_struct *work); +static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn); + void fsnotify_get_mark(struct fsnotify_mark *mark) { atomic_inc(&mark->refcnt); @@ -139,22 +143,73 @@ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) __fsnotify_update_child_dentry_flags(conn->inode); } +/* Free all connectors queued for freeing once SRCU period ends */ +static void fsnotify_connector_destroy_workfn(struct work_struct *work) +{ + struct fsnotify_mark_connector *conn, *free; + + spin_lock(&destroy_lock); + conn = connector_destroy_list; + connector_destroy_list = NULL; + spin_unlock(&destroy_lock); + + synchronize_srcu(&fsnotify_mark_srcu); + while (conn) { + free = conn; + conn = conn->destroy_next; + kmem_cache_free(fsnotify_mark_connector_cachep, free); + } +} + + +static struct inode *fsnotify_detach_connector_from_object( + struct fsnotify_mark_connector *conn) +{ + struct inode *inode = NULL; + + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) { + inode = conn->inode; + rcu_assign_pointer(inode->i_fsnotify_marks, NULL); + inode->i_fsnotify_mask = 0; + conn->inode = NULL; + conn->flags &= ~FSNOTIFY_OBJ_TYPE_INODE; + } else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) { + rcu_assign_pointer(real_mount(conn->mnt)->mnt_fsnotify_marks, + NULL); + real_mount(conn->mnt)->mnt_fsnotify_mask = 0; + conn->mnt = NULL; + conn->flags &= ~FSNOTIFY_OBJ_TYPE_VFSMOUNT; + } + + return inode; +} + static struct inode *fsnotify_detach_from_object(struct fsnotify_mark *mark) { struct fsnotify_mark_connector *conn; struct inode *inode = NULL; + bool free_conn = false; conn = mark->connector; 
spin_lock(&conn->lock); hlist_del_init_rcu(&mark->obj_list); if (hlist_empty(&conn->list)) { - if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) - inode = conn->inode; + inode = fsnotify_detach_connector_from_object(conn); + free_conn = true; + } else { + __fsnotify_recalc_mask(conn); } - __fsnotify_recalc_mask(conn); mark->connector = NULL; spin_unlock(&conn->lock); + if (free_conn) { + spin_lock(&destroy_lock); + conn->destroy_next = connector_destroy_list; + connector_destroy_list = conn; + spin_unlock(&destroy_lock); + queue_work(system_unbound_wq, &connector_reaper_work); + } + return inode; } @@ -259,14 +314,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, fsnotify_free_mark(mark); } -void fsnotify_connector_free(struct fsnotify_mark_connector **connp) -{ - if (*connp) { - kmem_cache_free(fsnotify_mark_connector_cachep, *connp); - *connp = NULL; - } -} - void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) { assert_spin_locked(&mark->lock); @@ -318,9 +365,9 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) } static int fsnotify_attach_connector_to_object( - struct fsnotify_mark_connector **connp, - struct inode *inode, - struct vfsmount *mnt) + struct fsnotify_mark_connector __rcu **connp, + struct inode *inode, + struct vfsmount *mnt) { struct fsnotify_mark_connector *conn; @@ -331,7 +378,7 @@ static int fsnotify_attach_connector_to_object( INIT_HLIST_HEAD(&conn->list); if (inode) { conn->flags = FSNOTIFY_OBJ_TYPE_INODE; - conn->inode = inode; + conn->inode = igrab(inode); } else { conn->flags = FSNOTIFY_OBJ_TYPE_VFSMOUNT; conn->mnt = mnt; @@ -342,12 +389,42 @@ static int fsnotify_attach_connector_to_object( */ if (cmpxchg(connp, NULL, conn)) { /* Someone else created list structure for us */ + if (inode) + iput(inode); kmem_cache_free(fsnotify_mark_connector_cachep, conn); } return 0; } +/* + * Get mark connector, make sure it is alive and return with its lock held. + * This is for users that get connector pointer from inode or mount. Users that + * hold reference to a mark on the list may directly lock connector->lock as + * they are sure list cannot go away under them. + */ +static struct fsnotify_mark_connector *fsnotify_grab_connector( + struct fsnotify_mark_connector __rcu **connp) +{ + struct fsnotify_mark_connector *conn; + int idx; + + idx = srcu_read_lock(&fsnotify_mark_srcu); + conn = srcu_dereference(*connp, &fsnotify_mark_srcu); + if (!conn) + goto out; + spin_lock(&conn->lock); + if (!(conn->flags & (FSNOTIFY_OBJ_TYPE_INODE | + FSNOTIFY_OBJ_TYPE_VFSMOUNT))) { + spin_unlock(&conn->lock); + srcu_read_unlock(&fsnotify_mark_srcu, idx); + return NULL; + } +out: + srcu_read_unlock(&fsnotify_mark_srcu, idx); + return conn; +} + /* * Add mark into proper place in given list of marks. 
These marks may be used * for the fsnotify backend to determine which event types should be delivered @@ -360,7 +437,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, { struct fsnotify_mark *lmark, *last = NULL; struct fsnotify_mark_connector *conn; - struct fsnotify_mark_connector **connp; + struct fsnotify_mark_connector __rcu **connp; int cmp; int err = 0; @@ -370,21 +447,20 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, connp = &inode->i_fsnotify_marks; else connp = &real_mount(mnt)->mnt_fsnotify_marks; - - if (!*connp) { +restart: + spin_lock(&mark->lock); + conn = fsnotify_grab_connector(connp); + if (!conn) { + spin_unlock(&mark->lock); err = fsnotify_attach_connector_to_object(connp, inode, mnt); if (err) return err; + goto restart; } - spin_lock(&mark->lock); - conn = *connp; - spin_lock(&conn->lock); /* is mark the first mark? */ if (hlist_empty(&conn->list)) { hlist_add_head_rcu(&mark->obj_list, &conn->list); - if (inode) - igrab(inode); goto added; } @@ -486,15 +562,17 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, * Given a list of marks, find the mark associated with given group. If found * take a reference to that mark and return it, else return NULL. */ -struct fsnotify_mark *fsnotify_find_mark(struct fsnotify_mark_connector *conn, - struct fsnotify_group *group) +struct fsnotify_mark *fsnotify_find_mark( + struct fsnotify_mark_connector __rcu **connp, + struct fsnotify_group *group) { + struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark; + conn = fsnotify_grab_connector(connp); if (!conn) return NULL; - spin_lock(&conn->lock); hlist_for_each_entry(mark, &conn->list, obj_list) { if (mark->group == group) { fsnotify_get_mark(mark); @@ -572,26 +650,20 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group) } } -void fsnotify_destroy_marks(struct fsnotify_mark_connector *conn) +/* Destroy all marks attached to inode / vfsmount */ +void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp) { + struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark; - if (!conn) - return; - - while (1) { + while ((conn = fsnotify_grab_connector(connp))) { /* * We have to be careful since we can race with e.g. - * fsnotify_clear_marks_by_group() and once we drop 'lock', - * mark can get removed from the obj_list and destroyed. But - * we are holding mark reference so mark cannot be freed and - * calling fsnotify_destroy_mark() more than once is fine. + * fsnotify_clear_marks_by_group() and once we drop the list + * lock, mark can get removed from the obj_list and destroyed. + * But we are holding mark reference so mark cannot be freed + * and calling fsnotify_destroy_mark() more than once is fine. 
*/ - spin_lock(&conn->lock); - if (hlist_empty(&conn->list)) { - spin_unlock(&conn->lock); - break; - } mark = hlist_entry(conn->list.first, struct fsnotify_mark, obj_list); fsnotify_get_mark(mark); diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 26da5c209944..dd5f3fcbccfb 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -48,5 +48,5 @@ struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, { struct mount *m = real_mount(mnt); - return fsnotify_find_mark(m->mnt_fsnotify_marks, group); + return fsnotify_find_mark(&m->mnt_fsnotify_marks, group); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 66e52342be2d..c0b6150c5fcc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -647,7 +647,7 @@ struct inode { #ifdef CONFIG_FSNOTIFY __u32 i_fsnotify_mask; /* all events this inode cares about */ - struct fsnotify_mark_connector *i_fsnotify_marks; + struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif #if IS_ENABLED(CONFIG_FS_ENCRYPTION) diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 02c6fac652a4..84d71b6f75f6 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -197,8 +197,8 @@ struct fsnotify_group { /* * Inode / vfsmount point to this structure which tracks all marks attached to * the inode / vfsmount. The reference to inode / vfsmount is held by this - * structure whenever the list is non-empty. The structure is freed only when - * inode / vfsmount gets freed. + * structure. We destroy this structure when there are no more marks attached + * to it. The structure is protected by fsnotify_mark_srcu. */ struct fsnotify_mark_connector { spinlock_t lock; @@ -209,7 +209,11 @@ struct fsnotify_mark_connector { struct inode *inode; struct vfsmount *mnt; }; - struct hlist_head list; + union { + struct hlist_head list; + /* Used listing heads to free after srcu period expires */ + struct fsnotify_mark_connector *destroy_next; + }; }; /* @@ -361,7 +365,6 @@ extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group); /* run all the marks in a group, and clear all of the marks attached to given object type */ extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags); -extern void fsnotify_connector_free(struct fsnotify_mark_connector **connp); extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); extern void fsnotify_unmount_inodes(struct super_block *sb); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index bf7b7ca295d0..d383c33540af 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1597,8 +1597,7 @@ static inline void handle_one(const struct inode *inode) struct audit_tree_refs *p; struct audit_chunk *chunk; int count; - if (likely(!inode->i_fsnotify_marks || - hlist_empty(&inode->i_fsnotify_marks->list))) + if (likely(!inode->i_fsnotify_marks)) return; context = current->audit_context; p = context->trees; @@ -1641,8 +1640,7 @@ retry: seq = read_seqbegin(&rename_lock); for(;;) { struct inode *inode = d_backing_inode(d); - if (inode && unlikely(inode->i_fsnotify_marks && - !hlist_empty(&inode->i_fsnotify_marks->list))) { + if (inode && unlikely(inode->i_fsnotify_marks)) { struct audit_chunk *chunk; chunk = audit_tree_lookup(inode); if (chunk) { -- cgit v1.2.3 From 6b3f05d24d355f50f3d9814304650fcab0efb482 Mon Sep 17 00:00:00 
2001 From: Jan Kara Date: Wed, 21 Dec 2016 12:15:30 +0100 Subject: fsnotify: Detach mark from object list when last reference is dropped Instead of removing mark from object list from fsnotify_detach_mark(), remove the mark when last reference to the mark is dropped. This will allow fanotify to wait for userspace response to event without having to hold onto fsnotify_mark_srcu. To avoid pinning inodes by elevated refcount (and thus e.g. delaying file deletion) while someone holds mark reference, we detach connector from the object also from fsnotify_destroy_marks() and not only after removing last mark from the list as it was now. Reviewed-by: Miklos Szeredi Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/mark.c | 147 +++++++++++++++++++++++---------------- include/linux/fsnotify_backend.h | 4 +- kernel/audit_tree.c | 31 ++++----- 3 files changed, 105 insertions(+), 77 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/mark.c b/fs/notify/mark.c index df66d708a7ec..21c7791362c8 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -49,7 +49,13 @@ * * A list of notification marks relating to inode / mnt is contained in * fsnotify_mark_connector. That structure is alive as long as there are any - * marks in the list and is also protected by fsnotify_mark_srcu. + * marks in the list and is also protected by fsnotify_mark_srcu. A mark gets + * detached from fsnotify_mark_connector when last reference to the mark is + * dropped. Thus having mark reference is enough to protect mark->connector + * pointer and to make sure fsnotify_mark_connector cannot disappear. Also + * because we remove mark from g_list before dropping mark reference associated + * with that, any mark found through g_list is guaranteed to have + * mark->connector set until we drop group->mark_mutex. * * LIFETIME: * Inode marks survive between when they are added to an inode and when their @@ -103,26 +109,16 @@ void fsnotify_get_mark(struct fsnotify_mark *mark) atomic_inc(&mark->refcnt); } -void fsnotify_put_mark(struct fsnotify_mark *mark) -{ - if (atomic_dec_and_test(&mark->refcnt)) { - spin_lock(&destroy_lock); - list_add(&mark->g_list, &destroy_list); - spin_unlock(&destroy_lock); - queue_delayed_work(system_unbound_wq, &reaper_work, - FSNOTIFY_REAPER_DELAY); - } -} - static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { u32 new_mask = 0; struct fsnotify_mark *mark; assert_spin_locked(&conn->lock); - hlist_for_each_entry(mark, &conn->list, obj_list) - new_mask |= mark->mask; - + hlist_for_each_entry(mark, &conn->list, obj_list) { + if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) + new_mask |= mark->mask; + } if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) conn->inode->i_fsnotify_mask = new_mask; else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) @@ -131,8 +127,9 @@ static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) /* * Calculate mask of events for a list of marks. The caller must make sure - * connector cannot disappear under us (usually by holding a mark->lock or - * mark->group->mark_mutex for a mark on this list). + * connector and connector->inode cannot disappear under us. Callers achieve + * this by holding a mark->lock or mark->group->mark_mutex for a mark on this + * list. 
*/ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { @@ -164,7 +161,6 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work) } } - static struct inode *fsnotify_detach_connector_from_object( struct fsnotify_mark_connector *conn) { @@ -187,14 +183,34 @@ static struct inode *fsnotify_detach_connector_from_object( return inode; } -static struct inode *fsnotify_detach_from_object(struct fsnotify_mark *mark) +static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) +{ + if (mark->group) + fsnotify_put_group(mark->group); + mark->free_mark(mark); +} + +void fsnotify_put_mark(struct fsnotify_mark *mark) { struct fsnotify_mark_connector *conn; struct inode *inode = NULL; bool free_conn = false; + /* Catch marks that were actually never attached to object */ + if (!mark->connector) { + if (atomic_dec_and_test(&mark->refcnt)) + fsnotify_final_mark_destroy(mark); + return; + } + + /* + * We have to be careful so that traversals of obj_list under lock can + * safely grab mark reference. + */ + if (!atomic_dec_and_lock(&mark->refcnt, &mark->connector->lock)) + return; + conn = mark->connector; - spin_lock(&conn->lock); hlist_del_init_rcu(&mark->obj_list); if (hlist_empty(&conn->list)) { inode = fsnotify_detach_connector_from_object(conn); @@ -205,6 +221,8 @@ static struct inode *fsnotify_detach_from_object(struct fsnotify_mark *mark) mark->connector = NULL; spin_unlock(&conn->lock); + iput(inode); + if (free_conn) { spin_lock(&destroy_lock); conn->destroy_next = connector_destroy_list; @@ -212,20 +230,31 @@ static struct inode *fsnotify_detach_from_object(struct fsnotify_mark *mark) spin_unlock(&destroy_lock); queue_work(system_unbound_wq, &connector_reaper_work); } - - return inode; + /* + * Note that we didn't update flags telling whether inode cares about + * what's happening with children. We update these flags from + * __fsnotify_parent() lazily when next event happens on one of our + * children. + */ + spin_lock(&destroy_lock); + list_add(&mark->g_list, &destroy_list); + spin_unlock(&destroy_lock); + queue_delayed_work(system_unbound_wq, &reaper_work, + FSNOTIFY_REAPER_DELAY); } /* - * Remove mark from inode / vfsmount list, group list, drop inode reference - * if we got one. + * Mark mark as detached, remove it from group list. Mark still stays in object + * list until its last reference is dropped. Note that we rely on mark being + * removed from group list before corresponding reference to it is dropped. In + * particular we rely on mark->connector being valid while we hold + * group->mark_mutex if we found the mark through g_list. * * Must be called with group->mark_mutex held. The caller must either hold * reference to the mark or be protected by fsnotify_mark_srcu. */ void fsnotify_detach_mark(struct fsnotify_mark *mark) { - struct inode *inode = NULL; struct fsnotify_group *group = mark->group; WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex)); @@ -234,31 +263,15 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)); spin_lock(&mark->lock); - /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { spin_unlock(&mark->lock); return; } - mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; - - inode = fsnotify_detach_from_object(mark); - - /* - * Note that we didn't update flags telling whether inode cares about - * what's happening with children. 
We update these flags from - * __fsnotify_parent() lazily when next event happens on one of our - * children. - */ - list_del_init(&mark->g_list); - spin_unlock(&mark->lock); - if (inode) - iput(inode); - atomic_dec(&group->num_marks); /* Drop mark reference acquired in fsnotify_add_mark_locked() */ @@ -458,7 +471,9 @@ restart: hlist_for_each_entry(lmark, &conn->list, obj_list) { last = lmark; - if ((lmark->group == mark->group) && !allow_dups) { + if ((lmark->group == mark->group) && + (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) && + !allow_dups) { err = -EEXIST; goto out_err; } @@ -509,7 +524,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, mark->group = group; list_add(&mark->g_list, &group->marks_list); atomic_inc(&group->num_marks); - fsnotify_get_mark(mark); /* for i_list and g_list */ + fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); ret = fsnotify_add_mark_list(mark, inode, mnt, allow_dups); @@ -557,7 +572,8 @@ struct fsnotify_mark *fsnotify_find_mark( return NULL; hlist_for_each_entry(mark, &conn->list, obj_list) { - if (mark->group == group) { + if (mark->group == group && + (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { fsnotify_get_mark(mark); spin_unlock(&conn->lock); return mark; @@ -637,23 +653,38 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group) void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp) { struct fsnotify_mark_connector *conn; - struct fsnotify_mark *mark; + struct fsnotify_mark *mark, *old_mark = NULL; + struct inode *inode; - while ((conn = fsnotify_grab_connector(connp))) { - /* - * We have to be careful since we can race with e.g. - * fsnotify_clear_marks_by_group() and once we drop the list - * lock, mark can get removed from the obj_list and destroyed. - * But we are holding mark reference so mark cannot be freed - * and calling fsnotify_destroy_mark() more than once is fine. - */ - mark = hlist_entry(conn->list.first, struct fsnotify_mark, - obj_list); + conn = fsnotify_grab_connector(connp); + if (!conn) + return; + /* + * We have to be careful since we can race with e.g. + * fsnotify_clear_marks_by_group() and once we drop the conn->lock, the + * list can get modified. However we are holding mark reference and + * thus our mark cannot be removed from obj_list so we can continue + * iteration after regaining conn->lock. + */ + hlist_for_each_entry(mark, &conn->list, obj_list) { fsnotify_get_mark(mark); spin_unlock(&conn->lock); + if (old_mark) + fsnotify_put_mark(old_mark); + old_mark = mark; fsnotify_destroy_mark(mark, mark->group); - fsnotify_put_mark(mark); + spin_lock(&conn->lock); } + /* + * Detach list from object now so that we don't pin inode until all + * mark references get dropped. It would lead to strange results such + * as delaying inode deletion or blocking unmount. 
+ */ + inode = fsnotify_detach_connector_from_object(conn); + spin_unlock(&conn->lock); + if (old_mark) + fsnotify_put_mark(old_mark); + iput(inode); } /* @@ -686,9 +717,7 @@ void fsnotify_mark_destroy_list(void) list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { list_del_init(&mark->g_list); - if (mark->group) - fsnotify_put_group(mark->group); - mark->free_mark(mark); + fsnotify_final_mark_destroy(mark); } } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 84d71b6f75f6..a483614b25d0 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -245,9 +245,9 @@ struct fsnotify_mark { struct list_head g_list; /* Protects inode / mnt pointers, flags, masks */ spinlock_t lock; - /* List of marks for inode / vfsmount [connector->lock] */ + /* List of marks for inode / vfsmount [connector->lock, mark ref] */ struct hlist_node obj_list; - /* Head of list of marks for an object [mark->lock, group->mark_mutex] */ + /* Head of list of marks for an object [mark ref] */ struct fsnotify_mark_connector *connector; /* Events types to ignore [mark->lock, group->mark_mutex] */ __u32 ignored_mask; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index c3b5fcb8eca4..2fa8d61b6fd2 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -172,27 +172,18 @@ static unsigned long inode_to_key(const struct inode *inode) /* * Function to return search key in our hash from chunk. Key 0 is special and * should never be present in the hash. - * - * Must be called with chunk->mark.lock held to protect from connector - * becoming NULL. */ -static unsigned long __chunk_to_key(struct audit_chunk *chunk) +static unsigned long chunk_to_key(struct audit_chunk *chunk) { - if (!chunk->mark.connector) + /* + * We have a reference to the mark so it should be attached to a + * connector. + */ + if (WARN_ON_ONCE(!chunk->mark.connector)) return 0; return (unsigned long)chunk->mark.connector->inode; } -static unsigned long chunk_to_key(struct audit_chunk *chunk) -{ - unsigned long key; - - spin_lock(&chunk->mark.lock); - key = __chunk_to_key(chunk); - spin_unlock(&chunk->mark.lock); - return key; -} - static inline struct list_head *chunk_hash(unsigned long key) { unsigned long n = key / L1_CACHE_BYTES; @@ -202,7 +193,7 @@ static inline struct list_head *chunk_hash(unsigned long key) /* hash_lock & entry->lock is held by caller */ static void insert_hash(struct audit_chunk *chunk) { - unsigned long key = __chunk_to_key(chunk); + unsigned long key = chunk_to_key(chunk); struct list_head *list; if (!(chunk->mark.flags & FSNOTIFY_MARK_FLAG_ATTACHED)) @@ -263,6 +254,10 @@ static void untag_chunk(struct node *p) mutex_lock(&entry->group->mark_mutex); spin_lock(&entry->lock); + /* + * mark_mutex protects mark from getting detached and thus also from + * mark->connector->inode getting NULL. + */ if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { spin_unlock(&entry->lock); mutex_unlock(&entry->group->mark_mutex); @@ -423,6 +418,10 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) mutex_lock(&old_entry->group->mark_mutex); spin_lock(&old_entry->lock); + /* + * mark_mutex protects mark from getting detached and thus also from + * mark->connector->inode getting NULL. 
+ */ if (!(old_entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { /* old_entry is being shot, lets just lie */ spin_unlock(&old_entry->lock); -- cgit v1.2.3 From abc77577a669f424c5d0c185b9994f2621c52aa4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 10 Nov 2016 16:02:11 +0100 Subject: fsnotify: Provide framework for dropping SRCU lock in ->handle_event fanotify wants to drop fsnotify_mark_srcu lock when waiting for response from userspace so that the whole notification subsystem is not blocked during that time. This patch provides a framework for safely getting mark reference for a mark found in the object list which pins the mark in that list. We can then drop fsnotify_mark_srcu, wait for userspace response and then safely continue iteration of the object list once we reaquire fsnotify_mark_srcu. Reviewed-by: Miklos Szeredi Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fsnotify.h | 6 +++ fs/notify/group.c | 1 + fs/notify/mark.c | 82 ++++++++++++++++++++++++++++++++++++++++ include/linux/fsnotify_backend.h | 5 +++ 4 files changed, 94 insertions(+) (limited to 'include/linux') diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 2a92dc06198c..86383c7865c0 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -8,6 +8,12 @@ #include "../mount.h" +struct fsnotify_iter_info { + struct fsnotify_mark *inode_mark; + struct fsnotify_mark *vfsmount_mark; + int srcu_idx; +}; + /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); diff --git a/fs/notify/group.c b/fs/notify/group.c index 0fb4aadcc19f..79439cdf16e0 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -126,6 +126,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) /* set to 0 when there a no external references to this group */ atomic_set(&group->refcnt, 1); atomic_set(&group->num_marks, 0); + atomic_set(&group->user_waits, 0); spin_lock_init(&group->notification_lock); INIT_LIST_HEAD(&group->notification_list); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index f916b71c9139..c4f43a6acd9a 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -109,6 +109,16 @@ void fsnotify_get_mark(struct fsnotify_mark *mark) atomic_inc(&mark->refcnt); } +/* + * Get mark reference when we found the mark via lockless traversal of object + * list. Mark can be already removed from the list by now and on its way to be + * destroyed once SRCU period ends. + */ +static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark) +{ + return atomic_inc_not_zero(&mark->refcnt); +} + static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { u32 new_mask = 0; @@ -243,6 +253,72 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) FSNOTIFY_REAPER_DELAY); } +bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info) +{ + struct fsnotify_group *group; + + if (WARN_ON_ONCE(!iter_info->inode_mark && !iter_info->vfsmount_mark)) + return false; + + if (iter_info->inode_mark) + group = iter_info->inode_mark->group; + else + group = iter_info->vfsmount_mark->group; + + /* + * Since acquisition of mark reference is an atomic op as well, we can + * be sure this inc is seen before any effect of refcount increment. 
+ */ + atomic_inc(&group->user_waits); + + if (iter_info->inode_mark) { + /* This can fail if mark is being removed */ + if (!fsnotify_get_mark_safe(iter_info->inode_mark)) + goto out_wait; + } + if (iter_info->vfsmount_mark) { + if (!fsnotify_get_mark_safe(iter_info->vfsmount_mark)) + goto out_inode; + } + + /* + * Now that both marks are pinned by refcount in the inode / vfsmount + * lists, we can drop SRCU lock, and safely resume the list iteration + * once userspace returns. + */ + srcu_read_unlock(&fsnotify_mark_srcu, iter_info->srcu_idx); + + return true; +out_inode: + if (iter_info->inode_mark) + fsnotify_put_mark(iter_info->inode_mark); +out_wait: + if (atomic_dec_and_test(&group->user_waits) && group->shutdown) + wake_up(&group->notification_waitq); + return false; +} + +void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info) +{ + struct fsnotify_group *group = NULL; + + iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); + if (iter_info->inode_mark) { + group = iter_info->inode_mark->group; + fsnotify_put_mark(iter_info->inode_mark); + } + if (iter_info->vfsmount_mark) { + group = iter_info->vfsmount_mark->group; + fsnotify_put_mark(iter_info->vfsmount_mark); + } + /* + * We abuse notification_waitq on group shutdown for waiting for all + * marks pinned when waiting for userspace. + */ + if (atomic_dec_and_test(&group->user_waits) && group->shutdown) + wake_up(&group->notification_waitq); +} + /* * Mark mark as detached, remove it from group list. Mark still stays in object * list until its last reference is dropped. Note that we rely on mark being @@ -647,6 +723,12 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group) fsnotify_free_mark(mark); fsnotify_put_mark(mark); } + /* + * Some marks can still be pinned when waiting for response from + * userspace. Wait for those now. fsnotify_prepare_user_wait() will + * not succeed now so this wait is race-free. + */ + wait_event(group->notification_waitq, !atomic_read(&group->user_waits)); } /* Destroy all marks attached to inode / vfsmount */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index a483614b25d0..5bb6d988b9f6 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -80,6 +80,7 @@ struct fsnotify_event; struct fsnotify_mark; struct fsnotify_event_private_data; struct fsnotify_fname; +struct fsnotify_iter_info; /* * Each group much define these ops. 
The fsnotify infrastructure will call @@ -163,6 +164,8 @@ struct fsnotify_group { struct fsnotify_event *overflow_event; /* Event we queue when the * notification list is too * full */ + atomic_t user_waits; /* Number of tasks waiting for user + * response */ /* groups can define private fields here or use the void *private */ union { @@ -368,6 +371,8 @@ extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, un extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); extern void fsnotify_unmount_inodes(struct super_block *sb); +extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info); +extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info); /* put here because inotify does some weird stuff when destroying watches */ extern void fsnotify_init_event(struct fsnotify_event *event, -- cgit v1.2.3 From 9385a84d7e1f658bb2d96ab798393e4b16268aaa Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 10 Nov 2016 17:51:50 +0100 Subject: fsnotify: Pass fsnotify_iter_info into handle_event handler Pass fsnotify_iter_info into ->handle_event() handler so that it can release and reacquire SRCU lock via fsnotify_prepare_user_wait() and fsnotify_finish_user_wait() functions. These functions also make sure current marks are appropriately pinned so that iteration protected by srcu in fsnotify() stays safe. Reviewed-by: Miklos Szeredi Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/dnotify/dnotify.c | 3 ++- fs/notify/fanotify/fanotify.c | 3 ++- fs/notify/fsnotify.c | 19 +++++++++++++------ fs/notify/inotify/inotify.h | 3 ++- fs/notify/inotify/inotify_fsnotify.c | 3 ++- fs/notify/inotify/inotify_user.c | 2 +- include/linux/fsnotify_backend.h | 3 ++- kernel/audit_fsnotify.c | 3 ++- kernel/audit_tree.c | 3 ++- kernel/audit_watch.c | 3 ++- 10 files changed, 30 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 41b2a070761c..aba165ae3397 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -85,7 +85,8 @@ static int dnotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie) + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info) { struct dnotify_mark *dn_mark; struct dnotify_struct *dn; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index e5f7e47de68e..ec80a51cbb3d 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -174,7 +174,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *fanotify_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie) + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info) { int ret = 0; struct fanotify_event_info *event; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index d512ef9f75fc..c4afb6a88268 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -127,7 +127,8 @@ static int send_to_group(struct inode *to_tell, struct fsnotify_mark *vfsmount_mark, __u32 mask, const void *data, int data_is, u32 cookie, - const unsigned char *file_name) + const unsigned char *file_name, + struct fsnotify_iter_info *iter_info) { struct fsnotify_group *group = NULL; 
__u32 inode_test_mask = 0; @@ -178,7 +179,7 @@ static int send_to_group(struct inode *to_tell, return group->ops->handle_event(group, to_tell, inode_mark, vfsmount_mark, mask, data, data_is, - file_name, cookie); + file_name, cookie, iter_info); } /* @@ -194,8 +195,9 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; struct fsnotify_group *inode_group, *vfsmount_group; struct fsnotify_mark_connector *inode_conn, *vfsmount_conn; + struct fsnotify_iter_info iter_info; struct mount *mnt; - int idx, ret = 0; + int ret = 0; /* global tests shouldn't care about events on child only the specific event */ __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); @@ -224,7 +226,7 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, !(mnt && test_mask & mnt->mnt_fsnotify_mask)) return 0; - idx = srcu_read_lock(&fsnotify_mark_srcu); + iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); if ((mask & FS_MODIFY) || (test_mask & to_tell->i_fsnotify_mask)) { @@ -284,8 +286,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, vfsmount_mark = NULL; } } + + iter_info.inode_mark = inode_mark; + iter_info.vfsmount_mark = vfsmount_mark; + ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask, - data, data_is, cookie, file_name); + data, data_is, cookie, file_name, + &iter_info); if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) goto out; @@ -299,7 +306,7 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, } ret = 0; out: - srcu_read_unlock(&fsnotify_mark_srcu, idx); + srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx); return ret; } diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index 7c461fd49c4c..7a966f456269 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -27,7 +27,8 @@ extern int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie); + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info); extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index f310d8368a2d..ccd6a4055e0c 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -68,7 +68,8 @@ int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie) + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info) { struct inotify_inode_mark *i_mark; struct inotify_event_info *event; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 43cbd1b178c9..05b268ec0f5f 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -474,7 +474,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, /* Queue ignore event for the watch */ inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED, - NULL, FSNOTIFY_EVENT_NONE, NULL, 0); + NULL, FSNOTIFY_EVENT_NONE, NULL, 0, NULL); i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); /* remove this mark from the idr */ diff --git a/include/linux/fsnotify_backend.h 
b/include/linux/fsnotify_backend.h index 5bb6d988b9f6..744a4b9076f9 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -99,7 +99,8 @@ struct fsnotify_ops { struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie); + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); void (*free_event)(struct fsnotify_event *event); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 7ea57e516029..e8b371ff1e91 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -168,7 +168,8 @@ static int audit_mark_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *dname, u32 cookie) + const unsigned char *dname, u32 cookie, + struct fsnotify_iter_info *iter_info) { struct audit_fsnotify_mark *audit_mark; const struct inode *inode = NULL; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 2fa8d61b6fd2..d59ed4c9037a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -989,7 +989,8 @@ static int audit_tree_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie) + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info) { return 0; } diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index f79e4658433d..6caaf087801f 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -472,7 +472,8 @@ static int audit_watch_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *dname, u32 cookie) + const unsigned char *dname, u32 cookie, + struct fsnotify_iter_info *iter_info) { const struct inode *inode; struct audit_parent *parent; -- cgit v1.2.3 From 66d2b81bcb92c14b22a56a9ff936f2b40accc83c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 21 Dec 2016 16:03:59 +0100 Subject: fsnotify: Remove fsnotify_set_mark_{,ignored_}mask_locked() These helpers are now only a simple assignment and just obfuscate what is going on. Remove them. 
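The resulting open-coded pattern, as a simplified userspace sketch (pthread mutex in place of mark->lock; the caller logic mirrors dnotify_recalc_inode_mask() in the diff below):

	#include <pthread.h>
	#include <stdint.h>
	#include <stdbool.h>

	struct mark {
		pthread_mutex_t lock;
		uint32_t mask;
	};

	/* Returns true if the connector mask must be recalculated. */
	static bool set_mark_mask(struct mark *m, uint32_t new_mask)
	{
		bool changed;

		pthread_mutex_lock(&m->lock);	/* real callers already hold it */
		changed = (m->mask != new_mask);
		m->mask = new_mask;	/* was fsnotify_set_mark_mask_locked() */
		pthread_mutex_unlock(&m->lock);
		return changed;
	}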
Reviewed-by: Miklos Szeredi Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/dnotify/dnotify.c | 9 +++------ fs/notify/fanotify/fanotify_user.c | 9 ++++----- fs/notify/inotify/inotify_user.c | 6 ++---- fs/notify/mark.c | 14 -------------- include/linux/fsnotify_backend.h | 4 ---- 5 files changed, 9 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index aba165ae3397..5940c75541a7 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -52,7 +52,7 @@ struct dnotify_mark { */ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) { - __u32 new_mask, old_mask; + __u32 new_mask = 0; struct dnotify_struct *dn; struct dnotify_mark *dn_mark = container_of(fsn_mark, struct dnotify_mark, @@ -60,14 +60,11 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) assert_spin_locked(&fsn_mark->lock); - old_mask = fsn_mark->mask; - new_mask = 0; for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next) new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT); - fsnotify_set_mark_mask_locked(fsn_mark, new_mask); - - if (old_mask == new_mask) + if (fsn_mark->mask == new_mask) return; + fsn_mark->mask = new_mask; fsnotify_recalc_mask(fsn_mark->connector); } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index c5e69870287f..cf38a345032f 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -511,13 +511,12 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, tmask &= ~FAN_ONDIR; oldmask = fsn_mark->mask; - fsnotify_set_mark_mask_locked(fsn_mark, tmask); + fsn_mark->mask = tmask; } else { __u32 tmask = fsn_mark->ignored_mask & ~mask; if (flags & FAN_MARK_ONDIR) tmask &= ~FAN_ONDIR; - - fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); + fsn_mark->ignored_mask = tmask; } *destroy = !(fsn_mark->mask | fsn_mark->ignored_mask); spin_unlock(&fsn_mark->lock); @@ -599,13 +598,13 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, tmask |= FAN_ONDIR; oldmask = fsn_mark->mask; - fsnotify_set_mark_mask_locked(fsn_mark, tmask); + fsn_mark->mask = tmask; } else { __u32 tmask = fsn_mark->ignored_mask | mask; if (flags & FAN_MARK_ONDIR) tmask |= FAN_ONDIR; - fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); + fsn_mark->ignored_mask = tmask; if (flags & FAN_MARK_IGNORED_SURV_MODIFY) fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 05b268ec0f5f..69739b26c7e4 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -513,14 +513,12 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); spin_lock(&fsn_mark->lock); - old_mask = fsn_mark->mask; if (add) - fsnotify_set_mark_mask_locked(fsn_mark, (fsn_mark->mask | mask)); + fsn_mark->mask |= mask; else - fsnotify_set_mark_mask_locked(fsn_mark, mask); + fsn_mark->mask = mask; new_mask = fsn_mark->mask; - spin_unlock(&fsn_mark->lock); if (old_mask != new_mask) { diff --git a/fs/notify/mark.c b/fs/notify/mark.c index c4f43a6acd9a..ae33e9f91849 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -393,20 +393,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, fsnotify_free_mark(mark); } -void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) -{ - 
assert_spin_locked(&mark->lock); - - mark->mask = mask; -} - -void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask) -{ - assert_spin_locked(&mark->lock); - - mark->ignored_mask = mask; -} - /* * Sorting function for lists of fsnotify marks. * diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 744a4b9076f9..63354cd86a7b 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -347,10 +347,6 @@ extern void fsnotify_init_mark(struct fsnotify_mark *mark, void (*free_mark)(str extern struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, struct inode *inode); /* find (and take a reference) to a mark associated with group and vfsmount */ extern struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt); -/* set the ignored_mask of a mark */ -extern void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask); -/* set the mask of a mark (might pin the object into memory */ -extern void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask); /* attach the mark to both the group and the inode */ extern int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, struct inode *inode, struct vfsmount *mnt, int allow_dups); -- cgit v1.2.3 From 8920d2734d9a1b68e1b53d8c12b289773cdbd971 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 21 Dec 2016 16:13:54 +0100 Subject: fsnotify: Remove fsnotify_recalc_{inode|vfsmount}_mask() These helpers are just very thin wrappers now. Remove them. Reviewed-by: Miklos Szeredi Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify_user.c | 8 ++++---- fs/notify/inode_mark.c | 5 ----- fs/notify/inotify/inotify_user.c | 2 +- fs/notify/vfsmount_mark.c | 5 ----- include/linux/fsnotify_backend.h | 4 ---- 5 files changed, 5 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index cf38a345032f..24fa3f24b9ad 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -542,7 +542,7 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); if (removed & real_mount(mnt)->mnt_fsnotify_mask) - fsnotify_recalc_vfsmount_mask(mnt); + fsnotify_recalc_mask(real_mount(mnt)->mnt_fsnotify_marks); if (destroy_mark) fsnotify_detach_mark(fsn_mark); mutex_unlock(&group->mark_mutex); @@ -571,7 +571,7 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); if (removed & inode->i_fsnotify_mask) - fsnotify_recalc_inode_mask(inode); + fsnotify_recalc_mask(inode->i_fsnotify_marks); if (destroy_mark) fsnotify_detach_mark(fsn_mark); mutex_unlock(&group->mark_mutex); @@ -656,7 +656,7 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); if (added & ~real_mount(mnt)->mnt_fsnotify_mask) - fsnotify_recalc_vfsmount_mask(mnt); + fsnotify_recalc_mask(real_mount(mnt)->mnt_fsnotify_marks); mutex_unlock(&group->mark_mutex); fsnotify_put_mark(fsn_mark); @@ -693,7 +693,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); if (added & ~inode->i_fsnotify_mask) - fsnotify_recalc_inode_mask(inode); + 
fsnotify_recalc_mask(inode->i_fsnotify_marks); mutex_unlock(&group->mark_mutex); fsnotify_put_mark(fsn_mark); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index b9370316727e..2188329da3c2 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -30,11 +30,6 @@ #include "../internal.h" -void fsnotify_recalc_inode_mask(struct inode *inode) -{ - fsnotify_recalc_mask(inode->i_fsnotify_marks); -} - /* * Given a group clear all of the inode marks associated with that group. */ diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 69739b26c7e4..b3b2a464a03c 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -529,7 +529,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, /* update the inode with this new fsn_mark */ if (dropped || do_inode) - fsnotify_recalc_inode_mask(inode); + fsnotify_recalc_mask(inode->i_fsnotify_marks); } diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index dd5f3fcbccfb..41bff46576c2 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -34,11 +34,6 @@ void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT); } -void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt) -{ - fsnotify_recalc_mask(real_mount(mnt)->mnt_fsnotify_marks); -} - /* * given a group and vfsmount, find the mark associated with that combination. * if found take a reference to that mark and return it, else return NULL diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 63354cd86a7b..6d09c6ff9810 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -338,10 +338,6 @@ extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group /* Calculate mask of events for a list of marks */ extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); -/* run all marks associated with a vfsmount and update mnt->mnt_fsnotify_mask */ -extern void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt); -/* run all marks associated with an inode and update inode->i_fsnotify_mask */ -extern void fsnotify_recalc_inode_mask(struct inode *inode); extern void fsnotify_init_mark(struct fsnotify_mark *mark, void (*free_mark)(struct fsnotify_mark *mark)); /* find (and take a reference) to a mark associated with group and inode */ extern struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, struct inode *inode); -- cgit v1.2.3 From 416bcdbcbbb4800f11f03e8baf570f9996219f67 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 21 Dec 2016 16:20:32 +0100 Subject: fsnotify: Inline fsnotify_clear_{inode|vfsmount}_mark_group() Inline these helpers as they are very thin. We still keep them as we don't want to expose details about how list type is determined. Reviewed-by: Miklos Szeredi Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/inode_mark.c | 8 -------- fs/notify/vfsmount_mark.c | 5 ----- include/linux/fsnotify_backend.h | 14 ++++++++++---- 3 files changed, 10 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 2188329da3c2..bdc15f736082 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -30,14 +30,6 @@ #include "../internal.h" -/* - * Given a group clear all of the inode marks associated with that group. 
- */ -void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) -{ - fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_OBJ_TYPE_INODE); -} - /* * given a group and inode, find the mark associated with that combination. * if found take a reference to that mark and return it, else return NULL diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 41bff46576c2..1e692c56deec 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -29,11 +29,6 @@ #include #include "fsnotify.h" -void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) -{ - fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT); -} - /* * given a group and vfsmount, find the mark associated with that combination. * if found take a reference to that mark and return it, else return NULL diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 6d09c6ff9810..700b4fa991d4 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -355,12 +355,18 @@ extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, extern void fsnotify_detach_mark(struct fsnotify_mark *mark); /* free mark */ extern void fsnotify_free_mark(struct fsnotify_mark *mark); -/* run all the marks in a group, and clear all of the vfsmount marks */ -extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group); -/* run all the marks in a group, and clear all of the inode marks */ -extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group); /* run all the marks in a group, and clear all of the marks attached to given object type */ extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags); +/* run all the marks in a group, and clear all of the vfsmount marks */ +static inline void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) +{ + fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT); +} +/* run all the marks in a group, and clear all of the inode marks */ +static inline void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) +{ + fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_OBJ_TYPE_INODE); +} extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); extern void fsnotify_unmount_inodes(struct super_block *sb); -- cgit v1.2.3 From 18f2e0d3a43641889ac2ba9d7508d47359eec063 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 4 Jan 2017 10:33:18 +0100 Subject: fsnotify: Rename fsnotify_clear_marks_by_group_flags() The _flags() suffix in the function name was more confusing than explanatory, so just remove it. Also rename the argument from 'flags' to 'type' to better explain what the function expects. Reviewed-by: Miklos Szeredi Suggested-by: Amir Goldstein Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/mark.c | 12 +++++------- include/linux/fsnotify_backend.h | 6 +++--- 2 files changed, 8 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/mark.c b/fs/notify/mark.c index ae33e9f91849..89656abbf4f8 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -67,7 +67,7 @@ * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes) * - Something explicitly requests that it be removed. (fsnotify_destroy_mark) * - The fsnotify_group associated with the mark is going away and all such marks - * need to be cleaned up.
(fsnotify_clear_marks_by_group) + * need to be cleaned up. (fsnotify_detach_group_marks) * * This has the very interesting property of being able to run concurrently with * any (or all) other directions. @@ -645,11 +645,9 @@ struct fsnotify_mark *fsnotify_find_mark( return NULL; } -/* - * clear any marks in a group in which mark->flags & flags is true - */ -void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, - unsigned int flags) +/* Clear any marks in a group with given type */ +void fsnotify_clear_marks_by_group(struct fsnotify_group *group, + unsigned int type) { struct fsnotify_mark *lmark, *mark; LIST_HEAD(to_free); @@ -665,7 +663,7 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, */ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { - if (mark->connector->flags & flags) + if (mark->connector->flags & type) list_move(&mark->g_list, &to_free); } mutex_unlock(&group->mark_mutex); diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 700b4fa991d4..d6bbd5acdac1 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -356,16 +356,16 @@ extern void fsnotify_detach_mark(struct fsnotify_mark *mark); /* free mark */ extern void fsnotify_free_mark(struct fsnotify_mark *mark); /* run all the marks in a group, and clear all of the marks attached to given object type */ -extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags); +extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int type); /* run all the marks in a group, and clear all of the vfsmount marks */ static inline void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) { - fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT); + fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT); } /* run all the marks in a group, and clear all of the inode marks */ static inline void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) { - fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_OBJ_TYPE_INODE); + fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_INODE); } extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); -- cgit v1.2.3 From 2e37c6ca8d76c362e844c0cf3ebe8ba2e27940cb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 4 Jan 2017 10:51:58 +0100 Subject: fsnotify: Remove fsnotify_detach_group_marks() The function is already mostly contained in what fsnotify_clear_marks_by_group() does. Just update that function to not select marks when all of them should be destroyed and remove fsnotify_detach_group_marks(). 
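After this change, group teardown reduces to a single call plus the wait for marks pinned by in-flight permission events, roughly (condensed from the fsnotify_destroy_group() hunk below):

	/* in fsnotify_destroy_group() */
	fsnotify_group_stop_queueing(group);
	/* Clear all marks for this group and queue them for destruction */
	fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_ALL_TYPES);
	/* marks pinned by pending userspace responses are released last */
	wait_event(group->notification_waitq,
		   !atomic_read(&group->user_waits));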
Reviewed-by: Miklos Szeredi Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fsnotify.h | 2 -- fs/notify/group.c | 9 +++++++- fs/notify/mark.c | 45 +++++++++------------------------------- include/linux/fsnotify_backend.h | 2 ++ 4 files changed, 20 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 86383c7865c0..3ec593c32684 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -40,8 +40,6 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks); } -/* prepare for freeing all marks associated with given group */ -extern void fsnotify_detach_group_marks(struct fsnotify_group *group); /* Wait until all marks queued for destruction are destroyed */ extern void fsnotify_wait_marks_destroyed(void); diff --git a/fs/notify/group.c b/fs/notify/group.c index 79439cdf16e0..32357534de18 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -67,7 +67,14 @@ void fsnotify_destroy_group(struct fsnotify_group *group) fsnotify_group_stop_queueing(group); /* Clear all marks for this group and queue them for destruction */ - fsnotify_detach_group_marks(group); + fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_ALL_TYPES); + + /* + * Some marks can still be pinned when waiting for response from + * userspace. Wait for those now. fsnotify_prepare_user_wait() will + * not succeed now so this wait is race-free. + */ + wait_event(group->notification_waitq, !atomic_read(&group->user_waits)); /* * Wait until all marks get really destroyed. We could actually destroy diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 89656abbf4f8..9f3364ef19d3 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -67,7 +67,7 @@ * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes) * - Something explicitly requests that it be removed. (fsnotify_destroy_mark) * - The fsnotify_group associated with the mark is going away and all such marks - * need to be cleaned up. (fsnotify_detach_group_marks) + * need to be cleaned up. (fsnotify_clear_marks_by_group) * * This has the very interesting property of being able to run concurrently with * any (or all) other directions. @@ -651,7 +651,13 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, { struct fsnotify_mark *lmark, *mark; LIST_HEAD(to_free); + struct list_head *head = &to_free; + /* Skip selection step if we want to clear all marks. */ + if (type == FSNOTIFY_OBJ_ALL_TYPES) { + head = &group->marks_list; + goto clear; + } /* * We have to be really careful here. Anytime we drop mark_mutex, e.g. * fsnotify_clear_marks_by_inode() can come and free marks. Even in our @@ -668,13 +674,14 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, } mutex_unlock(&group->mark_mutex); +clear: while (1) { mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); - if (list_empty(&to_free)) { + if (list_empty(head)) { mutex_unlock(&group->mark_mutex); break; } - mark = list_first_entry(&to_free, struct fsnotify_mark, g_list); + mark = list_first_entry(head, struct fsnotify_mark, g_list); fsnotify_get_mark(mark); fsnotify_detach_mark(mark); mutex_unlock(&group->mark_mutex); @@ -683,38 +690,6 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, } } -/* - * Given a group, prepare for freeing all the marks associated with that group. 
- * The marks are attached to the list of marks prepared for destruction, the - * caller is responsible for freeing marks in that list after SRCU period has - * ended. - */ -void fsnotify_detach_group_marks(struct fsnotify_group *group) -{ - struct fsnotify_mark *mark; - - while (1) { - mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); - if (list_empty(&group->marks_list)) { - mutex_unlock(&group->mark_mutex); - break; - } - mark = list_first_entry(&group->marks_list, - struct fsnotify_mark, g_list); - fsnotify_get_mark(mark); - fsnotify_detach_mark(mark); - mutex_unlock(&group->mark_mutex); - fsnotify_free_mark(mark); - fsnotify_put_mark(mark); - } - /* - * Some marks can still be pinned when waiting for response from - * userspace. Wait for those now. fsnotify_prepare_user_wait() will - * not succeed now so this wait is race-free. - */ - wait_event(group->notification_waitq, !atomic_read(&group->user_waits)); -} - /* Destroy all marks attached to inode / vfsmount */ void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp) { diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index d6bbd5acdac1..7287cba42a66 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -208,6 +208,8 @@ struct fsnotify_mark_connector { spinlock_t lock; #define FSNOTIFY_OBJ_TYPE_INODE 0x01 #define FSNOTIFY_OBJ_TYPE_VFSMOUNT 0x02 +#define FSNOTIFY_OBJ_ALL_TYPES (FSNOTIFY_OBJ_TYPE_INODE | \ + FSNOTIFY_OBJ_TYPE_VFSMOUNT) unsigned int flags; /* Type of object [lock] */ union { /* Object pointer [lock] */ struct inode *inode; -- cgit v1.2.3 From b1362edfe15b20edd3d116cec521aa420b7afb98 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 21 Dec 2016 16:28:45 +0100 Subject: fsnotify: Remove fsnotify_find_{inode|vfsmount}_mark() These are very thin wrappers, just remove them. Drop fs/notify/vfsmount_mark.c as it is empty now. Reviewed-by: Miklos Szeredi Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/Makefile | 2 +- fs/notify/dnotify/dnotify.c | 4 ++-- fs/notify/fanotify/fanotify_user.c | 12 ++++++----- fs/notify/fsnotify.h | 4 ---- fs/notify/inode_mark.c | 10 --------- fs/notify/inotify/inotify_user.c | 2 +- fs/notify/vfsmount_mark.c | 42 -------------------------------------- include/linux/fsnotify_backend.h | 8 ++++---- kernel/audit_tree.c | 3 ++- kernel/audit_watch.c | 2 +- 10 files changed, 18 insertions(+), 71 deletions(-) delete mode 100644 fs/notify/vfsmount_mark.c (limited to 'include/linux') diff --git a/fs/notify/Makefile b/fs/notify/Makefile index 96d3420d0242..ebb64a0282d1 100644 --- a/fs/notify/Makefile +++ b/fs/notify/Makefile @@ -1,5 +1,5 @@ obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \ - mark.o vfsmount_mark.o fdinfo.o + mark.o fdinfo.o obj-y += dnotify/ obj-y += inotify/ diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 5940c75541a7..b77d8d049e4d 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -157,7 +157,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) if (!S_ISDIR(inode->i_mode)) return; - fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode); + fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); if (!fsn_mark) return; dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); @@ -313,7 +313,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) mutex_lock(&dnotify_group->mark_mutex); /* add the new_fsn_mark or find an old one. 
*/ - fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode); + fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); if (fsn_mark) { dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); spin_lock(&fsn_mark->lock); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 24fa3f24b9ad..5a82bbb79f55 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -533,7 +533,8 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, int destroy_mark; mutex_lock(&group->mark_mutex); - fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); + fsn_mark = fsnotify_find_mark(&real_mount(mnt)->mnt_fsnotify_marks, + group); if (!fsn_mark) { mutex_unlock(&group->mark_mutex); return -ENOENT; @@ -562,7 +563,7 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, int destroy_mark; mutex_lock(&group->mark_mutex); - fsn_mark = fsnotify_find_inode_mark(group, inode); + fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); if (!fsn_mark) { mutex_unlock(&group->mark_mutex); return -ENOENT; @@ -578,7 +579,7 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, if (destroy_mark) fsnotify_free_mark(fsn_mark); - /* matches the fsnotify_find_inode_mark() */ + /* matches the fsnotify_find_mark() */ fsnotify_put_mark(fsn_mark); return 0; @@ -646,7 +647,8 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, __u32 added; mutex_lock(&group->mark_mutex); - fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); + fsn_mark = fsnotify_find_mark(&real_mount(mnt)->mnt_fsnotify_marks, + group); if (!fsn_mark) { fsn_mark = fanotify_add_new_mark(group, NULL, mnt); if (IS_ERR(fsn_mark)) { @@ -683,7 +685,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, return 0; mutex_lock(&group->mark_mutex); - fsn_mark = fsnotify_find_inode_mark(group, inode); + fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); if (!fsn_mark) { fsn_mark = fanotify_add_new_mark(group, inode, NULL); if (IS_ERR(fsn_mark)) { diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 3ec593c32684..bf012e8ecd14 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -24,10 +24,6 @@ extern struct srcu_struct fsnotify_mark_srcu; extern int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b); -/* Find mark belonging to given group in the list of marks */ -extern struct fsnotify_mark *fsnotify_find_mark( - struct fsnotify_mark_connector __rcu **connp, - struct fsnotify_group *group); /* Destroy all marks connected via given connector */ extern void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp); /* run the list of all marks associated with inode and destroy them */ diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index bdc15f736082..5cc317bad082 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -30,16 +30,6 @@ #include "../internal.h" -/* - * given a group and inode, find the mark associated with that combination. - * if found take a reference to that mark and return it, else return NULL - */ -struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, - struct inode *inode) -{ - return fsnotify_find_mark(&inode->i_fsnotify_marks, group); -} - /** * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. * @sb: superblock being unmounted. 
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index b3b2a464a03c..a5e4411362f2 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -506,7 +506,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, mask = inotify_arg_to_mask(arg); - fsn_mark = fsnotify_find_inode_mark(group, inode); + fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); if (!fsn_mark) return -ENOENT; diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c deleted file mode 100644 index 1e692c56deec..000000000000 --- a/fs/notify/vfsmount_mark.c +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (C) 2008 Red Hat, Inc., Eric Paris - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include "fsnotify.h" - -/* - * given a group and vfsmount, find the mark associated with that combination. - * if found take a reference to that mark and return it, else return NULL - */ -struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, - struct vfsmount *mnt) -{ - struct mount *m = real_mount(mnt); - - return fsnotify_find_mark(&m->mnt_fsnotify_marks, group); -} diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 7287cba42a66..2ef0e04c5a9d 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -341,10 +341,10 @@ extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group /* Calculate mask of events for a list of marks */ extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); extern void fsnotify_init_mark(struct fsnotify_mark *mark, void (*free_mark)(struct fsnotify_mark *mark)); -/* find (and take a reference) to a mark associated with group and inode */ -extern struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, struct inode *inode); -/* find (and take a reference) to a mark associated with group and vfsmount */ -extern struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt); +/* Find mark belonging to given group in the list of marks */ +extern struct fsnotify_mark *fsnotify_find_mark( + struct fsnotify_mark_connector __rcu **connp, + struct fsnotify_group *group); /* attach the mark to both the group and the inode */ extern int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, struct inode *inode, struct vfsmount *mnt, int allow_dups); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index d59ed4c9037a..3cc5b92de765 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -391,7 +391,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) struct node *p; int n; - old_entry = fsnotify_find_inode_mark(audit_tree_group, inode); + 
old_entry = fsnotify_find_mark(&inode->i_fsnotify_marks, + audit_tree_group); if (!old_entry) return create_chunk(inode, tree); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 6caaf087801f..956fa584c239 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -102,7 +102,7 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode) struct audit_parent *parent = NULL; struct fsnotify_mark *entry; - entry = fsnotify_find_inode_mark(audit_watch_group, inode); + entry = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_watch_group); if (entry) parent = container_of(entry, struct audit_parent, mark); -- cgit v1.2.3 From 7b1293234084ddb6469c4e9a5ef818f399b5786b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 21 Dec 2016 18:32:48 +0100 Subject: fsnotify: Add group pointer in fsnotify_init_mark() Currently we initialize mark->group only in fsnotify_add_mark_locked(). However we will need to access the fsnotify_ops of the corresponding group from fsnotify_put_mark() so we need mark->group initialized earlier. Do that in fsnotify_init_mark() which has the consequence that once fsnotify_init_mark() is called on a mark, the mark has to be destroyed by fsnotify_put_mark(). Reviewed-by: Miklos Szeredi Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/dnotify/dnotify.c | 5 ++--- fs/notify/fanotify/fanotify_user.c | 4 ++-- fs/notify/inotify/inotify_user.c | 5 ++--- fs/notify/mark.c | 17 ++++++++++------- include/linux/fsnotify_backend.h | 12 +++++++----- kernel/audit_fsnotify.c | 7 ++++--- kernel/audit_tree.c | 15 ++++++++------- kernel/audit_watch.c | 5 +++-- 8 files changed, 38 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index b77d8d049e4d..f9d500fd7b9a 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -305,7 +305,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) /* set up the new_fsn_mark and new_dn_mark */ new_fsn_mark = &new_dn_mark->fsn_mark; - fsnotify_init_mark(new_fsn_mark, dnotify_free_mark); + fsnotify_init_mark(new_fsn_mark, dnotify_group, dnotify_free_mark); new_fsn_mark->mask = mask; new_dn_mark->dn = NULL; @@ -318,8 +318,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); spin_lock(&fsn_mark->lock); } else { - fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode, - NULL, 0); + fsnotify_add_mark_locked(new_fsn_mark, inode, NULL, 0); spin_lock(&new_fsn_mark->lock); fsn_mark = new_fsn_mark; dn_mark = new_dn_mark; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 5a82bbb79f55..d5775f054be7 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -628,8 +628,8 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, if (!mark) return ERR_PTR(-ENOMEM); - fsnotify_init_mark(mark, fanotify_free_mark); - ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0); + fsnotify_init_mark(mark, group, fanotify_free_mark); + ret = fsnotify_add_mark_locked(mark, inode, mnt, 0); if (ret) { fsnotify_put_mark(mark); return ERR_PTR(ret); } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index a5e4411362f2..07febafd826e 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -558,7 +558,7 @@ static int inotify_new_watch(struct fsnotify_group *group, if (unlikely(!tmp_i_mark))
return -ENOMEM; - fsnotify_init_mark(&tmp_i_mark->fsn_mark, inotify_free_mark); + fsnotify_init_mark(&tmp_i_mark->fsn_mark, group, inotify_free_mark); tmp_i_mark->fsn_mark.mask = mask; tmp_i_mark->wd = -1; @@ -574,8 +574,7 @@ static int inotify_new_watch(struct fsnotify_group *group, } /* we are on the idr, now get on the inode */ - ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode, - NULL, 0); + ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, inode, NULL, 0); if (ret) { /* we failed to get on the inode, get off the idr */ inotify_remove_from_idr(group, tmp_i_mark); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 9f3364ef19d3..2f743e2035e4 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -563,10 +563,10 @@ out_err: * These marks may be used for the fsnotify backend to determine which * event types should be delivered to which group. */ -int fsnotify_add_mark_locked(struct fsnotify_mark *mark, - struct fsnotify_group *group, struct inode *inode, +int fsnotify_add_mark_locked(struct fsnotify_mark *mark, struct inode *inode, struct vfsmount *mnt, int allow_dups) { + struct fsnotify_group *group = mark->group; int ret = 0; BUG_ON(inode && mnt); @@ -582,8 +582,6 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, spin_lock(&mark->lock); mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; - fsnotify_get_group(group); - mark->group = group; list_add(&mark->g_list, &group->marks_list); atomic_inc(&group->num_marks); fsnotify_get_mark(mark); /* for g_list */ @@ -608,12 +606,14 @@ err: return ret; } -int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, - struct inode *inode, struct vfsmount *mnt, int allow_dups) +int fsnotify_add_mark(struct fsnotify_mark *mark, struct inode *inode, + struct vfsmount *mnt, int allow_dups) { int ret; + struct fsnotify_group *group = mark->group; + mutex_lock(&group->mark_mutex); - ret = fsnotify_add_mark_locked(mark, group, inode, mnt, allow_dups); + ret = fsnotify_add_mark_locked(mark, inode, mnt, allow_dups); mutex_unlock(&group->mark_mutex); return ret; } @@ -732,12 +732,15 @@ void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp) * Nothing fancy, just initialize lists and locks and counters. 
*/ void fsnotify_init_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group, void (*free_mark)(struct fsnotify_mark *mark)) { memset(mark, 0, sizeof(*mark)); spin_lock_init(&mark->lock); atomic_set(&mark->refcnt, 1); mark->free_mark = free_mark; + fsnotify_get_group(group); + mark->group = group; } /* diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 2ef0e04c5a9d..a64518e36bd5 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -340,15 +340,17 @@ extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group /* Calculate mask of events for a list of marks */ extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); -extern void fsnotify_init_mark(struct fsnotify_mark *mark, void (*free_mark)(struct fsnotify_mark *mark)); +extern void fsnotify_init_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group, + void (*free_mark)(struct fsnotify_mark *mark)); /* Find mark belonging to given group in the list of marks */ extern struct fsnotify_mark *fsnotify_find_mark( struct fsnotify_mark_connector __rcu **connp, struct fsnotify_group *group); -/* attach the mark to both the group and the inode */ -extern int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, - struct inode *inode, struct vfsmount *mnt, int allow_dups); -extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, struct fsnotify_group *group, +/* attach the mark to the inode or vfsmount */ +extern int fsnotify_add_mark(struct fsnotify_mark *mark, struct inode *inode, + struct vfsmount *mnt, int allow_dups); +extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, struct inode *inode, struct vfsmount *mnt, int allow_dups); /* given a group and a mark, flag mark to be freed when all references are dropped */ extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index e8b371ff1e91..2522ceaca758 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -103,15 +103,16 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa goto out; } - fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_free_mark); + fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_group, + audit_fsnotify_free_mark); audit_mark->mark.mask = AUDIT_FS_EVENTS; audit_mark->path = pathname; audit_update_mark(audit_mark, dentry->d_inode); audit_mark->rule = krule; - ret = fsnotify_add_mark(&audit_mark->mark, audit_fsnotify_group, inode, NULL, true); + ret = fsnotify_add_mark(&audit_mark->mark, inode, NULL, true); if (ret < 0) { - audit_fsnotify_mark_free(audit_mark); + fsnotify_put_mark(&audit_mark->mark); audit_mark = ERR_PTR(ret); } out: diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 3cc5b92de765..da7f7a3e6a42 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -154,7 +154,8 @@ static struct audit_chunk *alloc_chunk(int count) INIT_LIST_HEAD(&chunk->owners[i].list); chunk->owners[i].index = i; } - fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); + fsnotify_init_mark(&chunk->mark, audit_tree_group, + audit_tree_destroy_watch); chunk->mark.mask = FS_IN_IGNORED; return chunk; } @@ -262,7 +263,7 @@ static void untag_chunk(struct node *p) spin_unlock(&entry->lock); mutex_unlock(&entry->group->mark_mutex); if (new) - free_chunk(new); + fsnotify_put_mark(&new->mark); goto out; } @@ -286,8 +287,8 @@ static void untag_chunk(struct node *p) if (!new) goto Fallback; - if 
(fsnotify_add_mark_locked(&new->mark, entry->group, - entry->connector->inode, NULL, 1)) { + if (fsnotify_add_mark_locked(&new->mark, entry->connector->inode, + NULL, 1)) { fsnotify_put_mark(&new->mark); goto Fallback; } @@ -352,7 +353,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) return -ENOMEM; entry = &chunk->mark; - if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { + if (fsnotify_add_mark(entry, inode, NULL, 0)) { fsnotify_put_mark(entry); return -ENOSPC; } @@ -428,11 +429,11 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&old_entry->lock); mutex_unlock(&old_entry->group->mark_mutex); fsnotify_put_mark(old_entry); - free_chunk(chunk); + fsnotify_put_mark(&chunk->mark); return -ENOENT; } - if (fsnotify_add_mark_locked(chunk_entry, old_entry->group, + if (fsnotify_add_mark_locked(chunk_entry, old_entry->connector->inode, NULL, 1)) { spin_unlock(&old_entry->lock); mutex_unlock(&old_entry->group->mark_mutex); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 956fa584c239..e32efed86828 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -157,9 +157,10 @@ static struct audit_parent *audit_init_parent(struct path *path) INIT_LIST_HEAD(&parent->watches); - fsnotify_init_mark(&parent->mark, audit_watch_free_mark); + fsnotify_init_mark(&parent->mark, audit_watch_group, + audit_watch_free_mark); parent->mark.mask = AUDIT_FS_WATCH; - ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0); + ret = fsnotify_add_mark(&parent->mark, inode, NULL, 0); if (ret < 0) { audit_free_parent(parent); return ERR_PTR(ret); -- cgit v1.2.3 From 054c636e5c8054884ede889be82ce059879945e6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 21 Dec 2016 18:06:12 +0100 Subject: fsnotify: Move ->free_mark callback to fsnotify_ops Pointer to ->free_mark callback unnecessarily occupies one long in each fsnotify_mark although they are the same for all marks from one notification group. Move the callback pointer to fsnotify_ops. 
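The shape of the result, taking fanotify as the example (condensed from the hunks below): the destructor is registered once in the group's ops table rather than stored in every mark, and the final put dispatches through mark->group->ops->free_mark().

	static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
	{
		kmem_cache_free(fanotify_mark_cache, fsn_mark);
	}

	const struct fsnotify_ops fanotify_fsnotify_ops = {
		.handle_event	 = fanotify_handle_event,
		.free_group_priv = fanotify_free_group_priv,
		.free_event	 = fanotify_free_event,
		.free_mark	 = fanotify_free_mark,
	};

This is also why the previous patch made fsnotify_init_mark() take the group pointer: fsnotify_final_mark_destroy() needs mark->group to reach the right free_mark() callback.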
Reviewed-by: Miklos Szeredi Reviewed-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/dnotify/dnotify.c | 3 ++- fs/notify/fanotify/fanotify.c | 6 ++++++ fs/notify/fanotify/fanotify.h | 1 + fs/notify/fanotify/fanotify_user.c | 9 ++------- fs/notify/inotify/inotify.h | 1 + fs/notify/inotify/inotify_fsnotify.c | 11 +++++++++++ fs/notify/inotify/inotify_user.c | 14 ++------------ fs/notify/mark.c | 13 +++++++------ include/linux/fsnotify_backend.h | 6 +++--- kernel/audit_fsnotify.c | 4 ++-- kernel/audit_tree.c | 4 ++-- kernel/audit_watch.c | 4 ++-- 12 files changed, 41 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index f9d500fd7b9a..2430a0415995 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -135,6 +135,7 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) static struct fsnotify_ops dnotify_fsnotify_ops = { .handle_event = dnotify_handle_event, + .free_mark = dnotify_free_mark, }; /* @@ -305,7 +306,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) /* set up the new_fsn_mark and new_dn_mark */ new_fsn_mark = &new_dn_mark->fsn_mark; - fsnotify_init_mark(new_fsn_mark, dnotify_group, dnotify_free_mark); + fsnotify_init_mark(new_fsn_mark, dnotify_group); new_fsn_mark->mask = mask; new_dn_mark->dn = NULL; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 461c21ebebeb..2fa99aeaa095 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -262,8 +262,14 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event) kmem_cache_free(fanotify_event_cachep, event); } +static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) +{ + kmem_cache_free(fanotify_mark_cache, fsn_mark); +} + const struct fsnotify_ops fanotify_fsnotify_ops = { .handle_event = fanotify_handle_event, .free_group_priv = fanotify_free_group_priv, .free_event = fanotify_free_event, + .free_mark = fanotify_free_mark, }; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 4500a74f8d38..4eb6f5efa282 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -2,6 +2,7 @@ #include #include +extern struct kmem_cache *fanotify_mark_cache; extern struct kmem_cache *fanotify_event_cachep; extern struct kmem_cache *fanotify_perm_event_cachep; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index d5775f054be7..bf306d4f72f7 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -41,7 +41,7 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops; -static struct kmem_cache *fanotify_mark_cache __read_mostly; +struct kmem_cache *fanotify_mark_cache __read_mostly; struct kmem_cache *fanotify_event_cachep __read_mostly; struct kmem_cache *fanotify_perm_event_cachep __read_mostly; @@ -445,11 +445,6 @@ static const struct file_operations fanotify_fops = { .llseek = noop_llseek, }; -static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) -{ - kmem_cache_free(fanotify_mark_cache, fsn_mark); -} - static int fanotify_find_path(int dfd, const char __user *filename, struct path *path, unsigned int flags) { @@ -628,7 +623,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, if (!mark) return ERR_PTR(-ENOMEM); - fsnotify_init_mark(mark, group, fanotify_free_mark); + fsnotify_init_mark(mark, group); ret = fsnotify_add_mark_locked(mark, inode, mnt, 0); if (ret) { 
fsnotify_put_mark(mark); diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index 7a966f456269..9ff67b61da8a 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -31,6 +31,7 @@ extern int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_iter_info *iter_info); extern const struct fsnotify_ops inotify_fsnotify_ops; +extern struct kmem_cache *inotify_inode_mark_cachep; #ifdef CONFIG_INOTIFY_USER static inline void dec_inotify_instances(struct ucounts *ucounts) diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index ccd6a4055e0c..8b73332735ba 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -176,9 +176,20 @@ static void inotify_free_event(struct fsnotify_event *fsn_event) kfree(INOTIFY_E(fsn_event)); } +/* ding dong the mark is dead */ +static void inotify_free_mark(struct fsnotify_mark *fsn_mark) +{ + struct inotify_inode_mark *i_mark; + + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); + + kmem_cache_free(inotify_inode_mark_cachep, i_mark); +} + const struct fsnotify_ops inotify_fsnotify_ops = { .handle_event = inotify_handle_event, .free_group_priv = inotify_free_group_priv, .free_event = inotify_free_event, .freeing_mark = inotify_freeing_mark, + .free_mark = inotify_free_mark, }; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 07febafd826e..7cc7d3fb1862 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -47,7 +47,7 @@ /* configurable via /proc/sys/fs/inotify/ */ static int inotify_max_queued_events __read_mostly; -static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; +struct kmem_cache *inotify_inode_mark_cachep __read_mostly; #ifdef CONFIG_SYSCTL @@ -483,16 +483,6 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, dec_inotify_watches(group->inotify_data.ucounts); } -/* ding dong the mark is dead */ -static void inotify_free_mark(struct fsnotify_mark *fsn_mark) -{ - struct inotify_inode_mark *i_mark; - - i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); - - kmem_cache_free(inotify_inode_mark_cachep, i_mark); -} - static int inotify_update_existing_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) @@ -558,7 +548,7 @@ static int inotify_new_watch(struct fsnotify_group *group, if (unlikely(!tmp_i_mark)) return -ENOMEM; - fsnotify_init_mark(&tmp_i_mark->fsn_mark, group, inotify_free_mark); + fsnotify_init_mark(&tmp_i_mark->fsn_mark, group); tmp_i_mark->fsn_mark.mask = mask; tmp_i_mark->wd = -1; diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 2f743e2035e4..55955ded338d 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -195,9 +195,12 @@ static struct inode *fsnotify_detach_connector_from_object( static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) { - if (mark->group) - fsnotify_put_group(mark->group); - mark->free_mark(mark); + struct fsnotify_group *group = mark->group; + + if (WARN_ON_ONCE(!group)) + return; + group->ops->free_mark(mark); + fsnotify_put_group(group); } void fsnotify_put_mark(struct fsnotify_mark *mark) @@ -732,13 +735,11 @@ void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp) * Nothing fancy, just initialize lists and locks and counters. 
*/ void fsnotify_init_mark(struct fsnotify_mark *mark, - struct fsnotify_group *group, - void (*free_mark)(struct fsnotify_mark *mark)) + struct fsnotify_group *group) { memset(mark, 0, sizeof(*mark)); spin_lock_init(&mark->lock); atomic_set(&mark->refcnt, 1); - mark->free_mark = free_mark; fsnotify_get_group(group); mark->group = group; } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index a64518e36bd5..c6c69318752b 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -104,6 +104,8 @@ struct fsnotify_ops { void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); void (*free_event)(struct fsnotify_event *event); + /* called on final put+free to free memory */ + void (*free_mark)(struct fsnotify_mark *mark); }; /* @@ -261,7 +263,6 @@ struct fsnotify_mark { #define FSNOTIFY_MARK_FLAG_ALIVE 0x02 #define FSNOTIFY_MARK_FLAG_ATTACHED 0x04 unsigned int flags; /* flags [mark->lock] */ - void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */ }; #ifdef CONFIG_FSNOTIFY @@ -341,8 +342,7 @@ extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group /* Calculate mask of events for a list of marks */ extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); extern void fsnotify_init_mark(struct fsnotify_mark *mark, - struct fsnotify_group *group, - void (*free_mark)(struct fsnotify_mark *mark)); + struct fsnotify_group *group); /* Find mark belonging to given group in the list of marks */ extern struct fsnotify_mark *fsnotify_find_mark( struct fsnotify_mark_connector __rcu **connp, diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 2522ceaca758..4aad0a467fed 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -103,8 +103,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa goto out; } - fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_group, - audit_fsnotify_free_mark); + fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_group); audit_mark->mark.mask = AUDIT_FS_EVENTS; audit_mark->path = pathname; audit_update_mark(audit_mark, dentry->d_inode); @@ -203,6 +202,7 @@ static int audit_mark_handle_event(struct fsnotify_group *group, static const struct fsnotify_ops audit_mark_fsnotify_ops = { .handle_event = audit_mark_handle_event, + .free_mark = audit_fsnotify_free_mark, }; static int __init audit_fsnotify_init(void) diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index da7f7a3e6a42..a14cff67a148 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -154,8 +154,7 @@ static struct audit_chunk *alloc_chunk(int count) INIT_LIST_HEAD(&chunk->owners[i].list); chunk->owners[i].index = i; } - fsnotify_init_mark(&chunk->mark, audit_tree_group, - audit_tree_destroy_watch); + fsnotify_init_mark(&chunk->mark, audit_tree_group); chunk->mark.mask = FS_IN_IGNORED; return chunk; } @@ -1013,6 +1012,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify static const struct fsnotify_ops audit_tree_ops = { .handle_event = audit_tree_handle_event, .freeing_mark = audit_tree_freeing_mark, + .free_mark = audit_tree_destroy_watch, }; static int __init audit_tree_init(void) diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index e32efed86828..13d30a8dfc56 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -157,8 +157,7 @@ static struct audit_parent *audit_init_parent(struct path *path) 
INIT_LIST_HEAD(&parent->watches); - fsnotify_init_mark(&parent->mark, audit_watch_group, - audit_watch_free_mark); + fsnotify_init_mark(&parent->mark, audit_watch_group); parent->mark.mask = AUDIT_FS_WATCH; ret = fsnotify_add_mark(&parent->mark, inode, NULL, 0); if (ret < 0) { @@ -508,6 +507,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, static const struct fsnotify_ops audit_watch_fsnotify_ops = { .handle_event = audit_watch_handle_event, + .free_mark = audit_watch_free_mark, }; static int __init audit_watch_init(void) -- cgit v1.2.3 From 0c08aaf873174c95e674cf21ffcd041c589d2e5b Mon Sep 17 00:00:00 2001 From: Vincent Stehlé Date: Sun, 9 Apr 2017 22:05:05 +0200 Subject: regulator: isl9305: fix array size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ISL9305_MAX_REGULATOR is the last index used to access the init_data[] array, so we need to add one to this last index to obtain the necessary array size. This fixes the following smatch error: drivers/regulator/isl9305.c:160 isl9305_i2c_probe() error: buffer overflow 'pdata->init_data' 3 <= 3 Fixes: dec38b5ce6a9edb4 ("regulator: isl9305: Add Intersil ISL9305/H driver") Signed-off-by: Vincent Stehlé Cc: Mark Brown Signed-off-by: Mark Brown --- include/linux/platform_data/isl9305.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/isl9305.h b/include/linux/platform_data/isl9305.h index 1419133fa69e..4ac1a070af0a 100644 --- a/include/linux/platform_data/isl9305.h +++ b/include/linux/platform_data/isl9305.h @@ -24,7 +24,7 @@ struct regulator_init_data; struct isl9305_pdata { - struct regulator_init_data *init_data[ISL9305_MAX_REGULATOR]; + struct regulator_init_data *init_data[ISL9305_MAX_REGULATOR + 1]; }; #endif -- cgit v1.2.3 From 5367278cb7ba74537bcad1470d75f30d95b09c14 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 6 Apr 2017 12:26:20 -0400 Subject: tracing: Add stack_tracer_disable/enable() functions There are certain parts of the kernel that cannot let stack tracing proceed (namely in RCU), because the stack tracer uses RCU, and parts of RCU internals cannot handle having RCU read side locks taken. Add stack_tracer_disable() and stack_tracer_enable() functions to let RCU stop stack tracing on the current CPU when it is in those critical sections. 
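Expected usage, in sketch form (illustrative only; per the kernel-doc added below, the caller must already have preemption or interrupts disabled, and the enable must follow shortly after on the same CPU):

	preempt_disable_notrace();
	stack_tracer_disable();		/* bumps the per-CPU disable counter */
	/* ... section that must not be stack-traced, e.g. RCU eqs entry ... */
	stack_tracer_enable();		/* must pair with the disable above */
	preempt_enable_notrace();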
Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 ++++++ kernel/trace/trace_stack.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ef7123219f14..7b4e6572ab21 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -286,6 +286,12 @@ int stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); + +void stack_tracer_disable(void); +void stack_tracer_enable(void); +#else +static inline void stack_tracer_disable(void) { } +static inline void stack_tracer_enable(void) { } #endif struct ftrace_func_command { diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 338d076a06da..21e536cf66e4 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -41,6 +41,38 @@ static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; static int last_stack_tracer_enabled; +/** + * stack_tracer_disable - temporarily disable the stack tracer + * + * There's a few locations (namely in RCU) where stack tracing + * cannot be executed. This function is used to disable stack + * tracing during those critical sections. + * + * This function must be called with preemption or interrupts + * disabled and stack_tracer_enable() must be called shortly after + * while preemption or interrupts are still disabled. + */ +void stack_tracer_disable(void) +{ + /* Preemption or interupts must be disabled */ + if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) + WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); + this_cpu_inc(trace_active); +} + +/** + * stack_tracer_enable - re-enable the stack tracer + * + * After stack_tracer_disable() is called, stack_tracer_enable() + * must be called shortly afterward. + */ +void stack_tracer_enable(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) + WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); + this_cpu_dec(trace_active); +} + void stack_trace_print(void) { long i; -- cgit v1.2.3 From 8aaf1ee70e19ac74cbbb81098edfa328d1ab4bd7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 6 Apr 2017 15:47:32 -0400 Subject: tracing: Rename trace_active to disable_stack_tracer and inline its modification In order to eliminate a function call, make "trace_active" into "disable_stack_tracer" and convert stack_tracer_disable() and friends into static inline functions. Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 36 +++++++++++++++++++++++++++++++-- kernel/trace/trace_stack.c | 50 +++++++++------------------------------------- 2 files changed, 43 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7b4e6572ab21..06b2990a35e4 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -287,8 +287,40 @@ stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -void stack_tracer_disable(void); -void stack_tracer_enable(void); +/* DO NOT MODIFY THIS VARIABLE DIRECTLY! */ +DECLARE_PER_CPU(int, disable_stack_tracer); + +/** + * stack_tracer_disable - temporarily disable the stack tracer + * + * There's a few locations (namely in RCU) where stack tracing + * cannot be executed. This function is used to disable stack + * tracing during those critical sections. 
+ * + * This function must be called with preemption or interrupts + * disabled and stack_tracer_enable() must be called shortly after + * while preemption or interrupts are still disabled. + */ +static inline void stack_tracer_disable(void) +{ + /* Preemption or interupts must be disabled */ + if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) + WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); + this_cpu_inc(disable_stack_tracer); +} + +/** + * stack_tracer_enable - re-enable the stack tracer + * + * After stack_tracer_disable() is called, stack_tracer_enable() + * must be called shortly afterward. + */ +static inline void stack_tracer_enable(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) + WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); + this_cpu_dec(disable_stack_tracer); +} #else static inline void stack_tracer_disable(void) { } static inline void stack_tracer_enable(void) { } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 21e536cf66e4..f2f02ff350d4 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -35,44 +35,12 @@ unsigned long stack_trace_max_size; arch_spinlock_t stack_trace_max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -static DEFINE_PER_CPU(int, trace_active); +DEFINE_PER_CPU(int, disable_stack_tracer); static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; static int last_stack_tracer_enabled; -/** - * stack_tracer_disable - temporarily disable the stack tracer - * - * There's a few locations (namely in RCU) where stack tracing - * cannot be executed. This function is used to disable stack - * tracing during those critical sections. - * - * This function must be called with preemption or interrupts - * disabled and stack_tracer_enable() must be called shortly after - * while preemption or interrupts are still disabled. - */ -void stack_tracer_disable(void) -{ - /* Preemption or interupts must be disabled */ - if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) - WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); - this_cpu_inc(trace_active); -} - -/** - * stack_tracer_enable - re-enable the stack tracer - * - * After stack_tracer_disable() is called, stack_tracer_enable() - * must be called shortly afterward. - */ -void stack_tracer_enable(void) -{ - if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) - WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); - this_cpu_dec(trace_active); -} - void stack_trace_print(void) { long i; @@ -243,8 +211,8 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); /* no atomic needed, we only modify this variable by this cpu */ - __this_cpu_inc(trace_active); - if (__this_cpu_read(trace_active) != 1) + __this_cpu_inc(disable_stack_tracer); + if (__this_cpu_read(disable_stack_tracer) != 1) goto out; ip += MCOUNT_INSN_SIZE; @@ -252,7 +220,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, check_stack(ip, &stack); out: - __this_cpu_dec(trace_active); + __this_cpu_dec(disable_stack_tracer); /* prevent recursion in schedule */ preempt_enable_notrace(); } @@ -294,15 +262,15 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, /* * In case we trace inside arch_spin_lock() or after (NMI), * we will cause circular lock, so we also need to increase - * the percpu trace_active here. + * the percpu disable_stack_tracer here. 
 */ - __this_cpu_inc(trace_active); + __this_cpu_inc(disable_stack_tracer); arch_spin_lock(&stack_trace_max_lock); *ptr = val; arch_spin_unlock(&stack_trace_max_lock); - __this_cpu_dec(trace_active); + __this_cpu_dec(disable_stack_tracer); local_irq_restore(flags); return count; @@ -338,7 +306,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) { local_irq_disable(); - __this_cpu_inc(trace_active); + __this_cpu_inc(disable_stack_tracer); arch_spin_lock(&stack_trace_max_lock); @@ -352,7 +320,7 @@ static void t_stop(struct seq_file *m, void *p) { arch_spin_unlock(&stack_trace_max_lock); - __this_cpu_dec(trace_active); + __this_cpu_dec(disable_stack_tracer); local_irq_enable(); } -- cgit v1.2.3 From 03ecd3f48e57f2e6154584e0ee7450d7a05e2d3b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 7 Apr 2017 12:20:36 -0400 Subject: rcu/tracing: Add rcu_disabled to denote when rcu_irq_enter() will not work Tracing uses rcu_irq_enter() as a way to make sure that RCU is watching when it needs to use rcu_read_lock() and friends. This is because tracing can happen as RCU is about to enter user space, or about to go idle, and RCU does not watch for RCU read side critical sections as it makes the transition. There is a small location within the RCU infrastructure where rcu_irq_enter() itself will not work. If tracing were to occur in that section, it would break if it tried to use rcu_irq_enter(). Originally, this happened with the stack_tracer, because it calls save_stack_trace() when it encounters stack usage greater than any it had encountered previously. There was a case where that happened in the RCU section where rcu_irq_enter() did not work, and lockdep complained loudly about it. To fix it, stack tracing gained a way to be disabled, and RCU would disable stack tracing during the critical section where rcu_irq_enter() was inoperable. This solution worked, but there are other cases that use rcu_irq_enter() and it would be a good idea for RCU to provide a way to let others know that rcu_irq_enter() will not work. For example, in trace events. Another helpful aspect of this change is that it also moves the per-CPU variable checked in the RCU critical section into cache proximity with other RCU per-CPU variables used in that same location. I'm keeping the stack_tracer_disable() code, as that still could be used in the future by places that really need to disable it. And since it's only a static inline, it won't take up any kernel text if it is not used. Link: http://lkml.kernel.org/r/20170405093207.404f8deb@gandalf.local.home Acked-by: Paul E.
McKenney Signed-off-by: Steven Rostedt (VMware) --- include/linux/rcupdate.h | 5 +++++ kernel/rcu/tree.c | 18 ++++++++++++++++-- kernel/trace/trace_stack.c | 8 ++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index de88b33c0974..dea8f17b2fe3 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -97,6 +97,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long secs, unsigned long c_old, unsigned long c); +bool rcu_irq_enter_disabled(void); #else static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, @@ -113,6 +114,10 @@ static inline void rcutorture_record_test_transition(void) static inline void rcutorture_record_progress(unsigned long vernum) { } +static inline bool rcu_irq_enter_disabled(void) +{ + return false; +} #ifdef CONFIG_RCU_TRACE void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8b4d273331e4..a6dcf3bd244f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -284,6 +284,20 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; +/* + * There's a few places, currently just in the tracing infrastructure, + * that uses rcu_irq_enter() to make sure RCU is watching. But there's + * a small location where that will not even work. In those cases + * rcu_irq_enter_disabled() needs to be checked to make sure rcu_irq_enter() + * can be called. + */ +static DEFINE_PER_CPU(bool, disable_rcu_irq_enter); + +bool rcu_irq_enter_disabled(void) +{ + return this_cpu_read(disable_rcu_irq_enter); +} + /* * Record entry into an extended quiescent state. This is only to be * called when not already in an extended quiescent state. @@ -800,10 +814,10 @@ static void rcu_eqs_enter_common(bool user) do_nocb_deferred_wakeup(rdp); } rcu_prepare_for_idle(); - stack_tracer_disable(); + __this_cpu_inc(disable_rcu_irq_enter); rdtp->dynticks_nesting = 0; /* Breaks tracing momentarily. */ rcu_dynticks_eqs_enter(); /* After this, tracing works again. */ - stack_tracer_enable(); + __this_cpu_dec(disable_rcu_irq_enter); rcu_dynticks_task_enter(); /* diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index f2f02ff350d4..76aa04d4c925 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -96,6 +96,14 @@ check_stack(unsigned long ip, unsigned long *stack) if (in_nmi()) return; + /* + * There's a slight chance that we are tracing inside the + * RCU infrastructure, and rcu_irq_enter() will not work + * as expected. + */ + if (unlikely(rcu_irq_enter_disabled())) + return; + local_irq_save(flags); arch_spin_lock(&stack_trace_max_lock); -- cgit v1.2.3 From d54b6eeb553c89ed8d4c5a2ed73df374a45b9562 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 7 Apr 2017 12:40:49 -0400 Subject: tracing: Make sure rcu_irq_enter() can work for trace_*_rcuidle() trace events Stack tracing discovered that there's a small location inside the RCU infrastructure where calling rcu_irq_enter() does not work. As trace events use rcu_irq_enter() it must make sure that it is functionable. A check against rcu_irq_enter_disabled() is added with a WARN_ON_ONCE() as no trace event should ever be used in that part of RCU. If the warning is triggered, then the trace event is ignored. 
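A minimal sketch of the resulting guard pattern (editorial; my_trace_hook() is hypothetical, but the helpers are the ones added above, and the check mirrors the one added to __DO_TRACE() below):

	static void my_trace_hook(void)
	{
		if (WARN_ON_ONCE(rcu_irq_enter_disabled()))
			return;	/* rcu_irq_enter() would misbehave; drop the event */

		rcu_irq_enter_irqson();
		/* tracing work that needs RCU to be watching */
		rcu_irq_exit_irqson();
	}
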
Restructure the __DO_TRACE() a bit to get rid of the prercu and postrcu, and just have an rcucheck that does the work from within the __DO_TRACE() macro. gcc optimization will compile out the rcucheck=0 case. Link: http://lkml.kernel.org/r/20170405093207.404f8deb@gandalf.local.home Acked-by: Mathieu Desnoyers Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) --- include/linux/tracepoint.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index f72fcfe0e66a..cc48cb2ce209 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -128,7 +128,7 @@ extern void syscall_unregfunc(void); * as "(void *, void)". The DECLARE_TRACE_NOARGS() will pass in just * "void *data", where as the DECLARE_TRACE() will pass in "void *data, proto". */ -#define __DO_TRACE(tp, proto, args, cond, prercu, postrcu) \ +#define __DO_TRACE(tp, proto, args, cond, rcucheck) \ do { \ struct tracepoint_func *it_func_ptr; \ void *it_func; \ \ if (!(cond)) \ return; \ - prercu; \ + if (rcucheck) { \ + if (WARN_ON_ONCE(rcu_irq_enter_disabled())) \ + return; \ + rcu_irq_enter_irqson(); \ + } \ rcu_read_lock_sched_notrace(); \ it_func_ptr = rcu_dereference_sched((tp)->funcs); \ if (it_func_ptr) { \ @@ -147,20 +151,19 @@ extern void syscall_unregfunc(void); } while ((++it_func_ptr)->func); \ } \ rcu_read_unlock_sched_notrace(); \ - postrcu; \ + if (rcucheck) \ + rcu_irq_exit_irqson(); \ } while (0) #ifndef MODULE -#define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) \ +#define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) \ static inline void trace_##name##_rcuidle(proto) \ { \ if (static_key_false(&__tracepoint_##name.key)) \ __DO_TRACE(&__tracepoint_##name, \ TP_PROTO(data_proto), \ TP_ARGS(data_args), \ - TP_CONDITION(cond), \ - rcu_irq_enter_irqson(), \ - rcu_irq_exit_irqson()); \ + TP_CONDITION(cond), 1); \ } #else #define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) @@ -186,7 +189,7 @@ extern void syscall_unregfunc(void); __DO_TRACE(&__tracepoint_##name, \ TP_PROTO(data_proto), \ TP_ARGS(data_args), \ - TP_CONDITION(cond),,); \ + TP_CONDITION(cond), 0); \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ rcu_read_lock_sched_notrace(); \ rcu_dereference_sched(__tracepoint_##name.funcs);\ -- cgit v1.2.3 From 30e03acda5fd9c77ec9bf8b3c5def9540c6b0486 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Sun, 9 Apr 2017 07:36:14 +0600 Subject: cpuset: Remove cpuset_update_active_cpus()'s parameter. In cpuset_update_active_cpus(), cpu_online isn't used anymore. Remove it.
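For illustration (editorial sketch, not from the patch): hotplug-path callers now look like this, with no online/offline state threaded through.

	static void my_cpu_hotplug_update(void)	/* illustrative caller */
	{
		cpuset_update_active_cpus();	/* was: cpuset_update_active_cpus(true/false) */
	}
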
Signed-off-by: Rakib Mullick Acked-by: Zefan Li Signed-off-by: Tejun Heo --- include/linux/cpuset.h | 4 ++-- kernel/cgroup/cpuset.c | 2 +- kernel/sched/core.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 611fce58d670..119a3f9604b0 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -42,7 +42,7 @@ static inline void cpuset_dec(void) extern int cpuset_init(void); extern void cpuset_init_smp(void); -extern void cpuset_update_active_cpus(bool cpu_online); +extern void cpuset_update_active_cpus(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -155,7 +155,7 @@ static inline bool cpusets_enabled(void) { return false; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} -static inline void cpuset_update_active_cpus(bool cpu_online) +static inline void cpuset_update_active_cpus(void) { partition_sched_domains(1, NULL, NULL); } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 8b84db2d2d17..f6501f4f6040 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2351,7 +2351,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) rebuild_sched_domains(); } -void cpuset_update_active_cpus(bool cpu_online) +void cpuset_update_active_cpus(void) { /* * We're inside cpu hotplug critical region which usually nests diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 956383844116..e62802c044e1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5727,7 +5727,7 @@ static void cpuset_cpu_active(void) * cpuset configurations. */ } - cpuset_update_active_cpus(true); + cpuset_update_active_cpus(); } static int cpuset_cpu_inactive(unsigned int cpu) @@ -5750,7 +5750,7 @@ static int cpuset_cpu_inactive(unsigned int cpu) if (overflow) return -EBUSY; - cpuset_update_active_cpus(false); + cpuset_update_active_cpus(); } else { num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); -- cgit v1.2.3 From b8b1a2e5eca6bbf20e3a29c5f9db8331ec52af2d Mon Sep 17 00:00:00 2001 From: Todd Poynor Date: Thu, 6 Apr 2017 18:47:57 -0700 Subject: cgroup: move cgroup_subsys_state parent field for cache locality Various structures embed a struct cgroup_subsys_state, typically at the top of the containing structure. It is common for code that accesses the structures to perform operations that iterate over the chain of parent css pointers, also accessing data in each containing structure. In particular, struct cpuacct is used by fairly hot code paths in the scheduler such as cpuacct_charge(). Move the parent css pointer field to the end of the structure to increase the chances of residing in the same cache line as the data from the containing structure. 
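A sketch of the hot pattern the move targets (editorial; struct my_ctrl and its usage field are illustrative, not from the patch; kernel headers assumed):

	struct my_ctrl {
		struct cgroup_subsys_state css;	/* embedded at the top */
		u64 usage;			/* hot per-controller data */
	};

	static void my_ctrl_charge(struct my_ctrl *c, u64 delta)
	{
		struct cgroup_subsys_state *css;

		for (css = &c->css; css; css = css->parent) {
			struct my_ctrl *cur = container_of(css, struct my_ctrl, css);

			/* ->parent now tends to share a cache line with ->usage */
			cur->usage += delta;
		}
	}
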
Signed-off-by: Todd Poynor Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index c74b78ecd583..21745946cae1 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -107,9 +107,6 @@ struct cgroup_subsys_state { /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; - /* PI: the parent css */ - struct cgroup_subsys_state *parent; - /* siblings list anchored at the parent's ->children */ struct list_head sibling; struct list_head children; @@ -139,6 +136,12 @@ struct cgroup_subsys_state { /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; struct work_struct destroy_work; + + /* + * PI: the parent css. Placed here for cache proximity to following + * fields of the containing structure. + */ + struct cgroup_subsys_state *parent; }; /* -- cgit v1.2.3 From 717a94b5fc7092afebe9c93791f29b2d8e5d297a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 7 Apr 2017 10:03:26 +1000 Subject: sched/core: Remove 'task' parameter and rename tsk_restore_flags() to current_restore_flags() It is not safe for one thread to modify the ->flags of another thread as there is no locking that can protect the update. So tsk_restore_flags(), which takes a task pointer and modifies the flags, is an invitation to do the wrong thing. All current users pass "current" as the task, so no developers have accepted that invitation. It would be best to ensure it remains that way. So rename tsk_restore_flags() to current_restore_flags() and don't pass in a task_struct pointer. Always operate on current->flags. Signed-off-by: NeilBrown Cc: Linus Torvalds Cc: Mel Gorman Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- drivers/block/nbd.c | 2 +- drivers/scsi/iscsi_tcp.c | 2 +- fs/nfsd/vfs.c | 2 +- include/linux/sched.h | 6 +++--- kernel/softirq.c | 2 +- net/core/dev.c | 2 +- net/core/sock.c | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index d8a23561b4cb..3c9052bf2327 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -244,7 +244,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send, *sent += result; } while (msg_data_left(&msg)); - tsk_restore_flags(current, pflags, PF_MEMALLOC); + current_restore_flags(pflags, PF_MEMALLOC); return result; } diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index 4228aba1f654..bbea8eac9abb 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -387,7 +387,7 @@ static int iscsi_sw_tcp_pdu_xmit(struct iscsi_task *task) rc = 0; } - tsk_restore_flags(current, pflags, PF_MEMALLOC); + current_restore_flags(pflags, PF_MEMALLOC); return rc; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 19d50f600e8d..9aaf6ca77569 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1004,7 +1004,7 @@ out_nfserr: else err = nfserrno(host_err); if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) - tsk_restore_flags(current, pflags, PF_LESS_THROTTLE); + current_restore_flags(pflags, PF_LESS_THROTTLE); return err; } diff --git a/include/linux/sched.h b/include/linux/sched.h index d67eee84fd43..0978fb74e45a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1286,10 +1286,10 @@ TASK_PFA_TEST(LMK_WAITING, lmk_waiting) TASK_PFA_SET(LMK_WAITING, lmk_waiting) static 
inline void -tsk_restore_flags(struct task_struct *task, unsigned long orig_flags, unsigned long flags) +current_restore_flags(unsigned long orig_flags, unsigned long flags) { - task->flags &= ~flags; - task->flags |= orig_flags & flags; + current->flags &= ~flags; + current->flags |= orig_flags & flags; } extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); diff --git a/kernel/softirq.c b/kernel/softirq.c index 744fa611cae0..4e09821f9d9e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -309,7 +309,7 @@ restart: account_irq_exit_time(current); __local_bh_enable(SOFTIRQ_OFFSET); WARN_ON_ONCE(in_interrupt()); - tsk_restore_flags(current, old_flags, PF_MEMALLOC); + current_restore_flags(old_flags, PF_MEMALLOC); } asmlinkage __visible void do_softirq(void) diff --git a/net/core/dev.c b/net/core/dev.c index 7869ae3837ca..e8a366387a99 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4240,7 +4240,7 @@ static int __netif_receive_skb(struct sk_buff *skb) */ current->flags |= PF_MEMALLOC; ret = __netif_receive_skb_core(skb, true); - tsk_restore_flags(current, pflags, PF_MEMALLOC); + current_restore_flags(pflags, PF_MEMALLOC); } else ret = __netif_receive_skb_core(skb, false); diff --git a/net/core/sock.c b/net/core/sock.c index 2c4f574168fb..b416a537bd0a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -325,7 +325,7 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) current->flags |= PF_MEMALLOC; ret = sk->sk_backlog_rcv(sk, skb); - tsk_restore_flags(current, pflags, PF_MEMALLOC); + current_restore_flags(pflags, PF_MEMALLOC); return ret; } -- cgit v1.2.3 From e92634cd4d37d5a2ea2fb0d55d25d50bbba1e8e0 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Thu, 30 Mar 2017 09:46:41 +0300 Subject: usb: otg-fsm: Prevent build warning "VDBG" redefined If usb/otg-fsm.h and usb/composite.h are included together then it results in the build warning [1]. Prevent that by defining VDBG locally. Also get rid of MPC_LOC which doesn't seem to be used by anyone. [1] - warning fixed by this patch: In file included from drivers/usb/dwc3/core.h:33, from drivers/usb/dwc3/ep0.c:33: include/linux/usb/otg-fsm.h:30:1: warning: "VDBG" redefined In file included from drivers/usb/dwc3/ep0.c:31: include/linux/usb/composite.h:615:1: warning: this is the location of the previous definition Signed-off-by: Roger Quadros Reviewed-by: Jun Li Acked-by: Peter Chen Signed-off-by: Felipe Balbi --- drivers/usb/common/usb-otg-fsm.c | 7 +++++++ drivers/usb/phy/phy-fsl-usb.c | 7 +++++++ include/linux/usb/otg-fsm.h | 15 --------------- 3 files changed, 14 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/common/usb-otg-fsm.c b/drivers/usb/common/usb-otg-fsm.c index 2f537bbdda09..b8fe31e409a5 100644 --- a/drivers/usb/common/usb-otg-fsm.c +++ b/drivers/usb/common/usb-otg-fsm.c @@ -31,6 +31,13 @@ #include #include +#ifdef VERBOSE +#define VDBG(fmt, args...) pr_debug("[%s] " fmt, \ + __func__, ## args) +#else +#define VDBG(stuff...) do {} while (0) +#endif + /* Change USB protocol when there is a protocol change */ static int otg_set_protocol(struct otg_fsm *fsm, int protocol) { diff --git a/drivers/usb/phy/phy-fsl-usb.c b/drivers/usb/phy/phy-fsl-usb.c index 392ab422163c..cf8f40ae6e01 100644 --- a/drivers/usb/phy/phy-fsl-usb.c +++ b/drivers/usb/phy/phy-fsl-usb.c @@ -44,6 +44,13 @@ #include "phy-fsl-usb.h" +#ifdef VERBOSE +#define VDBG(fmt, args...) pr_debug("[%s] " fmt, \ + __func__, ## args) +#else +#define VDBG(stuff...) 
do {} while (0) +#endif + #define DRIVER_VERSION "Rev. 1.55" #define DRIVER_AUTHOR "Jerry Huang/Li Yang" #define DRIVER_DESC "Freescale USB OTG Transceiver Driver" diff --git a/include/linux/usb/otg-fsm.h b/include/linux/usb/otg-fsm.h index 7a0350535cb1..a0a8f878503c 100644 --- a/include/linux/usb/otg-fsm.h +++ b/include/linux/usb/otg-fsm.h @@ -21,21 +21,6 @@ #include #include -#undef VERBOSE - -#ifdef VERBOSE -#define VDBG(fmt, args...) pr_debug("[%s] " fmt , \ - __func__, ## args) -#else -#define VDBG(stuff...) do {} while (0) -#endif - -#ifdef VERBOSE -#define MPC_LOC printk("Current Location [%s]:[%d]\n", __FILE__, __LINE__) -#else -#define MPC_LOC do {} while (0) -#endif - #define PROTO_UNDEF (0) #define PROTO_HOST (1) #define PROTO_GADGET (2) -- cgit v1.2.3 From 8c58f1a7a4b6d1d723bf25fef9d842d5a11200d0 Mon Sep 17 00:00:00 2001 From: Jacopo Mondi Date: Wed, 5 Apr 2017 16:07:19 +0200 Subject: pinctrl: generic: Add bi-directional and output-enable Add bi-directional and output-enable pin configuration properties. bi-directional allows specifying that a pin shall operate in input and output mode at the same time. This is particularly useful on platforms where input and output buffers have to be manually enabled. output-enable is just syntactic sugar to specify that a pin shall operate in output mode, ignoring the provided argument. This pairs with the input-enable pin configuration option. Signed-off-by: Jacopo Mondi Acked-by: Rob Herring Signed-off-by: Linus Walleij --- Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt | 2 ++ drivers/pinctrl/pinconf-generic.c | 3 +++ include/linux/pinctrl/pinconf-generic.h | 3 +++ 3 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt b/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt index f01d154090da..71a3c134af1b 100644 --- a/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt +++ b/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt @@ -247,6 +247,7 @@ bias-bus-hold - latch weakly bias-pull-up - pull up the pin bias-pull-down - pull down the pin bias-pull-pin-default - use pin-default pull state +bi-directional - pin supports simultaneous input/output operations drive-push-pull - drive actively high and low drive-open-drain - drive with open drain drive-open-source - drive with open source @@ -259,6 +260,7 @@ input-debounce - debounce mode with debound time X power-source - select between different power supplies low-power-enable - enable low power mode low-power-disable - disable low power mode +output-enable - enable output on pin regardless of output value output-low - set the pin to output mode with low level output-high - set the pin to output mode with high level slew-rate - set the slew rate diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index 720a19fd38d2..0d6b7f4b82af 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -35,6 +35,7 @@ static const struct pin_config_item conf_items[] = { PCONFDUMP(PIN_CONFIG_BIAS_PULL_PIN_DEFAULT, "input bias pull to pin specific state", NULL, false), PCONFDUMP(PIN_CONFIG_BIAS_PULL_UP, "input bias pull up", NULL, false), + PCONFDUMP(PIN_CONFIG_BIDIRECTIONAL, "bi-directional pin operations", NULL, false), PCONFDUMP(PIN_CONFIG_DRIVE_OPEN_DRAIN, "output drive open drain", NULL, false), PCONFDUMP(PIN_CONFIG_DRIVE_OPEN_SOURCE, "output drive open source", NULL, false), PCONFDUMP(PIN_CONFIG_DRIVE_PUSH_PULL, "output
drive push pull", NULL, false), @@ -160,6 +161,7 @@ static const struct pinconf_generic_params dt_params[] = { { "bias-pull-up", PIN_CONFIG_BIAS_PULL_UP, 1 }, { "bias-pull-pin-default", PIN_CONFIG_BIAS_PULL_PIN_DEFAULT, 1 }, { "bias-pull-down", PIN_CONFIG_BIAS_PULL_DOWN, 1 }, + { "bi-directional", PIN_CONFIG_BIDIRECTIONAL, 1 }, { "drive-open-drain", PIN_CONFIG_DRIVE_OPEN_DRAIN, 0 }, { "drive-open-source", PIN_CONFIG_DRIVE_OPEN_SOURCE, 0 }, { "drive-push-pull", PIN_CONFIG_DRIVE_PUSH_PULL, 0 }, @@ -172,6 +174,7 @@ static const struct pinconf_generic_params dt_params[] = { { "input-schmitt-enable", PIN_CONFIG_INPUT_SCHMITT_ENABLE, 1 }, { "low-power-disable", PIN_CONFIG_LOW_POWER_MODE, 0 }, { "low-power-enable", PIN_CONFIG_LOW_POWER_MODE, 1 }, + { "output-enable", PIN_CONFIG_OUTPUT, 1, }, { "output-high", PIN_CONFIG_OUTPUT, 1, }, { "output-low", PIN_CONFIG_OUTPUT, 0, }, { "power-source", PIN_CONFIG_POWER_SOURCE, 0 }, diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 7620eb127cff..279e3c5326e3 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -42,6 +42,8 @@ * @PIN_CONFIG_BIAS_PULL_UP: the pin will be pulled up (usually with high * impedance to VDD). If the argument is != 0 pull-up is enabled, * if it is 0, pull-up is total, i.e. the pin is connected to VDD. + * @PIN_CONFIG_BIDIRECTIONAL: the pin will be configured to allow simultaneous + * input and output operations. * @PIN_CONFIG_DRIVE_OPEN_DRAIN: the pin will be driven with open drain (open * collector) which means it is usually wired with other output ports * which are then pulled up with an external resistor. Setting this @@ -96,6 +98,7 @@ enum pin_config_param { PIN_CONFIG_BIAS_PULL_DOWN, PIN_CONFIG_BIAS_PULL_PIN_DEFAULT, PIN_CONFIG_BIAS_PULL_UP, + PIN_CONFIG_BIDIRECTIONAL, PIN_CONFIG_DRIVE_OPEN_DRAIN, PIN_CONFIG_DRIVE_OPEN_SOURCE, PIN_CONFIG_DRIVE_PUSH_PULL, -- cgit v1.2.3 From ab781ec0e5e781849bd14291608c8626bac871e1 Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Wed, 29 Mar 2017 14:18:20 +0200 Subject: mfd: cpcap: Implement IRQ sense helper CPCAP can sense if IRQ is currently set or not. This functionality is required for a few subdevices, such as the power button and usb phy modules. 
Signed-off-by: Sebastian Reichel Acked-by: Tony Lindgren Signed-off-by: Lee Jones --- drivers/mfd/motorola-cpcap.c | 28 ++++++++++++++++++++++++++++ include/linux/mfd/motorola-cpcap.h | 2 ++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/drivers/mfd/motorola-cpcap.c b/drivers/mfd/motorola-cpcap.c index 6aeada7d7ce5..a9097efcefa5 100644 --- a/drivers/mfd/motorola-cpcap.c +++ b/drivers/mfd/motorola-cpcap.c @@ -23,6 +23,8 @@ #define CPCAP_NR_IRQ_REG_BANKS 6 #define CPCAP_NR_IRQ_CHIPS 3 +#define CPCAP_REGISTER_SIZE 4 +#define CPCAP_REGISTER_BITS 16 struct cpcap_ddata { struct spi_device *spi; @@ -32,6 +34,32 @@ struct cpcap_ddata { struct regmap *regmap; }; +static int cpcap_sense_irq(struct regmap *regmap, int irq) +{ + int regnum = irq / CPCAP_REGISTER_BITS; + int mask = BIT(irq % CPCAP_REGISTER_BITS); + int reg = CPCAP_REG_INTS1 + (regnum * CPCAP_REGISTER_SIZE); + int err, val; + + if (reg < CPCAP_REG_INTS1 || reg > CPCAP_REG_INTS4) + return -EINVAL; + + err = regmap_read(regmap, reg, &val); + if (err) + return err; + + return !!(val & mask); +} + +int cpcap_sense_virq(struct regmap *regmap, int virq) +{ + struct regmap_irq_chip_data *d = irq_get_chip_data(virq); + int irq_base = regmap_irq_chip_get_base(d); + + return cpcap_sense_irq(regmap, virq - irq_base); +} +EXPORT_SYMBOL_GPL(cpcap_sense_virq); + static int cpcap_check_revision(struct cpcap_ddata *cpcap) { u16 vendor, rev; diff --git a/include/linux/mfd/motorola-cpcap.h b/include/linux/mfd/motorola-cpcap.h index b4031c2b2214..793aa695faa0 100644 --- a/include/linux/mfd/motorola-cpcap.h +++ b/include/linux/mfd/motorola-cpcap.h @@ -290,3 +290,5 @@ static inline int cpcap_get_vendor(struct device *dev, return 0; } + +extern int cpcap_sense_virq(struct regmap *regmap, int virq); -- cgit v1.2.3 From 4244de1c64ded7f5438717bdce3fa074efd20efb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 11 Apr 2017 13:01:24 +0200 Subject: PCI: remove pci_enable_msix Unused now that all callers switched to pci_alloc_irq_vectors. Signed-off-by: Christoph Hellwig Acked-by: Bjorn Helgaas Signed-off-by: David S. Miller --- drivers/pci/msi.c | 21 --------------------- include/linux/pci.h | 4 ---- 2 files changed, 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index d571bc330686..0042c365b29b 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -973,27 +973,6 @@ static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, return msix_capability_init(dev, entries, nvec, affd); } -/** - * pci_enable_msix - configure device's MSI-X capability structure - * @dev: pointer to the pci_dev data structure of MSI-X device function - * @entries: pointer to an array of MSI-X entries (optional) - * @nvec: number of MSI-X irqs requested for allocation by device driver - * - * Setup the MSI-X capability structure of device function with the number - * of requested irqs upon its software driver call to request for - * MSI-X mode enabled on its hardware device function. A return of zero - * indicates the successful configuration of MSI-X capability structure - * with new allocated MSI-X irqs. A return of < 0 indicates a failure. - * Or a return of > 0 indicates that driver request is exceeding the number - * of irqs or MSI-X vectors available. Driver should use the returned value to - * re-send its request. 
- **/ -int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec) -{ - return __pci_enable_msix(dev, entries, nvec, NULL); -} -EXPORT_SYMBOL(pci_enable_msix); - void pci_msix_shutdown(struct pci_dev *dev) { struct msi_desc *entry; diff --git a/include/linux/pci.h b/include/linux/pci.h index eb3da1a04e6c..82dec36845e6 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1300,7 +1300,6 @@ int pci_msi_vec_count(struct pci_dev *dev); void pci_msi_shutdown(struct pci_dev *dev); void pci_disable_msi(struct pci_dev *dev); int pci_msix_vec_count(struct pci_dev *dev); -int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec); void pci_msix_shutdown(struct pci_dev *dev); void pci_disable_msix(struct pci_dev *dev); void pci_restore_msi_state(struct pci_dev *dev); @@ -1330,9 +1329,6 @@ static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; } static inline void pci_msi_shutdown(struct pci_dev *dev) { } static inline void pci_disable_msi(struct pci_dev *dev) { } static inline int pci_msix_vec_count(struct pci_dev *dev) { return -ENOSYS; } -static inline int pci_enable_msix(struct pci_dev *dev, - struct msix_entry *entries, int nvec) -{ return -ENOSYS; } static inline void pci_msix_shutdown(struct pci_dev *dev) { } static inline void pci_disable_msix(struct pci_dev *dev) { } static inline void pci_restore_msi_state(struct pci_dev *dev) { } -- cgit v1.2.3 From 083c52144a19c69b7956aa53c913ba621f7c5ae2 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 11 Apr 2017 09:39:45 +0100 Subject: drivers/perf: arm_pmu: define armpmu_init_fn We expect an ARM PMU's init function to have a particular prototype, which we open-code in a few places. This is less than ideal, considering that we cast a void value to this type in one location, and a mismatch could easily be missed. Add a typedef so that we can ensure this is consistent. Signed-off-by: Mark Rutland Tested-by: Jeremy Linton Cc: Will Deacon Signed-off-by: Will Deacon --- drivers/perf/arm_pmu.c | 2 +- include/linux/perf/arm_pmu.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 316c4dcc9856..71a825df47ed 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -989,7 +989,7 @@ int arm_pmu_device_probe(struct platform_device *pdev, const struct pmu_probe_info *probe_table) { const struct of_device_id *of_id; - const int (*init_fn)(struct arm_pmu *); + armpmu_init_fn init_fn; struct device_node *node = pdev->dev.of_node; struct arm_pmu *pmu; int ret = -ENODEV; diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 44f43fcf2524..4249914315a4 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -132,10 +132,12 @@ int armpmu_map_event(struct perf_event *event, [PERF_COUNT_HW_CACHE_RESULT_MAX], u32 raw_event_mask); +typedef int (*armpmu_init_fn)(struct arm_pmu *); + struct pmu_probe_info { unsigned int cpuid; unsigned int mask; - int (*init)(struct arm_pmu *); + armpmu_init_fn init; }; #define PMU_PROBE(_cpuid, _mask, _fn) \ -- cgit v1.2.3 From 18bfcfe51b8f60b69ab012888dea8061a9cd3381 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 11 Apr 2017 09:39:53 +0100 Subject: drivers/perf: arm_pmu: split out platform device probe logic Now that we've split the pdev and DT probing logic from the runtime management, let's move the former into its own file. 
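The driver-facing entry point is unchanged by the split; a PMU driver still probes as in the sketch below (editorial, with made-up names, using the arm_pmu_device_probe() signature from the diff):

	static int my_pmu_init(struct arm_pmu *pmu);	/* an armpmu_init_fn */

	static const struct of_device_id my_pmu_of_ids[] = {
		{ .compatible = "vendor,my-pmu", .data = my_pmu_init },
		{ /* sentinel */ },
	};

	static int my_pmu_probe(struct platform_device *pdev)
	{
		return arm_pmu_device_probe(pdev, my_pmu_of_ids, NULL);
	}
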
We gain a few lines due to the copyright header and includes, but this should keep the logic clearly separated, and paves the way for adding ACPI support in a similar fashion. Signed-off-by: Mark Rutland Tested-by: Jeremy Linton [will: rename nr_irqs to avoid conflict with global variable] Signed-off-by: Will Deacon --- drivers/perf/Makefile | 2 +- drivers/perf/arm_pmu.c | 226 +------------------------------------- drivers/perf/arm_pmu_platform.c | 235 ++++++++++++++++++++++++++++++++++++++++ include/linux/perf/arm_pmu.h | 7 ++ 4 files changed, 247 insertions(+), 223 deletions(-) create mode 100644 drivers/perf/arm_pmu_platform.c (limited to 'include/linux') diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index ef0c6b210345..925cd3903029 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_ARM_PMU) += arm_pmu.o +obj-$(CONFIG_ARM_PMU) += arm_pmu.o arm_pmu_platform.o obj-$(CONFIG_QCOM_L2_PMU) += qcom_l2_pmu.o obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index f387d6153099..b3bedfa512eb 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -25,7 +24,6 @@ #include #include -#include #include static int @@ -544,7 +542,7 @@ static void armpmu_free_irq(struct arm_pmu *armpmu, int cpu) free_irq(irq, per_cpu_ptr(&hw_events->percpu_pmu, cpu)); } -static void armpmu_free_irqs(struct arm_pmu *armpmu) +void armpmu_free_irqs(struct arm_pmu *armpmu) { int cpu; @@ -589,7 +587,7 @@ static int armpmu_request_irq(struct arm_pmu *armpmu, int cpu) return 0; } -static int armpmu_request_irqs(struct arm_pmu *armpmu) +int armpmu_request_irqs(struct arm_pmu *armpmu) { int cpu, err; @@ -783,161 +781,7 @@ static void cpu_pmu_destroy(struct arm_pmu *cpu_pmu) &cpu_pmu->node); } -/* - * CPU PMU identification and probing. - */ -static int probe_current_pmu(struct arm_pmu *pmu, - const struct pmu_probe_info *info) -{ - int cpu = get_cpu(); - unsigned int cpuid = read_cpuid_id(); - int ret = -ENODEV; - - pr_info("probing PMU on CPU %d\n", cpu); - - for (; info->init != NULL; info++) { - if ((cpuid & info->mask) != info->cpuid) - continue; - ret = info->init(pmu); - break; - } - - put_cpu(); - return ret; -} - -static int pmu_parse_percpu_irq(struct arm_pmu *pmu, int irq) -{ - int cpu, ret; - struct pmu_hw_events __percpu *hw_events = pmu->hw_events; - - ret = irq_get_percpu_devid_partition(irq, &pmu->supported_cpus); - if (ret) - return ret; - - for_each_cpu(cpu, &pmu->supported_cpus) - per_cpu(hw_events->irq, cpu) = irq; - - return 0; -} - -static bool pmu_has_irq_affinity(struct device_node *node) -{ - return !!of_find_property(node, "interrupt-affinity", NULL); -} - -static int pmu_parse_irq_affinity(struct device_node *node, int i) -{ - struct device_node *dn; - int cpu; - - /* - * If we don't have an interrupt-affinity property, we guess irq - * affinity matches our logical CPU order, as we used to assume. - * This is fragile, so we'll warn in pmu_parse_irqs(). 
- */ - if (!pmu_has_irq_affinity(node)) - return i; - - dn = of_parse_phandle(node, "interrupt-affinity", i); - if (!dn) { - pr_warn("failed to parse interrupt-affinity[%d] for %s\n", - i, node->name); - return -EINVAL; - } - - /* Now look up the logical CPU number */ - for_each_possible_cpu(cpu) { - struct device_node *cpu_dn; - - cpu_dn = of_cpu_device_node_get(cpu); - of_node_put(cpu_dn); - - if (dn == cpu_dn) - break; - } - - if (cpu >= nr_cpu_ids) { - pr_warn("failed to find logical CPU for %s\n", dn->name); - } - - of_node_put(dn); - - return cpu; -} - -static int pmu_parse_irqs(struct arm_pmu *pmu) -{ - int i = 0, irqs; - struct platform_device *pdev = pmu->plat_device; - struct pmu_hw_events __percpu *hw_events = pmu->hw_events; - - irqs = platform_irq_count(pdev); - if (irqs < 0) { - pr_err("unable to count PMU IRQs\n"); - return irqs; - } - - /* - * In this case we have no idea which CPUs are covered by the PMU. - * To match our prior behaviour, we assume all CPUs in this case. - */ - if (irqs == 0) { - pr_warn("no irqs for PMU, sampling events not supported\n"); - pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; - cpumask_setall(&pmu->supported_cpus); - return 0; - } - - if (irqs == 1) { - int irq = platform_get_irq(pdev, 0); - if (irq && irq_is_percpu(irq)) - return pmu_parse_percpu_irq(pmu, irq); - } - - if (!pmu_has_irq_affinity(pdev->dev.of_node)) { - pr_warn("no interrupt-affinity property for %s, guessing.\n", - of_node_full_name(pdev->dev.of_node)); - } - - /* - * Some platforms have all PMU IRQs OR'd into a single IRQ, with a - * special platdata function that attempts to demux them. - */ - if (dev_get_platdata(&pdev->dev)) - cpumask_setall(&pmu->supported_cpus); - - for (i = 0; i < irqs; i++) { - int cpu, irq; - - irq = platform_get_irq(pdev, i); - if (WARN_ON(irq <= 0)) - continue; - - if (irq_is_percpu(irq)) { - pr_warn("multiple PPIs or mismatched SPI/PPI detected\n"); - return -EINVAL; - } - - cpu = pmu_parse_irq_affinity(pdev->dev.of_node, i); - if (cpu < 0) - return cpu; - if (cpu >= nr_cpu_ids) - continue; - - if (per_cpu(hw_events->irq, cpu)) { - pr_warn("multiple PMU IRQs for the same CPU detected\n"); - return -EINVAL; - } - - per_cpu(hw_events->irq, cpu) = irq; - cpumask_set_cpu(cpu, &pmu->supported_cpus); - } - - return 0; -} - -static struct arm_pmu *armpmu_alloc(void) +struct arm_pmu *armpmu_alloc(void) { struct arm_pmu *pmu; int cpu; @@ -994,7 +838,7 @@ out: return NULL; } -static void armpmu_free(struct arm_pmu *pmu) +void armpmu_free(struct arm_pmu *pmu) { free_percpu(pmu->hw_events); kfree(pmu); @@ -1025,68 +869,6 @@ out_destroy: return ret; } -int arm_pmu_device_probe(struct platform_device *pdev, - const struct of_device_id *of_table, - const struct pmu_probe_info *probe_table) -{ - const struct of_device_id *of_id; - armpmu_init_fn init_fn; - struct device_node *node = pdev->dev.of_node; - struct arm_pmu *pmu; - int ret = -ENODEV; - - pmu = armpmu_alloc(); - if (!pmu) - return -ENOMEM; - - pmu->plat_device = pdev; - - ret = pmu_parse_irqs(pmu); - if (ret) - goto out_free; - - if (node && (of_id = of_match_node(of_table, pdev->dev.of_node))) { - init_fn = of_id->data; - - pmu->secure_access = of_property_read_bool(pdev->dev.of_node, - "secure-reg-access"); - - /* arm64 systems boot only as non-secure */ - if (IS_ENABLED(CONFIG_ARM64) && pmu->secure_access) { - pr_warn("ignoring \"secure-reg-access\" property for arm64\n"); - pmu->secure_access = false; - } - - ret = init_fn(pmu); - } else if (probe_table) { - cpumask_setall(&pmu->supported_cpus); 
- ret = probe_current_pmu(pmu, probe_table); - } - - if (ret) { - pr_info("%s: failed to probe PMU!\n", of_node_full_name(node)); - goto out_free; - } - - ret = armpmu_request_irqs(pmu); - if (ret) - goto out_free_irqs; - - ret = armpmu_register(pmu); - if (ret) - goto out_free; - - return 0; - -out_free_irqs: - armpmu_free_irqs(pmu); -out_free: - pr_info("%s: failed to register PMU devices!\n", - of_node_full_name(node)); - armpmu_free(pmu); - return ret; -} - static int arm_pmu_hp_init(void) { int ret; diff --git a/drivers/perf/arm_pmu_platform.c b/drivers/perf/arm_pmu_platform.c new file mode 100644 index 000000000000..69255f53057a --- /dev/null +++ b/drivers/perf/arm_pmu_platform.c @@ -0,0 +1,235 @@ +/* + * platform_device probing code for ARM performance counters. + * + * Copyright (C) 2009 picoChip Designs, Ltd., Jamie Iles + * Copyright (C) 2010 ARM Ltd., Will Deacon + */ +#define pr_fmt(fmt) "hw perfevents: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int probe_current_pmu(struct arm_pmu *pmu, + const struct pmu_probe_info *info) +{ + int cpu = get_cpu(); + unsigned int cpuid = read_cpuid_id(); + int ret = -ENODEV; + + pr_info("probing PMU on CPU %d\n", cpu); + + for (; info->init != NULL; info++) { + if ((cpuid & info->mask) != info->cpuid) + continue; + ret = info->init(pmu); + break; + } + + put_cpu(); + return ret; +} + +static int pmu_parse_percpu_irq(struct arm_pmu *pmu, int irq) +{ + int cpu, ret; + struct pmu_hw_events __percpu *hw_events = pmu->hw_events; + + ret = irq_get_percpu_devid_partition(irq, &pmu->supported_cpus); + if (ret) + return ret; + + for_each_cpu(cpu, &pmu->supported_cpus) + per_cpu(hw_events->irq, cpu) = irq; + + return 0; +} + +static bool pmu_has_irq_affinity(struct device_node *node) +{ + return !!of_find_property(node, "interrupt-affinity", NULL); +} + +static int pmu_parse_irq_affinity(struct device_node *node, int i) +{ + struct device_node *dn; + int cpu; + + /* + * If we don't have an interrupt-affinity property, we guess irq + * affinity matches our logical CPU order, as we used to assume. + * This is fragile, so we'll warn in pmu_parse_irqs(). + */ + if (!pmu_has_irq_affinity(node)) + return i; + + dn = of_parse_phandle(node, "interrupt-affinity", i); + if (!dn) { + pr_warn("failed to parse interrupt-affinity[%d] for %s\n", + i, node->name); + return -EINVAL; + } + + /* Now look up the logical CPU number */ + for_each_possible_cpu(cpu) { + struct device_node *cpu_dn; + + cpu_dn = of_cpu_device_node_get(cpu); + of_node_put(cpu_dn); + + if (dn == cpu_dn) + break; + } + + if (cpu >= nr_cpu_ids) { + pr_warn("failed to find logical CPU for %s\n", dn->name); + } + + of_node_put(dn); + + return cpu; +} + +static int pmu_parse_irqs(struct arm_pmu *pmu) +{ + int i = 0, num_irqs; + struct platform_device *pdev = pmu->plat_device; + struct pmu_hw_events __percpu *hw_events = pmu->hw_events; + + num_irqs = platform_irq_count(pdev); + if (num_irqs < 0) { + pr_err("unable to count PMU IRQs\n"); + return num_irqs; + } + + /* + * In this case we have no idea which CPUs are covered by the PMU. + * To match our prior behaviour, we assume all CPUs in this case. 
+ */ + if (num_irqs == 0) { + pr_warn("no irqs for PMU, sampling events not supported\n"); + pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; + cpumask_setall(&pmu->supported_cpus); + return 0; + } + + if (num_irqs == 1) { + int irq = platform_get_irq(pdev, 0); + if (irq && irq_is_percpu(irq)) + return pmu_parse_percpu_irq(pmu, irq); + } + + if (!pmu_has_irq_affinity(pdev->dev.of_node)) { + pr_warn("no interrupt-affinity property for %s, guessing.\n", + of_node_full_name(pdev->dev.of_node)); + } + + /* + * Some platforms have all PMU IRQs OR'd into a single IRQ, with a + * special platdata function that attempts to demux them. + */ + if (dev_get_platdata(&pdev->dev)) + cpumask_setall(&pmu->supported_cpus); + + for (i = 0; i < num_irqs; i++) { + int cpu, irq; + + irq = platform_get_irq(pdev, i); + if (WARN_ON(irq <= 0)) + continue; + + if (irq_is_percpu(irq)) { + pr_warn("multiple PPIs or mismatched SPI/PPI detected\n"); + return -EINVAL; + } + + cpu = pmu_parse_irq_affinity(pdev->dev.of_node, i); + if (cpu < 0) + return cpu; + if (cpu >= nr_cpu_ids) + continue; + + if (per_cpu(hw_events->irq, cpu)) { + pr_warn("multiple PMU IRQs for the same CPU detected\n"); + return -EINVAL; + } + + per_cpu(hw_events->irq, cpu) = irq; + cpumask_set_cpu(cpu, &pmu->supported_cpus); + } + + return 0; +} + +int arm_pmu_device_probe(struct platform_device *pdev, + const struct of_device_id *of_table, + const struct pmu_probe_info *probe_table) +{ + const struct of_device_id *of_id; + armpmu_init_fn init_fn; + struct device_node *node = pdev->dev.of_node; + struct arm_pmu *pmu; + int ret = -ENODEV; + + pmu = armpmu_alloc(); + if (!pmu) + return -ENOMEM; + + pmu->plat_device = pdev; + + ret = pmu_parse_irqs(pmu); + if (ret) + goto out_free; + + if (node && (of_id = of_match_node(of_table, pdev->dev.of_node))) { + init_fn = of_id->data; + + pmu->secure_access = of_property_read_bool(pdev->dev.of_node, + "secure-reg-access"); + + /* arm64 systems boot only as non-secure */ + if (IS_ENABLED(CONFIG_ARM64) && pmu->secure_access) { + pr_warn("ignoring \"secure-reg-access\" property for arm64\n"); + pmu->secure_access = false; + } + + ret = init_fn(pmu); + } else if (probe_table) { + cpumask_setall(&pmu->supported_cpus); + ret = probe_current_pmu(pmu, probe_table); + } + + if (ret) { + pr_info("%s: failed to probe PMU!\n", of_node_full_name(node)); + goto out_free; + } + + ret = armpmu_request_irqs(pmu); + if (ret) + goto out_free_irqs; + + ret = armpmu_register(pmu); + if (ret) + goto out_free; + + return 0; + +out_free_irqs: + armpmu_free_irqs(pmu); +out_free: + pr_info("%s: failed to register PMU devices!\n", + of_node_full_name(node)); + armpmu_free(pmu); + return ret; +} diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 4249914315a4..25556ebb1c7b 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -159,6 +159,13 @@ int arm_pmu_device_probe(struct platform_device *pdev, const struct of_device_id *of_table, const struct pmu_probe_info *probe_table); +/* Internal functions only for core arm_pmu code */ +struct arm_pmu *armpmu_alloc(void); +void armpmu_free(struct arm_pmu *pmu); +int armpmu_register(struct arm_pmu *pmu); +int armpmu_request_irqs(struct arm_pmu *armpmu); +void armpmu_free_irqs(struct arm_pmu *armpmu); + #define ARMV8_PMU_PDEV_NAME "armv8-pmu" #endif /* CONFIG_ARM_PMU */ -- cgit v1.2.3 From 45736a72fb79b204c1fbdb08a1e1a2aa52c7281a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 11 Apr 2017 09:39:55 +0100 Subject: drivers/perf: arm_pmu: 
add ACPI framework This patch adds framework code to handle parsing PMU data out of the MADT, sanity checking this, and managing the association of CPUs (and their interrupts) with appropriate logical PMUs. For the time being, we expect that only one PMU driver (PMUv3) will make use of this, and we simply pass in a single probe function. This is based on an earlier patch from Jeremy Linton. Signed-off-by: Mark Rutland Tested-by: Jeremy Linton Cc: Will Deacon Signed-off-by: Will Deacon --- drivers/perf/Kconfig | 4 + drivers/perf/Makefile | 1 + drivers/perf/arm_pmu.c | 4 +- drivers/perf/arm_pmu_acpi.c | 256 +++++++++++++++++++++++++++++++++++++++++++ include/linux/cpuhotplug.h | 1 + include/linux/perf/arm_pmu.h | 11 ++ 6 files changed, 275 insertions(+), 2 deletions(-) create mode 100644 drivers/perf/arm_pmu_acpi.c (limited to 'include/linux') diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index c436e0d303e7..aa587edaf9ea 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -12,6 +12,10 @@ config ARM_PMU Say y if you want to use CPU performance monitors on ARM-based systems. +config ARM_PMU_ACPI + depends on ARM_PMU && ACPI + def_bool y + config QCOM_L2_PMU bool "Qualcomm Technologies L2-cache PMU" depends on ARCH_QCOM && ARM64 && PERF_EVENTS && ACPI diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index 925cd3903029..6420bd4394d5 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_ARM_PMU) += arm_pmu.o arm_pmu_platform.o +obj-$(CONFIG_ARM_PMU_ACPI) += arm_pmu_acpi.o obj-$(CONFIG_QCOM_L2_PMU) += qcom_l2_pmu.o obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index b3bedfa512eb..dc459eb1246b 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -525,7 +525,7 @@ int perf_num_counters(void) } EXPORT_SYMBOL_GPL(perf_num_counters); -static void armpmu_free_irq(struct arm_pmu *armpmu, int cpu) +void armpmu_free_irq(struct arm_pmu *armpmu, int cpu) { struct pmu_hw_events __percpu *hw_events = armpmu->hw_events; int irq = per_cpu(hw_events->irq, cpu); @@ -550,7 +550,7 @@ void armpmu_free_irqs(struct arm_pmu *armpmu) armpmu_free_irq(armpmu, cpu); } -static int armpmu_request_irq(struct arm_pmu *armpmu, int cpu) +int armpmu_request_irq(struct arm_pmu *armpmu, int cpu) { int err = 0; struct pmu_hw_events __percpu *hw_events = armpmu->hw_events; diff --git a/drivers/perf/arm_pmu_acpi.c b/drivers/perf/arm_pmu_acpi.c new file mode 100644 index 000000000000..34c862f213c7 --- /dev/null +++ b/drivers/perf/arm_pmu_acpi.c @@ -0,0 +1,256 @@ +/* + * ACPI probing code for ARM performance counters. + * + * Copyright (C) 2017 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +#include + +static DEFINE_PER_CPU(struct arm_pmu *, probed_pmus); +static DEFINE_PER_CPU(int, pmu_irqs); + +static int arm_pmu_acpi_register_irq(int cpu) +{ + struct acpi_madt_generic_interrupt *gicc; + int gsi, trigger; + + gicc = acpi_cpu_get_madt_gicc(cpu); + if (WARN_ON(!gicc)) + return -EINVAL; + + gsi = gicc->performance_interrupt; + if (gicc->flags & ACPI_MADT_PERFORMANCE_IRQ_MODE) + trigger = ACPI_EDGE_SENSITIVE; + else + trigger = ACPI_LEVEL_SENSITIVE; + + /* + * Helpfully, the MADT GICC doesn't have a polarity flag for the + * "performance interrupt". 
Luckily, on compliant GICs the polarity is + * a fixed value in HW (for both SPIs and PPIs) that we cannot change + * from SW. + * + * Here we pass in ACPI_ACTIVE_HIGH to keep the core code happy. This + * may not match the real polarity, but that should not matter. + * + * Other interrupt controllers are not supported with ACPI. + */ + return acpi_register_gsi(NULL, gsi, trigger, ACPI_ACTIVE_HIGH); +} + +static void arm_pmu_acpi_unregister_irq(int cpu) +{ + struct acpi_madt_generic_interrupt *gicc; + int gsi; + + gicc = acpi_cpu_get_madt_gicc(cpu); + if (!gicc) + return; + + gsi = gicc->performance_interrupt; + acpi_unregister_gsi(gsi); +} + +static int arm_pmu_acpi_parse_irqs(void) +{ + int irq, cpu, irq_cpu, err; + + for_each_possible_cpu(cpu) { + irq = arm_pmu_acpi_register_irq(cpu); + if (irq < 0) { + err = irq; + pr_warn("Unable to parse ACPI PMU IRQ for CPU%d: %d\n", + cpu, err); + goto out_err; + } else if (irq == 0) { + pr_warn("No ACPI PMU IRQ for CPU%d\n", cpu); + } + + per_cpu(pmu_irqs, cpu) = irq; + } + + return 0; + +out_err: + for_each_possible_cpu(cpu) { + irq = per_cpu(pmu_irqs, cpu); + if (!irq) + continue; + + arm_pmu_acpi_unregister_irq(cpu); + + /* + * Blat all copies of the IRQ so that we only unregister the + * corresponding GSI once (e.g. when we have PPIs). + */ + for_each_possible_cpu(irq_cpu) { + if (per_cpu(pmu_irqs, irq_cpu) == irq) + per_cpu(pmu_irqs, irq_cpu) = 0; + } + } + + return err; +} + +static struct arm_pmu *arm_pmu_acpi_find_alloc_pmu(void) +{ + unsigned long cpuid = read_cpuid_id(); + struct arm_pmu *pmu; + int cpu; + + for_each_possible_cpu(cpu) { + pmu = per_cpu(probed_pmus, cpu); + if (!pmu || pmu->acpi_cpuid != cpuid) + continue; + + return pmu; + } + + pmu = armpmu_alloc(); + if (!pmu) { + pr_warn("Unable to allocate PMU for CPU%d\n", + smp_processor_id()); + return NULL; + } + + pmu->acpi_cpuid = cpuid; + + return pmu; +} + +/* + * This must run before the common arm_pmu hotplug logic, so that we can + * associate a CPU and its interrupt before the common code tries to manage the + * affinity and so on. + * + * Note that hotplug events are serialized, so we cannot race with another CPU + * coming up. The perf core won't open events while a hotplug event is in + * progress. + */ +static int arm_pmu_acpi_cpu_starting(unsigned int cpu) +{ + struct arm_pmu *pmu; + struct pmu_hw_events __percpu *hw_events; + int irq; + + /* If we've already probed this CPU, we have nothing to do */ + if (per_cpu(probed_pmus, cpu)) + return 0; + + irq = per_cpu(pmu_irqs, cpu); + + pmu = arm_pmu_acpi_find_alloc_pmu(); + if (!pmu) + return -ENOMEM; + + cpumask_set_cpu(cpu, &pmu->supported_cpus); + + per_cpu(probed_pmus, cpu) = pmu; + + /* + * Log and request the IRQ so the core arm_pmu code can manage it. In + * some situations (e.g. mismatched PPIs), we may fail to request the + * IRQ. However, it may be too late for us to do anything about it. + * The common ARM PMU code will log a warning in this case. + */ + hw_events = pmu->hw_events; + per_cpu(hw_events->irq, cpu) = irq; + armpmu_request_irq(pmu, cpu); + + /* + * Ideally, we'd probe the PMU here when we find the first matching + * CPU. We can't do that for several reasons; see the comment in + * arm_pmu_acpi_init(). + * + * So for the time being, we're done. + */ + return 0; +} + +int arm_pmu_acpi_probe(armpmu_init_fn init_fn) +{ + int pmu_idx = 0; + int cpu, ret; + + if (acpi_disabled) + return 0; + + /* + * Initialise and register the set of PMUs which we know about right + * now. 
Ideally we'd do this in arm_pmu_acpi_cpu_starting() so that we + * could handle late hotplug, but this may lead to deadlock since we + * might try to register a hotplug notifier instance from within a + * hotplug notifier. + * + * There's also the problem of having access to the right init_fn, + * without tying this too deeply into the "real" PMU driver. + * + * For the moment, as with the platform/DT case, we need at least one + * of a PMU's CPUs to be online at probe time. + */ + for_each_possible_cpu(cpu) { + struct arm_pmu *pmu = per_cpu(probed_pmus, cpu); + char *base_name; + + if (!pmu || pmu->name) + continue; + + ret = init_fn(pmu); + if (ret == -ENODEV) { + /* PMU not handled by this driver, or not present */ + continue; + } else if (ret) { + pr_warn("Unable to initialise PMU for CPU%d\n", cpu); + return ret; + } + + base_name = pmu->name; + pmu->name = kasprintf(GFP_KERNEL, "%s_%d", base_name, pmu_idx++); + if (!pmu->name) { + pr_warn("Unable to allocate PMU name for CPU%d\n", cpu); + return -ENOMEM; + } + + ret = armpmu_register(pmu); + if (ret) { + pr_warn("Failed to register PMU for CPU%d\n", cpu); + return ret; + } + } + + return 0; +} + +static int arm_pmu_acpi_init(void) +{ + int ret; + + if (acpi_disabled) + return 0; + + /* + * We can't request IRQs yet, since we don't know the cookie value + * until we know which CPUs share the same logical PMU. We'll handle + * that in arm_pmu_acpi_cpu_starting(). + */ + ret = arm_pmu_acpi_parse_irqs(); + if (ret) + return ret; + + ret = cpuhp_setup_state(CPUHP_AP_PERF_ARM_ACPI_STARTING, + "perf/arm/pmu_acpi:starting", + arm_pmu_acpi_cpu_starting, NULL); + + return ret; +} +subsys_initcall(arm_pmu_acpi_init) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index cfcfab37d9c4..0f2a80377520 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -94,6 +94,7 @@ enum cpuhp_state { CPUHP_AP_ARM_VFP_STARTING, CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING, CPUHP_AP_PERF_ARM_HW_BREAKPOINT_STARTING, + CPUHP_AP_PERF_ARM_ACPI_STARTING, CPUHP_AP_PERF_ARM_STARTING, CPUHP_AP_ARM_L2X0_STARTING, CPUHP_AP_ARM_ARCH_TIMER_STARTING, diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 25556ebb1c7b..1360dd6d5e61 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -117,6 +117,9 @@ struct arm_pmu { struct notifier_block cpu_pm_nb; /* the attr_groups array must be NULL-terminated */ const struct attribute_group *attr_groups[ARMPMU_NR_ATTR_GROUPS + 1]; + + /* Only to be used by ACPI probing code */ + unsigned long acpi_cpuid; }; #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu)) @@ -159,12 +162,20 @@ int arm_pmu_device_probe(struct platform_device *pdev, const struct of_device_id *of_table, const struct pmu_probe_info *probe_table); +#ifdef CONFIG_ACPI +int arm_pmu_acpi_probe(armpmu_init_fn init_fn); +#else +static inline int arm_pmu_acpi_probe(armpmu_init_fn init_fn) { return 0; } +#endif + /* Internal functions only for core arm_pmu code */ struct arm_pmu *armpmu_alloc(void); void armpmu_free(struct arm_pmu *pmu); int armpmu_register(struct arm_pmu *pmu); int armpmu_request_irqs(struct arm_pmu *armpmu); void armpmu_free_irqs(struct arm_pmu *armpmu); +int armpmu_request_irq(struct arm_pmu *armpmu, int cpu); +void armpmu_free_irq(struct arm_pmu *armpmu, int cpu); #define ARMV8_PMU_PDEV_NAME "armv8-pmu" -- cgit v1.2.3 From 50512625da06c41517cb596f51b923ce15f401a4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 5 Apr 2017 14:05:50 +1000 Subject: Revert "block: 
introduce bio_copy_data_partial" This reverts commit 6f8802852f7e58a12177a86179803b9efaad98e2. bio_copy_data_partial() is no longer needed. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- block/bio.c | 60 ++++++++++++----------------------------------------- include/linux/bio.h | 2 -- 2 files changed, 13 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 036435995c55..12c2837c4277 100644 --- a/block/bio.c +++ b/block/bio.c @@ -990,8 +990,19 @@ int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) } EXPORT_SYMBOL(bio_alloc_pages); -static void __bio_copy_data(struct bio *dst, struct bio *src, - int offset, int size) +/** + * bio_copy_data - copy contents of data buffers from one chain of bios to + * another + * @src: source bio list + * @dst: destination bio list + * + * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats + * @src and @dst as linked lists of bios. + * + * Stops when it reaches the end of either @src or @dst - that is, copies + * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). + */ +void bio_copy_data(struct bio *dst, struct bio *src) { struct bvec_iter src_iter, dst_iter; struct bio_vec src_bv, dst_bv; @@ -1001,12 +1012,6 @@ static void __bio_copy_data(struct bio *dst, struct bio *src, src_iter = src->bi_iter; dst_iter = dst->bi_iter; - /* for supporting partial copy */ - if (offset || size != src->bi_iter.bi_size) { - bio_advance_iter(src, &src_iter, offset); - src_iter.bi_size = size; - } - while (1) { if (!src_iter.bi_size) { src = src->bi_next; @@ -1043,47 +1048,8 @@ static void __bio_copy_data(struct bio *dst, struct bio *src, bio_advance_iter(dst, &dst_iter, bytes); } } - -/** - * bio_copy_data - copy contents of data buffers from one chain of bios to - * another - * @src: source bio list - * @dst: destination bio list - * - * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats - * @src and @dst as linked lists of bios. - * - * Stops when it reaches the end of either @src or @dst - that is, copies - * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). - */ -void bio_copy_data(struct bio *dst, struct bio *src) -{ - __bio_copy_data(dst, src, 0, src->bi_iter.bi_size); -} EXPORT_SYMBOL(bio_copy_data); -/** - * bio_copy_data_partial - copy partial contents of data buffers from one - * chain of bios to another - * @dst: destination bio list - * @src: source bio list - * @offset: starting copy from the offset - * @size: how many bytes to copy - * - * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats - * @src and @dst as linked lists of bios. - * - * Stops when it reaches the end of either @src or @dst - that is, copies - * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). 
- */ -void bio_copy_data_partial(struct bio *dst, struct bio *src, - int offset, int size) -{ - __bio_copy_data(dst, src, offset, size); - -} -EXPORT_SYMBOL(bio_copy_data_partial); - struct bio_map_data { int is_our_pages; struct iov_iter iter; diff --git a/include/linux/bio.h b/include/linux/bio.h index fafef6343d1b..7cf8a6c70a3f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -461,8 +461,6 @@ static inline void bio_flush_dcache_pages(struct bio *bi) #endif extern void bio_copy_data(struct bio *dst, struct bio *src); -extern void bio_copy_data_partial(struct bio *dst, struct bio *src, - int offset, int size); extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); extern void bio_free_pages(struct bio *bio); -- cgit v1.2.3 From be9370a7d8614d1fa54649c75de14458e79b91ec Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 11 Apr 2017 15:34:57 +0200 Subject: bpf: remove struct bpf_prog_type_list There's no need to have struct bpf_prog_type_list since it just contains a list_head, the type, and the ops pointer. Since the types are densely packed and not actually dynamically registered, it's much easier and smaller to have an array of type->ops pointer. Also initialize this array statically to remove code needed to initialize it. In order to save duplicating the list, move it to a new header file and include it in the places needing it. Signed-off-by: Johannes Berg Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 16 ++++------ include/linux/bpf_types.h | 18 ++++++++++++ kernel/bpf/syscall.c | 27 +++++++---------- kernel/trace/bpf_trace.c | 30 ++----------------- net/core/filter.c | 75 +++++------------------------------------------ 5 files changed, 44 insertions(+), 122 deletions(-) create mode 100644 include/linux/bpf_types.h (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bbb513da5075..07fc02bb38e4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -173,12 +173,6 @@ struct bpf_verifier_ops { union bpf_attr __user *uattr); }; -struct bpf_prog_type_list { - struct list_head list_node; - const struct bpf_verifier_ops *ops; - enum bpf_prog_type type; -}; - struct bpf_prog_aux { atomic_t refcnt; u32 used_map_cnt; @@ -243,7 +237,11 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); -void bpf_register_prog_type(struct bpf_prog_type_list *tl); +#define BPF_PROG_TYPE(_id, _ops) \ + extern const struct bpf_verifier_ops _ops; +#include +#undef BPF_PROG_TYPE + void bpf_register_map_type(struct bpf_map_type_list *tl); struct bpf_prog *bpf_prog_get(u32 ufd); @@ -306,10 +304,6 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); #else -static inline void bpf_register_prog_type(struct bpf_prog_type_list *tl) -{ -} - static inline struct bpf_prog *bpf_prog_get(u32 ufd) { return ERR_PTR(-EOPNOTSUPP); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h new file mode 100644 index 000000000000..68b0a9811216 --- /dev/null +++ b/include/linux/bpf_types.h @@ -0,0 +1,18 @@ +/* internal file - do not include directly */ + +#ifdef CONFIG_NET +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, 
xdp_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit_prog_ops) +#endif +#ifdef CONFIG_BPF_EVENTS +BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event_prog_ops) +#endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ab0cf4c43690..ea55691cbf5e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -573,26 +573,21 @@ err_put: return err; } -static LIST_HEAD(bpf_prog_types); +static const struct bpf_verifier_ops * const bpf_prog_types[] = { +#define BPF_PROG_TYPE(_id, _ops) \ + [_id] = &_ops, +#include +#undef BPF_PROG_TYPE +}; static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) { - struct bpf_prog_type_list *tl; - - list_for_each_entry(tl, &bpf_prog_types, list_node) { - if (tl->type == type) { - prog->aux->ops = tl->ops; - prog->type = type; - return 0; - } - } - - return -EINVAL; -} + if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) + return -EINVAL; -void bpf_register_prog_type(struct bpf_prog_type_list *tl) -{ - list_add(&tl->list_node, &bpf_prog_types); + prog->aux->ops = bpf_prog_types[type]; + prog->type = type; + return 0; } /* drop refcnt on maps used by eBPF program and free auxilary data */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cee9802cf3e0..8a4efac28710 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -501,16 +501,11 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return true; } -static const struct bpf_verifier_ops kprobe_prog_ops = { +const struct bpf_verifier_ops kprobe_prog_ops = { .get_func_proto = kprobe_prog_func_proto, .is_valid_access = kprobe_prog_is_valid_access, }; -static struct bpf_prog_type_list kprobe_tl __ro_after_init = { - .ops = &kprobe_prog_ops, - .type = BPF_PROG_TYPE_KPROBE, -}; - BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, u64, flags, void *, data, u64, size) { @@ -584,16 +579,11 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type return true; } -static const struct bpf_verifier_ops tracepoint_prog_ops = { +const struct bpf_verifier_ops tracepoint_prog_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = tp_prog_is_valid_access, }; -static struct bpf_prog_type_list tracepoint_tl __ro_after_init = { - .ops = &tracepoint_prog_ops, - .type = BPF_PROG_TYPE_TRACEPOINT, -}; - static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type) { @@ -642,22 +632,8 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -static const struct bpf_verifier_ops perf_event_prog_ops = { +const struct bpf_verifier_ops perf_event_prog_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; - -static struct bpf_prog_type_list perf_event_tl __ro_after_init = { - .ops = &perf_event_prog_ops, - .type = BPF_PROG_TYPE_PERF_EVENT, -}; - -static int __init register_kprobe_prog_ops(void) -{ - bpf_register_prog_type(&kprobe_tl); - bpf_register_prog_type(&tracepoint_tl); - bpf_register_prog_type(&perf_event_tl); - return 0; -} 
-late_initcall(register_kprobe_prog_ops); diff --git a/net/core/filter.c b/net/core/filter.c index 15e9a81ffebe..bbe0cf415105 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3298,13 +3298,13 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -static const struct bpf_verifier_ops sk_filter_ops = { +const struct bpf_verifier_ops sk_filter_prog_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; -static const struct bpf_verifier_ops tc_cls_act_ops = { +const struct bpf_verifier_ops tc_cls_act_prog_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, @@ -3312,28 +3312,28 @@ static const struct bpf_verifier_ops tc_cls_act_ops = { .test_run = bpf_prog_test_run_skb, }; -static const struct bpf_verifier_ops xdp_ops = { +const struct bpf_verifier_ops xdp_prog_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, .test_run = bpf_prog_test_run_xdp, }; -static const struct bpf_verifier_ops cg_skb_ops = { +const struct bpf_verifier_ops cg_skb_prog_ops = { .get_func_proto = cg_skb_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .test_run = bpf_prog_test_run_skb, }; -static const struct bpf_verifier_ops lwt_inout_ops = { +const struct bpf_verifier_ops lwt_inout_prog_ops = { .get_func_proto = lwt_inout_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .test_run = bpf_prog_test_run_skb, }; -static const struct bpf_verifier_ops lwt_xmit_ops = { +const struct bpf_verifier_ops lwt_xmit_prog_ops = { .get_func_proto = lwt_xmit_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, @@ -3341,73 +3341,12 @@ static const struct bpf_verifier_ops lwt_xmit_ops = { .test_run = bpf_prog_test_run_skb, }; -static const struct bpf_verifier_ops cg_sock_ops = { +const struct bpf_verifier_ops cg_sock_prog_ops = { .get_func_proto = bpf_base_func_proto, .is_valid_access = sock_filter_is_valid_access, .convert_ctx_access = sock_filter_convert_ctx_access, }; -static struct bpf_prog_type_list sk_filter_type __ro_after_init = { - .ops = &sk_filter_ops, - .type = BPF_PROG_TYPE_SOCKET_FILTER, -}; - -static struct bpf_prog_type_list sched_cls_type __ro_after_init = { - .ops = &tc_cls_act_ops, - .type = BPF_PROG_TYPE_SCHED_CLS, -}; - -static struct bpf_prog_type_list sched_act_type __ro_after_init = { - .ops = &tc_cls_act_ops, - .type = BPF_PROG_TYPE_SCHED_ACT, -}; - -static struct bpf_prog_type_list xdp_type __ro_after_init = { - .ops = &xdp_ops, - .type = BPF_PROG_TYPE_XDP, -}; - -static struct bpf_prog_type_list cg_skb_type __ro_after_init = { - .ops = &cg_skb_ops, - .type = BPF_PROG_TYPE_CGROUP_SKB, -}; - -static struct bpf_prog_type_list lwt_in_type __ro_after_init = { - .ops = &lwt_inout_ops, - .type = BPF_PROG_TYPE_LWT_IN, -}; - -static struct bpf_prog_type_list lwt_out_type __ro_after_init = { - .ops = &lwt_inout_ops, - .type = BPF_PROG_TYPE_LWT_OUT, -}; - -static struct bpf_prog_type_list lwt_xmit_type __ro_after_init = { - .ops = &lwt_xmit_ops, - .type = BPF_PROG_TYPE_LWT_XMIT, -}; - -static struct bpf_prog_type_list cg_sock_type __ro_after_init = { - .ops = &cg_sock_ops, - .type = BPF_PROG_TYPE_CGROUP_SOCK -}; - -static int __init register_sk_filter_ops(void) -{ - 
bpf_register_prog_type(&sk_filter_type); - bpf_register_prog_type(&sched_cls_type); - bpf_register_prog_type(&sched_act_type); - bpf_register_prog_type(&xdp_type); - bpf_register_prog_type(&cg_skb_type); - bpf_register_prog_type(&cg_sock_type); - bpf_register_prog_type(&lwt_in_type); - bpf_register_prog_type(&lwt_out_type); - bpf_register_prog_type(&lwt_xmit_type); - - return 0; -} -late_initcall(register_sk_filter_ops); - int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; -- cgit v1.2.3 From 40077e0cf62206ac3c315b6991d8dcddb3703286 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 11 Apr 2017 15:34:58 +0200 Subject: bpf: remove struct bpf_map_type_list There's no need to have struct bpf_map_type_list since it just contains a list_head, the type, and the ops pointer. Since the types are densely packed and not actually dynamically registered, it's much easier and smaller to have an array of type->ops pointer. Also initialize this array statically to remove code needed to initialize it. In order to save duplicating the list, move it to the types header file added by the previous patch and include it in the same fashion. Signed-off-by: Johannes Berg Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 11 ++----- include/linux/bpf_types.h | 18 +++++++++++ kernel/bpf/arraymap.c | 78 ++++------------------------------------------- kernel/bpf/hashtab.c | 46 +++------------------------- kernel/bpf/lpm_trie.c | 14 +-------- kernel/bpf/stackmap.c | 14 +-------- kernel/bpf/syscall.c | 37 +++++++++++----------- 7 files changed, 53 insertions(+), 165 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 07fc02bb38e4..6bb38d76faf4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -53,12 +53,6 @@ struct bpf_map { struct bpf_map *inner_map_meta; }; -struct bpf_map_type_list { - struct list_head list_node; - const struct bpf_map_ops *ops; - enum bpf_map_type type; -}; - /* function argument constraints */ enum bpf_arg_type { ARG_DONTCARE = 0, /* unused argument in helper function */ @@ -239,10 +233,11 @@ DECLARE_PER_CPU(int, bpf_prog_active); #define BPF_PROG_TYPE(_id, _ops) \ extern const struct bpf_verifier_ops _ops; +#define BPF_MAP_TYPE(_id, _ops) \ + extern const struct bpf_map_ops _ops; #include #undef BPF_PROG_TYPE - -void bpf_register_map_type(struct bpf_map_type_list *tl); +#undef BPF_MAP_TYPE struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 68b0a9811216..03bf223f18be 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -16,3 +16,21 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe_prog_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint_prog_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event_prog_ops) #endif + +BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_PROG_ARRAY, prog_array_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops) +#ifdef CONFIG_CGROUPS +BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops) +#endif +BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH, htab_lru_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_PERCPU_HASH, htab_lru_percpu_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_LPM_TRIE, 
trie_map_ops) +#ifdef CONFIG_PERF_EVENTS +BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_map_ops) +#endif +BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index bc9da93db403..ec621df5a97a 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -287,7 +287,7 @@ static void array_map_free(struct bpf_map *map) bpf_map_area_free(array); } -static const struct bpf_map_ops array_ops = { +const struct bpf_map_ops array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -297,12 +297,7 @@ static const struct bpf_map_ops array_ops = { .map_gen_lookup = array_map_gen_lookup, }; -static struct bpf_map_type_list array_type __ro_after_init = { - .ops = &array_ops, - .type = BPF_MAP_TYPE_ARRAY, -}; - -static const struct bpf_map_ops percpu_array_ops = { +const struct bpf_map_ops percpu_array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -311,19 +306,6 @@ static const struct bpf_map_ops percpu_array_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map_type_list percpu_array_type __ro_after_init = { - .ops = &percpu_array_ops, - .type = BPF_MAP_TYPE_PERCPU_ARRAY, -}; - -static int __init register_array_map(void) -{ - bpf_register_map_type(&array_type); - bpf_register_map_type(&percpu_array_type); - return 0; -} -late_initcall(register_array_map); - static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) { /* only file descriptors can be stored in this type of map */ @@ -427,7 +409,7 @@ void bpf_fd_array_map_clear(struct bpf_map *map) fd_array_map_delete_elem(map, &i); } -static const struct bpf_map_ops prog_array_ops = { +const struct bpf_map_ops prog_array_map_ops = { .map_alloc = fd_array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, @@ -437,18 +419,6 @@ static const struct bpf_map_ops prog_array_ops = { .map_fd_put_ptr = prog_fd_array_put_ptr, }; -static struct bpf_map_type_list prog_array_type __ro_after_init = { - .ops = &prog_array_ops, - .type = BPF_MAP_TYPE_PROG_ARRAY, -}; - -static int __init register_prog_array_map(void) -{ - bpf_register_map_type(&prog_array_type); - return 0; -} -late_initcall(register_prog_array_map); - static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, struct file *map_file) { @@ -539,7 +509,7 @@ static void perf_event_fd_array_release(struct bpf_map *map, rcu_read_unlock(); } -static const struct bpf_map_ops perf_event_array_ops = { +const struct bpf_map_ops perf_event_array_map_ops = { .map_alloc = fd_array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, @@ -550,18 +520,6 @@ static const struct bpf_map_ops perf_event_array_ops = { .map_release = perf_event_fd_array_release, }; -static struct bpf_map_type_list perf_event_array_type __ro_after_init = { - .ops = &perf_event_array_ops, - .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, -}; - -static int __init register_perf_event_array_map(void) -{ - bpf_register_map_type(&perf_event_array_type); - return 0; -} -late_initcall(register_perf_event_array_map); - #ifdef CONFIG_CGROUPS static void *cgroup_fd_array_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, @@ -582,7 +540,7 @@ static void cgroup_fd_array_free(struct bpf_map *map) fd_array_map_free(map); } -static const struct bpf_map_ops cgroup_array_ops = { 
+const struct bpf_map_ops cgroup_array_map_ops = { .map_alloc = fd_array_map_alloc, .map_free = cgroup_fd_array_free, .map_get_next_key = array_map_get_next_key, @@ -591,18 +549,6 @@ static const struct bpf_map_ops cgroup_array_ops = { .map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, }; - -static struct bpf_map_type_list cgroup_array_type __ro_after_init = { - .ops = &cgroup_array_ops, - .type = BPF_MAP_TYPE_CGROUP_ARRAY, -}; - -static int __init register_cgroup_array_map(void) -{ - bpf_register_map_type(&cgroup_array_type); - return 0; -} -late_initcall(register_cgroup_array_map); #endif static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) @@ -644,7 +590,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) return READ_ONCE(*inner_map); } -static const struct bpf_map_ops array_of_map_ops = { +const struct bpf_map_ops array_of_maps_map_ops = { .map_alloc = array_of_map_alloc, .map_free = array_of_map_free, .map_get_next_key = array_map_get_next_key, @@ -653,15 +599,3 @@ static const struct bpf_map_ops array_of_map_ops = { .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, }; - -static struct bpf_map_type_list array_of_map_type __ro_after_init = { - .ops = &array_of_map_ops, - .type = BPF_MAP_TYPE_ARRAY_OF_MAPS, -}; - -static int __init register_array_of_map(void) -{ - bpf_register_map_type(&array_of_map_type); - return 0; -} -late_initcall(register_array_of_map); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d5b0623ce87d..bc80c038e430 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1096,7 +1096,7 @@ static void htab_map_free(struct bpf_map *map) kfree(htab); } -static const struct bpf_map_ops htab_ops = { +const struct bpf_map_ops htab_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1106,12 +1106,7 @@ static const struct bpf_map_ops htab_ops = { .map_gen_lookup = htab_map_gen_lookup, }; -static struct bpf_map_type_list htab_type __ro_after_init = { - .ops = &htab_ops, - .type = BPF_MAP_TYPE_HASH, -}; - -static const struct bpf_map_ops htab_lru_ops = { +const struct bpf_map_ops htab_lru_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1120,11 +1115,6 @@ static const struct bpf_map_ops htab_lru_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map_type_list htab_lru_type __ro_after_init = { - .ops = &htab_lru_ops, - .type = BPF_MAP_TYPE_LRU_HASH, -}; - /* Called from eBPF program */ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) { @@ -1198,7 +1188,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, return ret; } -static const struct bpf_map_ops htab_percpu_ops = { +const struct bpf_map_ops htab_percpu_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1207,12 +1197,7 @@ static const struct bpf_map_ops htab_percpu_ops = { .map_delete_elem = htab_map_delete_elem, }; -static struct bpf_map_type_list htab_percpu_type __ro_after_init = { - .ops = &htab_percpu_ops, - .type = BPF_MAP_TYPE_PERCPU_HASH, -}; - -static const struct bpf_map_ops htab_lru_percpu_ops = { +const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1221,11 +1206,6 @@ static const struct bpf_map_ops htab_lru_percpu_ops = { 
.map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = { - .ops = &htab_lru_percpu_ops, - .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, -}; - static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) { struct bpf_map *map; @@ -1316,7 +1296,7 @@ static void htab_of_map_free(struct bpf_map *map) fd_htab_map_free(map); } -static const struct bpf_map_ops htab_of_map_ops = { +const struct bpf_map_ops htab_of_maps_map_ops = { .map_alloc = htab_of_map_alloc, .map_free = htab_of_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1325,19 +1305,3 @@ static const struct bpf_map_ops htab_of_map_ops = { .map_fd_get_ptr = bpf_map_fd_get_ptr, .map_fd_put_ptr = bpf_map_fd_put_ptr, }; - -static struct bpf_map_type_list htab_of_map_type __ro_after_init = { - .ops = &htab_of_map_ops, - .type = BPF_MAP_TYPE_HASH_OF_MAPS, -}; - -static int __init register_htab_map(void) -{ - bpf_register_map_type(&htab_type); - bpf_register_map_type(&htab_percpu_type); - bpf_register_map_type(&htab_lru_type); - bpf_register_map_type(&htab_lru_percpu_type); - bpf_register_map_type(&htab_of_map_type); - return 0; -} -late_initcall(register_htab_map); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index b37bd9ab7f57..39cfafd895b8 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -505,7 +505,7 @@ static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) return -ENOTSUPP; } -static const struct bpf_map_ops trie_ops = { +const struct bpf_map_ops trie_map_ops = { .map_alloc = trie_alloc, .map_free = trie_free, .map_get_next_key = trie_get_next_key, @@ -513,15 +513,3 @@ static const struct bpf_map_ops trie_ops = { .map_update_elem = trie_update_elem, .map_delete_elem = trie_delete_elem, }; - -static struct bpf_map_type_list trie_type __ro_after_init = { - .ops = &trie_ops, - .type = BPF_MAP_TYPE_LPM_TRIE, -}; - -static int __init register_trie_map(void) -{ - bpf_register_map_type(&trie_type); - return 0; -} -late_initcall(register_trie_map); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 22aa45cd0324..4dfd6f2ec2f9 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -264,7 +264,7 @@ static void stack_map_free(struct bpf_map *map) put_callchain_buffers(); } -static const struct bpf_map_ops stack_map_ops = { +const struct bpf_map_ops stack_map_ops = { .map_alloc = stack_map_alloc, .map_free = stack_map_free, .map_get_next_key = stack_map_get_next_key, @@ -272,15 +272,3 @@ static const struct bpf_map_ops stack_map_ops = { .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, }; - -static struct bpf_map_type_list stack_map_type __ro_after_init = { - .ops = &stack_map_ops, - .type = BPF_MAP_TYPE_STACK_TRACE, -}; - -static int __init register_stack_map(void) -{ - bpf_register_map_type(&stack_map_type); - return 0; -} -late_initcall(register_stack_map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ea55691cbf5e..b89288e2b589 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -27,30 +27,29 @@ DEFINE_PER_CPU(int, bpf_prog_active); int sysctl_unprivileged_bpf_disabled __read_mostly; -static LIST_HEAD(bpf_map_types); +static const struct bpf_map_ops * const bpf_map_types[] = { +#define BPF_PROG_TYPE(_id, _ops) +#define BPF_MAP_TYPE(_id, _ops) \ + [_id] = &_ops, +#include +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +}; static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { - struct bpf_map_type_list *tl; struct bpf_map *map; - 
list_for_each_entry(tl, &bpf_map_types, list_node) { - if (tl->type == attr->map_type) { - map = tl->ops->map_alloc(attr); - if (IS_ERR(map)) - return map; - map->ops = tl->ops; - map->map_type = attr->map_type; - return map; - } - } - return ERR_PTR(-EINVAL); -} + if (attr->map_type >= ARRAY_SIZE(bpf_map_types) || + !bpf_map_types[attr->map_type]) + return ERR_PTR(-EINVAL); -/* boot time registration of different map implementations */ -void bpf_register_map_type(struct bpf_map_type_list *tl) -{ - list_add(&tl->list_node, &bpf_map_types); + map = bpf_map_types[attr->map_type]->map_alloc(attr); + if (IS_ERR(map)) + return map; + map->ops = bpf_map_types[attr->map_type]; + map->map_type = attr->map_type; + return map; } void *bpf_map_area_alloc(size_t size) @@ -576,8 +575,10 @@ err_put: static const struct bpf_verifier_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _ops) \ [_id] = &_ops, +#define BPF_MAP_TYPE(_id, _ops) #include #undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE }; static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) -- cgit v1.2.3 From 5e8cb4033807e39849b753e5399ec130c0995f1f Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Mon, 10 Apr 2017 19:25:10 +0530 Subject: PCI: endpoint: Add EP core layer to enable EP controller and EP functions Introduce a new EP core layer in order to support endpoint functions in linux kernel. This comprises the EPC library (Endpoint Controller Library) and EPF library (Endpoint Function Library). EPC library implements functions specific to an endpoint controller and EPF library implements functions specific to an endpoint function. Signed-off-by: Kishon Vijay Abraham I Acked-by: Joao Pinto Signed-off-by: Bjorn Helgaas --- drivers/Makefile | 2 + drivers/pci/Kconfig | 1 + drivers/pci/endpoint/Kconfig | 20 ++ drivers/pci/endpoint/Makefile | 6 + drivers/pci/endpoint/pci-epc-core.c | 576 ++++++++++++++++++++++++++++++++++++ drivers/pci/endpoint/pci-epc-mem.c | 143 +++++++++ drivers/pci/endpoint/pci-epf-core.c | 355 ++++++++++++++++++++++ include/linux/mod_devicetable.h | 10 + include/linux/pci-epc.h | 142 +++++++++ include/linux/pci-epf.h | 160 ++++++++++ 10 files changed, 1415 insertions(+) create mode 100644 drivers/pci/endpoint/Kconfig create mode 100644 drivers/pci/endpoint/Makefile create mode 100644 drivers/pci/endpoint/pci-epc-core.c create mode 100644 drivers/pci/endpoint/pci-epc-mem.c create mode 100644 drivers/pci/endpoint/pci-epf-core.c create mode 100644 include/linux/pci-epc.h create mode 100644 include/linux/pci-epf.h (limited to 'include/linux') diff --git a/drivers/Makefile b/drivers/Makefile index 2eced9afba53..a5f8e43b2c4d 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -14,7 +14,9 @@ obj-$(CONFIG_GENERIC_PHY) += phy/ obj-$(CONFIG_PINCTRL) += pinctrl/ obj-$(CONFIG_GPIOLIB) += gpio/ obj-y += pwm/ + obj-$(CONFIG_PCI) += pci/ +obj-$(CONFIG_PCI_ENDPOINT) += pci/endpoint/ # PCI dwc controller drivers obj-y += pci/dwc/ diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index df141420c902..9747c1ec8c74 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -134,3 +134,4 @@ config PCI_HYPERV source "drivers/pci/hotplug/Kconfig" source "drivers/pci/dwc/Kconfig" source "drivers/pci/host/Kconfig" +source "drivers/pci/endpoint/Kconfig" diff --git a/drivers/pci/endpoint/Kconfig b/drivers/pci/endpoint/Kconfig new file mode 100644 index 000000000000..a5442ace7077 --- /dev/null +++ b/drivers/pci/endpoint/Kconfig @@ -0,0 +1,20 @@ +# +# PCI Endpoint Support +# + +menu "PCI Endpoint" + +config 
PCI_ENDPOINT + bool "PCI Endpoint Support" + help + Enable this configuration option to support configurable PCI + endpoint. This should be enabled if the platform has a PCI + controller that can operate in endpoint mode. + + Enabling this option will build the endpoint library, which + includes endpoint controller library and endpoint function + library. + + If in doubt, say "N" to disable Endpoint support. + +endmenu diff --git a/drivers/pci/endpoint/Makefile b/drivers/pci/endpoint/Makefile new file mode 100644 index 000000000000..dc1bc16491e6 --- /dev/null +++ b/drivers/pci/endpoint/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for PCI Endpoint Support +# + +obj-$(CONFIG_PCI_ENDPOINT) += pci-epc-core.o pci-epf-core.o\ + pci-epc-mem.o diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c new file mode 100644 index 000000000000..7c71dd94721c --- /dev/null +++ b/drivers/pci/endpoint/pci-epc-core.c @@ -0,0 +1,576 @@ +/** + * PCI Endpoint *Controller* (EPC) library + * + * Copyright (C) 2017 Texas Instruments + * Author: Kishon Vijay Abraham I + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 of + * the License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include + +#include +#include + +static struct class *pci_epc_class; + +static void devm_pci_epc_release(struct device *dev, void *res) +{ + struct pci_epc *epc = *(struct pci_epc **)res; + + pci_epc_destroy(epc); +} + +static int devm_pci_epc_match(struct device *dev, void *res, void *match_data) +{ + struct pci_epc **epc = res; + + return *epc == match_data; +} + +/** + * pci_epc_put() - release the PCI endpoint controller + * @epc: epc returned by pci_epc_get() + * + * release the refcount the caller obtained by invoking pci_epc_get() + */ +void pci_epc_put(struct pci_epc *epc) +{ + if (!epc || IS_ERR(epc)) + return; + + module_put(epc->ops->owner); + put_device(&epc->dev); +} +EXPORT_SYMBOL_GPL(pci_epc_put); + +/** + * pci_epc_get() - get the PCI endpoint controller + * @epc_name: device name of the endpoint controller + * + * Invoke to get struct pci_epc * corresponding to the device name of the + * endpoint controller + */ +struct pci_epc *pci_epc_get(const char *epc_name) +{ + int ret = -EINVAL; + struct pci_epc *epc; + struct device *dev; + struct class_dev_iter iter; + + class_dev_iter_init(&iter, pci_epc_class, NULL, NULL); + while ((dev = class_dev_iter_next(&iter))) { + if (strcmp(epc_name, dev_name(dev))) + continue; + + epc = to_pci_epc(dev); + if (!try_module_get(epc->ops->owner)) { + ret = -EINVAL; + goto err; + } + + class_dev_iter_exit(&iter); + get_device(&epc->dev); + return epc; + } + +err: + class_dev_iter_exit(&iter); + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(pci_epc_get); + +/** + * pci_epc_stop() - stop the PCI link + * @epc: the link of the EPC device that has to be stopped + * + * Invoke to stop the PCI link + */ +void pci_epc_stop(struct pci_epc *epc) +{ + unsigned long flags; + + if (IS_ERR(epc) || !epc->ops->stop) + return; + + spin_lock_irqsave(&epc->lock, flags); + epc->ops->stop(epc); 
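+	/* all EPC ops, ->stop() included, are serialised by epc->lock */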
+ spin_unlock_irqrestore(&epc->lock, flags); +} +EXPORT_SYMBOL_GPL(pci_epc_stop); + +/** + * pci_epc_start() - start the PCI link + * @epc: the link of *this* EPC device has to be started + * + * Invoke to start the PCI link + */ +int pci_epc_start(struct pci_epc *epc) +{ + int ret; + unsigned long flags; + + if (IS_ERR(epc)) + return -EINVAL; + + if (!epc->ops->start) + return 0; + + spin_lock_irqsave(&epc->lock, flags); + ret = epc->ops->start(epc); + spin_unlock_irqrestore(&epc->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(pci_epc_start); + +/** + * pci_epc_raise_irq() - interrupt the host system + * @epc: the EPC device which has to interrupt the host + * @type: specify the type of interrupt; legacy or MSI + * @interrupt_num: the MSI interrupt number + * + * Invoke to raise an MSI or legacy interrupt + */ +int pci_epc_raise_irq(struct pci_epc *epc, enum pci_epc_irq_type type, + u8 interrupt_num) +{ + int ret; + unsigned long flags; + + if (IS_ERR(epc)) + return -EINVAL; + + if (!epc->ops->raise_irq) + return 0; + + spin_lock_irqsave(&epc->lock, flags); + ret = epc->ops->raise_irq(epc, type, interrupt_num); + spin_unlock_irqrestore(&epc->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(pci_epc_raise_irq); + +/** + * pci_epc_get_msi() - get the number of MSI interrupt numbers allocated + * @epc: the EPC device to which MSI interrupts was requested + * + * Invoke to get the number of MSI interrupts allocated by the RC + */ +int pci_epc_get_msi(struct pci_epc *epc) +{ + int interrupt; + unsigned long flags; + + if (IS_ERR(epc)) + return 0; + + if (!epc->ops->get_msi) + return 0; + + spin_lock_irqsave(&epc->lock, flags); + interrupt = epc->ops->get_msi(epc); + spin_unlock_irqrestore(&epc->lock, flags); + + if (interrupt < 0) + return 0; + + interrupt = 1 << interrupt; + + return interrupt; +} +EXPORT_SYMBOL_GPL(pci_epc_get_msi); + +/** + * pci_epc_set_msi() - set the number of MSI interrupt numbers required + * @epc: the EPC device on which MSI has to be configured + * @interrupts: number of MSI interrupts required by the EPF + * + * Invoke to set the required number of MSI interrupts. + */ +int pci_epc_set_msi(struct pci_epc *epc, u8 interrupts) +{ + int ret; + u8 encode_int; + unsigned long flags; + + if (IS_ERR(epc)) + return -EINVAL; + + if (!epc->ops->set_msi) + return 0; + + encode_int = order_base_2(interrupts); + + spin_lock_irqsave(&epc->lock, flags); + ret = epc->ops->set_msi(epc, encode_int); + spin_unlock_irqrestore(&epc->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(pci_epc_set_msi); + +/** + * pci_epc_unmap_addr() - unmap CPU address from PCI address + * @epc: the EPC device on which address is allocated + * @phys_addr: physical address of the local system + * + * Invoke to unmap the CPU address from PCI address. + */ +void pci_epc_unmap_addr(struct pci_epc *epc, phys_addr_t phys_addr) +{ + unsigned long flags; + + if (IS_ERR(epc)) + return; + + if (!epc->ops->unmap_addr) + return; + + spin_lock_irqsave(&epc->lock, flags); + epc->ops->unmap_addr(epc, phys_addr); + spin_unlock_irqrestore(&epc->lock, flags); +} +EXPORT_SYMBOL_GPL(pci_epc_unmap_addr); + +/** + * pci_epc_map_addr() - map CPU address to PCI address + * @epc: the EPC device on which address is allocated + * @phys_addr: physical address of the local system + * @pci_addr: PCI address to which the physical address should be mapped + * @size: the size of the allocation + * + * Invoke to map CPU address with PCI address. 
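+ *
+ * A function driver typically pairs this with pci_epc_mem_alloc_addr():
+ * first reserve a chunk of the controller's address space, then map it to
+ * the PCI address of interest. A minimal sketch, where host_addr stands in
+ * for whatever PCI address the host side assigned (error handling omitted):
+ *
+ *	base = pci_epc_mem_alloc_addr(epc, &phys, SZ_4K);
+ *	ret = pci_epc_map_addr(epc, phys, host_addr, SZ_4K);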
+ */ +int pci_epc_map_addr(struct pci_epc *epc, phys_addr_t phys_addr, + u64 pci_addr, size_t size) +{ + int ret; + unsigned long flags; + + if (IS_ERR(epc)) + return -EINVAL; + + if (!epc->ops->map_addr) + return 0; + + spin_lock_irqsave(&epc->lock, flags); + ret = epc->ops->map_addr(epc, phys_addr, pci_addr, size); + spin_unlock_irqrestore(&epc->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(pci_epc_map_addr); + +/** + * pci_epc_clear_bar() - reset the BAR + * @epc: the EPC device for which the BAR has to be cleared + * @bar: the BAR number that has to be reset + * + * Invoke to reset the BAR of the endpoint device. + */ +void pci_epc_clear_bar(struct pci_epc *epc, int bar) +{ + unsigned long flags; + + if (IS_ERR(epc)) + return; + + if (!epc->ops->clear_bar) + return; + + spin_lock_irqsave(&epc->lock, flags); + epc->ops->clear_bar(epc, bar); + spin_unlock_irqrestore(&epc->lock, flags); +} +EXPORT_SYMBOL_GPL(pci_epc_clear_bar); + +/** + * pci_epc_set_bar() - configure BAR in order for host to assign PCI addr space + * @epc: the EPC device on which BAR has to be configured + * @bar: the BAR number that has to be configured + * @size: the size of the addr space + * @flags: specify memory allocation/io allocation/32bit address/64 bit address + * + * Invoke to configure the BAR of the endpoint device. + */ +int pci_epc_set_bar(struct pci_epc *epc, enum pci_barno bar, + dma_addr_t bar_phys, size_t size, int flags) +{ + int ret; + unsigned long irq_flags; + + if (IS_ERR(epc)) + return -EINVAL; + + if (!epc->ops->set_bar) + return 0; + + spin_lock_irqsave(&epc->lock, irq_flags); + ret = epc->ops->set_bar(epc, bar, bar_phys, size, flags); + spin_unlock_irqrestore(&epc->lock, irq_flags); + + return ret; +} +EXPORT_SYMBOL_GPL(pci_epc_set_bar); + +/** + * pci_epc_write_header() - write standard configuration header + * @epc: the EPC device to which the configuration header should be written + * @header: standard configuration header fields + * + * Invoke to write the configuration header to the endpoint controller. Every + * endpoint controller will have a dedicated location to which the standard + * configuration header would be written. The callback function should write + * the header fields to this dedicated location. + */ +int pci_epc_write_header(struct pci_epc *epc, struct pci_epf_header *header) +{ + int ret; + unsigned long flags; + + if (IS_ERR(epc)) + return -EINVAL; + + if (!epc->ops->write_header) + return 0; + + spin_lock_irqsave(&epc->lock, flags); + ret = epc->ops->write_header(epc, header); + spin_unlock_irqrestore(&epc->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(pci_epc_write_header); + +/** + * pci_epc_add_epf() - bind PCI endpoint function to an endpoint controller + * @epc: the EPC device to which the endpoint function should be added + * @epf: the endpoint function to be added + * + * A PCI endpoint device can have one or more functions. In the case of PCIe, + * the specification allows up to 8 PCIe endpoint functions. Invoke + * pci_epc_add_epf() to add a PCI endpoint function to an endpoint controller. 
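+ *
+ * Returns -EBUSY if @epf is already bound to an EPC and -EINVAL if
+ * @epf->func_no does not fit within @epc->max_functions. A minimal sketch
+ * of the usual sequence ("my-epc" is a placeholder controller name):
+ *
+ *	epc = pci_epc_get("my-epc");
+ *	ret = pci_epc_add_epf(epc, epf);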
+ */ +int pci_epc_add_epf(struct pci_epc *epc, struct pci_epf *epf) +{ + unsigned long flags; + + if (epf->epc) + return -EBUSY; + + if (IS_ERR(epc)) + return -EINVAL; + + if (epf->func_no > epc->max_functions - 1) + return -EINVAL; + + epf->epc = epc; + dma_set_coherent_mask(&epf->dev, epc->dev.coherent_dma_mask); + epf->dev.dma_mask = epc->dev.dma_mask; + + spin_lock_irqsave(&epc->lock, flags); + list_add_tail(&epf->list, &epc->pci_epf); + spin_unlock_irqrestore(&epc->lock, flags); + + return 0; +} +EXPORT_SYMBOL_GPL(pci_epc_add_epf); + +/** + * pci_epc_remove_epf() - remove PCI endpoint function from endpoint controller + * @epc: the EPC device from which the endpoint function should be removed + * @epf: the endpoint function to be removed + * + * Invoke to remove PCI endpoint function from the endpoint controller. + */ +void pci_epc_remove_epf(struct pci_epc *epc, struct pci_epf *epf) +{ + unsigned long flags; + + if (!epc || IS_ERR(epc)) + return; + + spin_lock_irqsave(&epc->lock, flags); + list_del(&epf->list); + spin_unlock_irqrestore(&epc->lock, flags); +} +EXPORT_SYMBOL_GPL(pci_epc_remove_epf); + +/** + * pci_epc_linkup() - Notify the EPF device that EPC device has established a + * connection with the Root Complex. + * @epc: the EPC device which has established link with the host + * + * Invoke to Notify the EPF device that the EPC device has established a + * connection with the Root Complex. + */ +void pci_epc_linkup(struct pci_epc *epc) +{ + unsigned long flags; + struct pci_epf *epf; + + if (!epc || IS_ERR(epc)) + return; + + spin_lock_irqsave(&epc->lock, flags); + list_for_each_entry(epf, &epc->pci_epf, list) + pci_epf_linkup(epf); + spin_unlock_irqrestore(&epc->lock, flags); +} +EXPORT_SYMBOL_GPL(pci_epc_linkup); + +/** + * pci_epc_destroy() - destroy the EPC device + * @epc: the EPC device that has to be destroyed + * + * Invoke to destroy the PCI EPC device + */ +void pci_epc_destroy(struct pci_epc *epc) +{ + device_unregister(&epc->dev); + kfree(epc); +} +EXPORT_SYMBOL_GPL(pci_epc_destroy); + +/** + * devm_pci_epc_destroy() - destroy the EPC device + * @dev: device that wants to destroy the EPC + * @epc: the EPC device that has to be destroyed + * + * Invoke to destroy the devres associated with this + * pci_epc and destroy the EPC device. + */ +void devm_pci_epc_destroy(struct device *dev, struct pci_epc *epc) +{ + int r; + + r = devres_destroy(dev, devm_pci_epc_release, devm_pci_epc_match, + epc); + dev_WARN_ONCE(dev, r, "couldn't find PCI EPC resource\n"); +} +EXPORT_SYMBOL_GPL(devm_pci_epc_destroy); + +/** + * __pci_epc_create() - create a new endpoint controller (EPC) device + * @dev: device that is creating the new EPC + * @ops: function pointers for performing EPC operations + * @owner: the owner of the module that creates the EPC device + * + * Invoke to create a new EPC device and add it to pci_epc class. 
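+ *
+ * Controller drivers are expected to go through the pci_epc_create() or
+ * devm_pci_epc_create() wrappers, which supply THIS_MODULE as @owner, e.g.
+ * (my_epc_ops being the driver's own struct pci_epc_ops):
+ *
+ *	epc = devm_pci_epc_create(dev, &my_epc_ops);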
+ */ +struct pci_epc * +__pci_epc_create(struct device *dev, const struct pci_epc_ops *ops, + struct module *owner) +{ + int ret; + struct pci_epc *epc; + + if (WARN_ON(!dev)) { + ret = -EINVAL; + goto err_ret; + } + + epc = kzalloc(sizeof(*epc), GFP_KERNEL); + if (!epc) { + ret = -ENOMEM; + goto err_ret; + } + + spin_lock_init(&epc->lock); + INIT_LIST_HEAD(&epc->pci_epf); + + device_initialize(&epc->dev); + dma_set_coherent_mask(&epc->dev, dev->coherent_dma_mask); + epc->dev.class = pci_epc_class; + epc->dev.dma_mask = dev->dma_mask; + epc->ops = ops; + + ret = dev_set_name(&epc->dev, "%s", dev_name(dev)); + if (ret) + goto put_dev; + + ret = device_add(&epc->dev); + if (ret) + goto put_dev; + + return epc; + +put_dev: + put_device(&epc->dev); + kfree(epc); + +err_ret: + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(__pci_epc_create); + +/** + * __devm_pci_epc_create() - create a new endpoint controller (EPC) device + * @dev: device that is creating the new EPC + * @ops: function pointers for performing EPC operations + * @owner: the owner of the module that creates the EPC device + * + * Invoke to create a new EPC device and add it to pci_epc class. + * While at that, it also associates the device with the pci_epc using devres. + * On driver detach, release function is invoked on the devres data, + * then, devres data is freed. + */ +struct pci_epc * +__devm_pci_epc_create(struct device *dev, const struct pci_epc_ops *ops, + struct module *owner) +{ + struct pci_epc **ptr, *epc; + + ptr = devres_alloc(devm_pci_epc_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return ERR_PTR(-ENOMEM); + + epc = __pci_epc_create(dev, ops, owner); + if (!IS_ERR(epc)) { + *ptr = epc; + devres_add(dev, ptr); + } else { + devres_free(ptr); + } + + return epc; +} +EXPORT_SYMBOL_GPL(__devm_pci_epc_create); + +static int __init pci_epc_init(void) +{ + pci_epc_class = class_create(THIS_MODULE, "pci_epc"); + if (IS_ERR(pci_epc_class)) { + pr_err("failed to create pci epc class --> %ld\n", + PTR_ERR(pci_epc_class)); + return PTR_ERR(pci_epc_class); + } + + return 0; +} +module_init(pci_epc_init); + +static void __exit pci_epc_exit(void) +{ + class_destroy(pci_epc_class); +} +module_exit(pci_epc_exit); + +MODULE_DESCRIPTION("PCI EPC Library"); +MODULE_AUTHOR("Kishon Vijay Abraham I "); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/pci/endpoint/pci-epc-mem.c b/drivers/pci/endpoint/pci-epc-mem.c new file mode 100644 index 000000000000..3a94cc1caf22 --- /dev/null +++ b/drivers/pci/endpoint/pci-epc-mem.c @@ -0,0 +1,143 @@ +/** + * PCI Endpoint *Controller* Address Space Management + * + * Copyright (C) 2017 Texas Instruments + * Author: Kishon Vijay Abraham I + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 of + * the License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#include +#include +#include + +#include + +/** + * pci_epc_mem_init() - initialize the pci_epc_mem structure + * @epc: the EPC device that invoked pci_epc_mem_init + * @phys_base: the physical address of the base + * @size: the size of the address space + * + * Invoke to initialize the pci_epc_mem structure used by the + * endpoint functions to allocate mapped PCI address. + */ +int pci_epc_mem_init(struct pci_epc *epc, phys_addr_t phys_base, size_t size) +{ + int ret; + struct pci_epc_mem *mem; + unsigned long *bitmap; + int pages = size >> PAGE_SHIFT; + int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); + + mem = kzalloc(sizeof(*mem), GFP_KERNEL); + if (!mem) { + ret = -ENOMEM; + goto err; + } + + bitmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!bitmap) { + ret = -ENOMEM; + goto err_mem; + } + + mem->bitmap = bitmap; + mem->phys_base = phys_base; + mem->pages = pages; + mem->size = size; + + epc->mem = mem; + + return 0; + +err_mem: + kfree(mem); + +err: +return ret; +} +EXPORT_SYMBOL_GPL(pci_epc_mem_init); + +/** + * pci_epc_mem_exit() - cleanup the pci_epc_mem structure + * @epc: the EPC device that invoked pci_epc_mem_exit + * + * Invoke to cleanup the pci_epc_mem structure allocated in + * pci_epc_mem_init(). + */ +void pci_epc_mem_exit(struct pci_epc *epc) +{ + struct pci_epc_mem *mem = epc->mem; + + epc->mem = NULL; + kfree(mem->bitmap); + kfree(mem); +} +EXPORT_SYMBOL_GPL(pci_epc_mem_exit); + +/** + * pci_epc_mem_alloc_addr() - allocate memory address from EPC addr space + * @epc: the EPC device on which memory has to be allocated + * @phys_addr: populate the allocated physical address here + * @size: the size of the address space that has to be allocated + * + * Invoke to allocate memory address from the EPC address space. This + * is usually done to map the remote RC address into the local system. + */ +void __iomem *pci_epc_mem_alloc_addr(struct pci_epc *epc, + phys_addr_t *phys_addr, size_t size) +{ + int pageno; + void __iomem *virt_addr; + struct pci_epc_mem *mem = epc->mem; + int order = get_order(size); + + pageno = bitmap_find_free_region(mem->bitmap, mem->pages, order); + if (pageno < 0) + return NULL; + + *phys_addr = mem->phys_base + (pageno << PAGE_SHIFT); + virt_addr = ioremap(*phys_addr, size); + if (!virt_addr) + bitmap_release_region(mem->bitmap, pageno, order); + + return virt_addr; +} +EXPORT_SYMBOL_GPL(pci_epc_mem_alloc_addr); + +/** + * pci_epc_mem_free_addr() - free the allocated memory address + * @epc: the EPC device on which memory was allocated + * @phys_addr: the allocated physical address + * @virt_addr: virtual address of the allocated mem space + * @size: the size of the allocated address space + * + * Invoke to free the memory allocated using pci_epc_mem_alloc_addr. 
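+ *
+ * @phys_addr, @virt_addr and @size must be the values returned by and
+ * passed to the matching pci_epc_mem_alloc_addr() call; the region is
+ * iounmap()ed and its pages are released back to the bitmap.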
+ */ +void pci_epc_mem_free_addr(struct pci_epc *epc, phys_addr_t phys_addr, + void __iomem *virt_addr, size_t size) +{ + int pageno; + int order = get_order(size); + struct pci_epc_mem *mem = epc->mem; + + iounmap(virt_addr); + pageno = (phys_addr - mem->phys_base) >> PAGE_SHIFT; + bitmap_release_region(mem->bitmap, pageno, order); +} +EXPORT_SYMBOL_GPL(pci_epc_mem_free_addr); + +MODULE_DESCRIPTION("PCI EPC Address Space Management"); +MODULE_AUTHOR("Kishon Vijay Abraham I "); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c new file mode 100644 index 000000000000..a281c599a504 --- /dev/null +++ b/drivers/pci/endpoint/pci-epf-core.c @@ -0,0 +1,355 @@ +/** + * PCI Endpoint *Function* (EPF) library + * + * Copyright (C) 2017 Texas Instruments + * Author: Kishon Vijay Abraham I + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 of + * the License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include + +#include +#include + +static struct bus_type pci_epf_bus_type; +static struct device_type pci_epf_type; + +/** + * pci_epf_linkup() - Notify the function driver that EPC device has + * established a connection with the Root Complex. + * @epf: the EPF device bound to the EPC device which has established + * the connection with the host + * + * Invoke to notify the function driver that EPC device has established + * a connection with the Root Complex. + */ +void pci_epf_linkup(struct pci_epf *epf) +{ + if (!epf->driver) { + dev_WARN(&epf->dev, "epf device not bound to driver\n"); + return; + } + + epf->driver->ops->linkup(epf); +} +EXPORT_SYMBOL_GPL(pci_epf_linkup); + +/** + * pci_epf_unbind() - Notify the function driver that the binding between the + * EPF device and EPC device has been lost + * @epf: the EPF device which has lost the binding with the EPC device + * + * Invoke to notify the function driver that the binding between the EPF device + * and EPC device has been lost. + */ +void pci_epf_unbind(struct pci_epf *epf) +{ + if (!epf->driver) { + dev_WARN(&epf->dev, "epf device not bound to driver\n"); + return; + } + + epf->driver->ops->unbind(epf); + module_put(epf->driver->owner); +} +EXPORT_SYMBOL_GPL(pci_epf_unbind); + +/** + * pci_epf_bind() - Notify the function driver that the EPF device has been + * bound to a EPC device + * @epf: the EPF device which has been bound to the EPC device + * + * Invoke to notify the function driver that it has been bound to a EPC device + */ +int pci_epf_bind(struct pci_epf *epf) +{ + if (!epf->driver) { + dev_WARN(&epf->dev, "epf device not bound to driver\n"); + return -EINVAL; + } + + if (!try_module_get(epf->driver->owner)) + return -EAGAIN; + + return epf->driver->ops->bind(epf); +} +EXPORT_SYMBOL_GPL(pci_epf_bind); + +/** + * pci_epf_free_space() - free the allocated PCI EPF register space + * @addr: the virtual address of the PCI EPF register space + * @bar: the BAR number corresponding to the register space + * + * Invoke to free the allocated PCI EPF register space. 
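+ *
+ * Pairs with pci_epf_alloc_space(); pass the same @bar so that the
+ * phys_addr and size cached in epf->bar[bar] are cleared as well.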
+ */ +void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar) +{ + struct device *dev = &epf->dev; + + if (!addr) + return; + + dma_free_coherent(dev, epf->bar[bar].size, addr, + epf->bar[bar].phys_addr); + + epf->bar[bar].phys_addr = 0; + epf->bar[bar].size = 0; +} +EXPORT_SYMBOL_GPL(pci_epf_free_space); + +/** + * pci_epf_alloc_space() - allocate memory for the PCI EPF register space + * @size: the size of the memory that has to be allocated + * @bar: the BAR number corresponding to the allocated register space + * + * Invoke to allocate memory for the PCI EPF register space. + */ +void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar) +{ + void *space; + struct device *dev = &epf->dev; + dma_addr_t phys_addr; + + if (size < 128) + size = 128; + size = roundup_pow_of_two(size); + + space = dma_alloc_coherent(dev, size, &phys_addr, GFP_KERNEL); + if (!space) { + dev_err(dev, "failed to allocate mem space\n"); + return NULL; + } + + epf->bar[bar].phys_addr = phys_addr; + epf->bar[bar].size = size; + + return space; +} +EXPORT_SYMBOL_GPL(pci_epf_alloc_space); + +/** + * pci_epf_unregister_driver() - unregister the PCI EPF driver + * @driver: the PCI EPF driver that has to be unregistered + * + * Invoke to unregister the PCI EPF driver. + */ +void pci_epf_unregister_driver(struct pci_epf_driver *driver) +{ + driver_unregister(&driver->driver); +} +EXPORT_SYMBOL_GPL(pci_epf_unregister_driver); + +/** + * __pci_epf_register_driver() - register a new PCI EPF driver + * @driver: structure representing PCI EPF driver + * @owner: the owner of the module that registers the PCI EPF driver + * + * Invoke to register a new PCI EPF driver. + */ +int __pci_epf_register_driver(struct pci_epf_driver *driver, + struct module *owner) +{ + int ret; + + if (!driver->ops) + return -EINVAL; + + if (!driver->ops->bind || !driver->ops->unbind || !driver->ops->linkup) + return -EINVAL; + + driver->driver.bus = &pci_epf_bus_type; + driver->driver.owner = owner; + + ret = driver_register(&driver->driver); + if (ret) + return ret; + + return 0; +} +EXPORT_SYMBOL_GPL(__pci_epf_register_driver); + +/** + * pci_epf_destroy() - destroy the created PCI EPF device + * @epf: the PCI EPF device that has to be destroyed. + * + * Invoke to destroy the PCI EPF device created by invoking pci_epf_create(). + */ +void pci_epf_destroy(struct pci_epf *epf) +{ + device_unregister(&epf->dev); +} +EXPORT_SYMBOL_GPL(pci_epf_destroy); + +/** + * pci_epf_create() - create a new PCI EPF device + * @name: the name of the PCI EPF device. This name will be used to bind the + * the EPF device to a EPF driver + * + * Invoke to create a new PCI EPF device by providing the name of the function + * device. 
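+ *
+ * Anything after the first '.' in @name only keeps the device name unique;
+ * the part before it is what gets matched against function drivers. For
+ * instance (illustrative):
+ *
+ *	epf = pci_epf_create("pci_epf_test.0");
+ *
+ * creates a device named "pci_epf_test.0" that binds to a driver
+ * registered for "pci_epf_test".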
+ */ +struct pci_epf *pci_epf_create(const char *name) +{ + int ret; + struct pci_epf *epf; + struct device *dev; + char *func_name; + char *buf; + + epf = kzalloc(sizeof(*epf), GFP_KERNEL); + if (!epf) { + ret = -ENOMEM; + goto err_ret; + } + + buf = kstrdup(name, GFP_KERNEL); + if (!buf) { + ret = -ENOMEM; + goto free_epf; + } + + func_name = buf; + buf = strchrnul(buf, '.'); + *buf = '\0'; + + epf->name = kstrdup(func_name, GFP_KERNEL); + if (!epf->name) { + ret = -ENOMEM; + goto free_func_name; + } + + dev = &epf->dev; + device_initialize(dev); + dev->bus = &pci_epf_bus_type; + dev->type = &pci_epf_type; + + ret = dev_set_name(dev, "%s", name); + if (ret) + goto put_dev; + + ret = device_add(dev); + if (ret) + goto put_dev; + + kfree(func_name); + return epf; + +put_dev: + put_device(dev); + kfree(epf->name); + +free_func_name: + kfree(func_name); + +free_epf: + kfree(epf); + +err_ret: + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(pci_epf_create); + +static void pci_epf_dev_release(struct device *dev) +{ + struct pci_epf *epf = to_pci_epf(dev); + + kfree(epf->name); + kfree(epf); +} + +static struct device_type pci_epf_type = { + .release = pci_epf_dev_release, +}; + +static int +pci_epf_match_id(const struct pci_epf_device_id *id, const struct pci_epf *epf) +{ + while (id->name[0]) { + if (strcmp(epf->name, id->name) == 0) + return true; + id++; + } + + return false; +} + +static int pci_epf_device_match(struct device *dev, struct device_driver *drv) +{ + struct pci_epf *epf = to_pci_epf(dev); + struct pci_epf_driver *driver = to_pci_epf_driver(drv); + + if (driver->id_table) + return pci_epf_match_id(driver->id_table, epf); + + return !strcmp(epf->name, drv->name); +} + +static int pci_epf_device_probe(struct device *dev) +{ + struct pci_epf *epf = to_pci_epf(dev); + struct pci_epf_driver *driver = to_pci_epf_driver(dev->driver); + + if (!driver->probe) + return -ENODEV; + + epf->driver = driver; + + return driver->probe(epf); +} + +static int pci_epf_device_remove(struct device *dev) +{ + int ret; + struct pci_epf *epf = to_pci_epf(dev); + struct pci_epf_driver *driver = to_pci_epf_driver(dev->driver); + + ret = driver->remove(epf); + epf->driver = NULL; + + return ret; +} + +static struct bus_type pci_epf_bus_type = { + .name = "pci-epf", + .match = pci_epf_device_match, + .probe = pci_epf_device_probe, + .remove = pci_epf_device_remove, +}; + +static int __init pci_epf_init(void) +{ + int ret; + + ret = bus_register(&pci_epf_bus_type); + if (ret) { + pr_err("failed to register pci epf bus --> %d\n", ret); + return ret; + } + + return 0; +} +module_init(pci_epf_init); + +static void __exit pci_epf_exit(void) +{ + bus_unregister(&pci_epf_bus_type); +} +module_exit(pci_epf_exit); + +MODULE_DESCRIPTION("PCI EPF Library"); +MODULE_AUTHOR("Kishon Vijay Abraham I "); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 8850fcaf50db..566fda587fcf 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -428,6 +428,16 @@ struct i2c_device_id { kernel_ulong_t driver_data; /* Data private to the driver */ }; +/* pci_epf */ + +#define PCI_EPF_NAME_SIZE 20 +#define PCI_EPF_MODULE_PREFIX "pci_epf:" + +struct pci_epf_device_id { + char name[PCI_EPF_NAME_SIZE]; + kernel_ulong_t driver_data; +}; + /* spi */ #define SPI_NAME_SIZE 32 diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h new file mode 100644 index 000000000000..8c63d3c37f76 --- /dev/null +++ b/include/linux/pci-epc.h @@ -0,0 +1,142 @@ +/** 
+ * PCI Endpoint *Controller* (EPC) header file + * + * Copyright (C) 2017 Texas Instruments + * Author: Kishon Vijay Abraham I + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 of + * the License as published by the Free Software Foundation. + */ + +#ifndef __LINUX_PCI_EPC_H +#define __LINUX_PCI_EPC_H + +#include + +struct pci_epc; + +enum pci_epc_irq_type { + PCI_EPC_IRQ_UNKNOWN, + PCI_EPC_IRQ_LEGACY, + PCI_EPC_IRQ_MSI, +}; + +/** + * struct pci_epc_ops - set of function pointers for performing EPC operations + * @write_header: ops to populate configuration space header + * @set_bar: ops to configure the BAR + * @clear_bar: ops to reset the BAR + * @map_addr: ops to map CPU address to PCI address + * @unmap_addr: ops to unmap CPU address and PCI address + * @set_msi: ops to set the requested number of MSI interrupts in the MSI + * capability register + * @get_msi: ops to get the number of MSI interrupts allocated by the RC from + * the MSI capability register + * @raise_irq: ops to raise a legacy or MSI interrupt + * @start: ops to start the PCI link + * @stop: ops to stop the PCI link + * @owner: the module owner containing the ops + */ +struct pci_epc_ops { + int (*write_header)(struct pci_epc *pci_epc, + struct pci_epf_header *hdr); + int (*set_bar)(struct pci_epc *epc, enum pci_barno bar, + dma_addr_t bar_phys, size_t size, int flags); + void (*clear_bar)(struct pci_epc *epc, enum pci_barno bar); + int (*map_addr)(struct pci_epc *epc, phys_addr_t addr, + u64 pci_addr, size_t size); + void (*unmap_addr)(struct pci_epc *epc, phys_addr_t addr); + int (*set_msi)(struct pci_epc *epc, u8 interrupts); + int (*get_msi)(struct pci_epc *epc); + int (*raise_irq)(struct pci_epc *pci_epc, + enum pci_epc_irq_type type, u8 interrupt_num); + int (*start)(struct pci_epc *epc); + void (*stop)(struct pci_epc *epc); + struct module *owner; +}; + +/** + * struct pci_epc_mem - address space of the endpoint controller + * @phys_base: physical base address of the PCI address space + * @size: the size of the PCI address space + * @bitmap: bitmap to manage the PCI address space + * @pages: number of bits representing the address region + */ +struct pci_epc_mem { + phys_addr_t phys_base; + size_t size; + unsigned long *bitmap; + int pages; +}; + +/** + * struct pci_epc - represents the PCI EPC device + * @dev: PCI EPC device + * @pci_epf: list of endpoint functions present in this EPC device + * @ops: function pointers for performing endpoint operations + * @mem: address space of the endpoint controller + * @max_functions: max number of functions that can be configured in this EPC + * @lock: spinlock to protect pci_epc ops + */ +struct pci_epc { + struct device dev; + struct list_head pci_epf; + const struct pci_epc_ops *ops; + struct pci_epc_mem *mem; + u8 max_functions; + /* spinlock to protect against concurrent access of EP controller */ + spinlock_t lock; +}; + +#define to_pci_epc(device) container_of((device), struct pci_epc, dev) + +#define pci_epc_create(dev, ops) \ + __pci_epc_create((dev), (ops), THIS_MODULE) +#define devm_pci_epc_create(dev, ops) \ + __devm_pci_epc_create((dev), (ops), THIS_MODULE) + +static inline void epc_set_drvdata(struct pci_epc *epc, void *data) +{ + dev_set_drvdata(&epc->dev, data); +} + +static inline void *epc_get_drvdata(struct pci_epc *epc) +{ + return dev_get_drvdata(&epc->dev); +} + +struct pci_epc * +__devm_pci_epc_create(struct device *dev, const struct pci_epc_ops *ops, + 
struct module *owner); +struct pci_epc * +__pci_epc_create(struct device *dev, const struct pci_epc_ops *ops, + struct module *owner); +void devm_pci_epc_destroy(struct device *dev, struct pci_epc *epc); +void pci_epc_destroy(struct pci_epc *epc); +int pci_epc_add_epf(struct pci_epc *epc, struct pci_epf *epf); +void pci_epc_linkup(struct pci_epc *epc); +void pci_epc_remove_epf(struct pci_epc *epc, struct pci_epf *epf); +int pci_epc_write_header(struct pci_epc *epc, struct pci_epf_header *hdr); +int pci_epc_set_bar(struct pci_epc *epc, enum pci_barno bar, + dma_addr_t bar_phys, size_t size, int flags); +void pci_epc_clear_bar(struct pci_epc *epc, int bar); +int pci_epc_map_addr(struct pci_epc *epc, phys_addr_t phys_addr, + u64 pci_addr, size_t size); +void pci_epc_unmap_addr(struct pci_epc *epc, phys_addr_t phys_addr); +int pci_epc_set_msi(struct pci_epc *epc, u8 interrupts); +int pci_epc_get_msi(struct pci_epc *epc); +int pci_epc_raise_irq(struct pci_epc *epc, enum pci_epc_irq_type type, + u8 interrupt_num); +int pci_epc_start(struct pci_epc *epc); +void pci_epc_stop(struct pci_epc *epc); +struct pci_epc *pci_epc_get(const char *epc_name); +void pci_epc_put(struct pci_epc *epc); + +int pci_epc_mem_init(struct pci_epc *epc, phys_addr_t phys_addr, size_t size); +void pci_epc_mem_exit(struct pci_epc *epc); +void __iomem *pci_epc_mem_alloc_addr(struct pci_epc *epc, + phys_addr_t *phys_addr, size_t size); +void pci_epc_mem_free_addr(struct pci_epc *epc, phys_addr_t phys_addr, + void __iomem *virt_addr, size_t size); +#endif /* __LINUX_PCI_EPC_H */ diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h new file mode 100644 index 000000000000..5628714f7bcf --- /dev/null +++ b/include/linux/pci-epf.h @@ -0,0 +1,160 @@ +/** + * PCI Endpoint *Function* (EPF) header file + * + * Copyright (C) 2017 Texas Instruments + * Author: Kishon Vijay Abraham I + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 of + * the License as published by the Free Software Foundation. 
+ */ + +#ifndef __LINUX_PCI_EPF_H +#define __LINUX_PCI_EPF_H + +#include +#include + +struct pci_epf; + +enum pci_interrupt_pin { + PCI_INTERRUPT_UNKNOWN, + PCI_INTERRUPT_INTA, + PCI_INTERRUPT_INTB, + PCI_INTERRUPT_INTC, + PCI_INTERRUPT_INTD, +}; + +enum pci_barno { + BAR_0, + BAR_1, + BAR_2, + BAR_3, + BAR_4, + BAR_5, +}; + +/** + * struct pci_epf_header - represents standard configuration header + * @vendorid: identifies device manufacturer + * @deviceid: identifies a particular device + * @revid: specifies a device-specific revision identifier + * @progif_code: identifies a specific register-level programming interface + * @subclass_code: identifies more specifically the function of the device + * @baseclass_code: broadly classifies the type of function the device performs + * @cache_line_size: specifies the system cacheline size in units of DWORDs + * @subsys_vendor_id: vendor of the add-in card or subsystem + * @subsys_id: id specific to vendor + * @interrupt_pin: interrupt pin the device (or device function) uses + */ +struct pci_epf_header { + u16 vendorid; + u16 deviceid; + u8 revid; + u8 progif_code; + u8 subclass_code; + u8 baseclass_code; + u8 cache_line_size; + u16 subsys_vendor_id; + u16 subsys_id; + enum pci_interrupt_pin interrupt_pin; +}; + +/** + * struct pci_epf_ops - set of function pointers for performing EPF operations + * @bind: ops to perform when a EPC device has been bound to EPF device + * @unbind: ops to perform when a binding has been lost between a EPC device + * and EPF device + * @linkup: ops to perform when the EPC device has established a connection with + * a host system + */ +struct pci_epf_ops { + int (*bind)(struct pci_epf *epf); + void (*unbind)(struct pci_epf *epf); + void (*linkup)(struct pci_epf *epf); +}; + +/** + * struct pci_epf_driver - represents the PCI EPF driver + * @probe: ops to perform when a new EPF device has been bound to the EPF driver + * @remove: ops to perform when the binding between the EPF device and EPF + * driver is broken + * @driver: PCI EPF driver + * @ops: set of function pointers for performing EPF operations + * @owner: the owner of the module that registers the PCI EPF driver + * @id_table: identifies EPF devices for probing + */ +struct pci_epf_driver { + int (*probe)(struct pci_epf *epf); + int (*remove)(struct pci_epf *epf); + + struct device_driver driver; + struct pci_epf_ops *ops; + struct module *owner; + const struct pci_epf_device_id *id_table; +}; + +#define to_pci_epf_driver(drv) (container_of((drv), struct pci_epf_driver, \ + driver)) + +/** + * struct pci_epf_bar - represents the BAR of EPF device + * @phys_addr: physical address that should be mapped to the BAR + * @size: the size of the address space present in BAR + */ +struct pci_epf_bar { + dma_addr_t phys_addr; + size_t size; +}; + +/** + * struct pci_epf - represents the PCI EPF device + * @dev: the PCI EPF device + * @name: the name of the PCI EPF device + * @header: represents standard configuration header + * @bar: represents the BAR of EPF device + * @msi_interrupts: number of MSI interrupts required by this function + * @func_no: unique function number within this endpoint device + * @epc: the EPC device to which this EPF device is bound + * @driver: the EPF driver to which this EPF device is bound + * @list: to add pci_epf as a list of PCI endpoint functions to pci_epc + */ +struct pci_epf { + struct device dev; + const char *name; + struct pci_epf_header *header; + struct pci_epf_bar bar[6]; + u8 msi_interrupts; + u8 func_no; + + struct 
pci_epc *epc; + struct pci_epf_driver *driver; + struct list_head list; +}; + +#define to_pci_epf(epf_dev) container_of((epf_dev), struct pci_epf, dev) + +#define pci_epf_register_driver(driver) \ + __pci_epf_register_driver((driver), THIS_MODULE) + +static inline void epf_set_drvdata(struct pci_epf *epf, void *data) +{ + dev_set_drvdata(&epf->dev, data); +} + +static inline void *epf_get_drvdata(struct pci_epf *epf) +{ + return dev_get_drvdata(&epf->dev); +} + +struct pci_epf *pci_epf_create(const char *name); +void pci_epf_destroy(struct pci_epf *epf); +int __pci_epf_register_driver(struct pci_epf_driver *driver, + struct module *owner); +void pci_epf_unregister_driver(struct pci_epf_driver *driver); +void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar); +void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar); +int pci_epf_bind(struct pci_epf *epf); +void pci_epf_unbind(struct pci_epf *epf); +void pci_epf_linkup(struct pci_epf *epf); +#endif /* __LINUX_PCI_EPF_H */ -- cgit v1.2.3 From d746799116103d857be203382b09035bbe225d03 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Mon, 27 Mar 2017 15:14:59 +0530 Subject: PCI: endpoint: Introduce configfs entry for configuring EP functions Introduce a new configfs entry to configure the EP function (like configuring the standard configuration header entries) and to bind the EP function with EP controller. Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Bjorn Helgaas --- drivers/pci/endpoint/Kconfig | 9 + drivers/pci/endpoint/Makefile | 1 + drivers/pci/endpoint/pci-ep-cfs.c | 509 ++++++++++++++++++++++++++++++++++++++ include/linux/pci-ep-cfs.h | 41 +++ 4 files changed, 560 insertions(+) create mode 100644 drivers/pci/endpoint/pci-ep-cfs.c create mode 100644 include/linux/pci-ep-cfs.h (limited to 'include/linux') diff --git a/drivers/pci/endpoint/Kconfig b/drivers/pci/endpoint/Kconfig index a5442ace7077..c86bca9b7de3 100644 --- a/drivers/pci/endpoint/Kconfig +++ b/drivers/pci/endpoint/Kconfig @@ -17,4 +17,13 @@ config PCI_ENDPOINT If in doubt, say "N" to disable Endpoint support. +config PCI_ENDPOINT_CONFIGFS + bool "PCI Endpoint Configfs Support" + depends on PCI_ENDPOINT + select CONFIGFS_FS + help + This will enable the configfs entry that can be used to + configure the endpoint function and used to bind the + function with a endpoint controller. + endmenu diff --git a/drivers/pci/endpoint/Makefile b/drivers/pci/endpoint/Makefile index dc1bc16491e6..7219d51bb401 100644 --- a/drivers/pci/endpoint/Makefile +++ b/drivers/pci/endpoint/Makefile @@ -2,5 +2,6 @@ # Makefile for PCI Endpoint Support # +obj-$(CONFIG_PCI_ENDPOINT_CONFIGFS) += pci-ep-cfs.o obj-$(CONFIG_PCI_ENDPOINT) += pci-epc-core.o pci-epf-core.o\ pci-epc-mem.o diff --git a/drivers/pci/endpoint/pci-ep-cfs.c b/drivers/pci/endpoint/pci-ep-cfs.c new file mode 100644 index 000000000000..424fdd6ed1ca --- /dev/null +++ b/drivers/pci/endpoint/pci-ep-cfs.c @@ -0,0 +1,509 @@ +/** + * configfs to configure the PCI endpoint + * + * Copyright (C) 2017 Texas Instruments + * Author: Kishon Vijay Abraham I + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 of + * the License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include + +#include +#include +#include + +static struct config_group *functions_group; +static struct config_group *controllers_group; + +struct pci_epf_group { + struct config_group group; + struct pci_epf *epf; +}; + +struct pci_epc_group { + struct config_group group; + struct pci_epc *epc; + bool start; + unsigned long function_num_map; +}; + +static inline struct pci_epf_group *to_pci_epf_group(struct config_item *item) +{ + return container_of(to_config_group(item), struct pci_epf_group, group); +} + +static inline struct pci_epc_group *to_pci_epc_group(struct config_item *item) +{ + return container_of(to_config_group(item), struct pci_epc_group, group); +} + +static ssize_t pci_epc_start_store(struct config_item *item, const char *page, + size_t len) +{ + int ret; + bool start; + struct pci_epc *epc; + struct pci_epc_group *epc_group = to_pci_epc_group(item); + + epc = epc_group->epc; + + ret = kstrtobool(page, &start); + if (ret) + return ret; + + if (!start) { + pci_epc_stop(epc); + return len; + } + + ret = pci_epc_start(epc); + if (ret) { + dev_err(&epc->dev, "failed to start endpoint controller\n"); + return -EINVAL; + } + + epc_group->start = start; + + return len; +} + +static ssize_t pci_epc_start_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", + to_pci_epc_group(item)->start); +} + +CONFIGFS_ATTR(pci_epc_, start); + +static struct configfs_attribute *pci_epc_attrs[] = { + &pci_epc_attr_start, + NULL, +}; + +static int pci_epc_epf_link(struct config_item *epc_item, + struct config_item *epf_item) +{ + int ret; + u32 func_no = 0; + struct pci_epc *epc; + struct pci_epf *epf; + struct pci_epf_group *epf_group = to_pci_epf_group(epf_item); + struct pci_epc_group *epc_group = to_pci_epc_group(epc_item); + + epc = epc_group->epc; + epf = epf_group->epf; + ret = pci_epc_add_epf(epc, epf); + if (ret) + goto err_add_epf; + + func_no = find_first_zero_bit(&epc_group->function_num_map, + sizeof(epc_group->function_num_map)); + set_bit(func_no, &epc_group->function_num_map); + epf->func_no = func_no; + + ret = pci_epf_bind(epf); + if (ret) + goto err_epf_bind; + + return 0; + +err_epf_bind: + pci_epc_remove_epf(epc, epf); + +err_add_epf: + clear_bit(func_no, &epc_group->function_num_map); + + return ret; +} + +static void pci_epc_epf_unlink(struct config_item *epc_item, + struct config_item *epf_item) +{ + struct pci_epc *epc; + struct pci_epf *epf; + struct pci_epf_group *epf_group = to_pci_epf_group(epf_item); + struct pci_epc_group *epc_group = to_pci_epc_group(epc_item); + + WARN_ON_ONCE(epc_group->start); + + epc = epc_group->epc; + epf = epf_group->epf; + clear_bit(epf->func_no, &epc_group->function_num_map); + pci_epf_unbind(epf); + pci_epc_remove_epf(epc, epf); +} + +static struct configfs_item_operations pci_epc_item_ops = { + .allow_link = pci_epc_epf_link, + .drop_link = pci_epc_epf_unlink, +}; + +static struct config_item_type pci_epc_type = { + .ct_item_ops = &pci_epc_item_ops, + .ct_attrs = pci_epc_attrs, + .ct_owner = THIS_MODULE, +}; + +struct config_group *pci_ep_cfs_add_epc_group(const char *name) +{ + int ret; + struct pci_epc *epc; + struct config_group *group; + struct pci_epc_group *epc_group; + + epc_group = kzalloc(sizeof(*epc_group), GFP_KERNEL); + if (!epc_group) { + ret = -ENOMEM; + goto err; + } + + group = &epc_group->group; + + 
config_group_init_type_name(group, name, &pci_epc_type); + ret = configfs_register_group(controllers_group, group); + if (ret) { + pr_err("failed to register configfs group for %s\n", name); + goto err_register_group; + } + + epc = pci_epc_get(name); + if (IS_ERR(epc)) { + ret = PTR_ERR(epc); + goto err_epc_get; + } + + epc_group->epc = epc; + + return group; + +err_epc_get: + configfs_unregister_group(group); + +err_register_group: + kfree(epc_group); + +err: + return ERR_PTR(ret); +} +EXPORT_SYMBOL(pci_ep_cfs_add_epc_group); + +void pci_ep_cfs_remove_epc_group(struct config_group *group) +{ + struct pci_epc_group *epc_group; + + if (!group) + return; + + epc_group = container_of(group, struct pci_epc_group, group); + pci_epc_put(epc_group->epc); + configfs_unregister_group(&epc_group->group); + kfree(epc_group); +} +EXPORT_SYMBOL(pci_ep_cfs_remove_epc_group); + +#define PCI_EPF_HEADER_R(_name) \ +static ssize_t pci_epf_##_name##_show(struct config_item *item, char *page) \ +{ \ + struct pci_epf *epf = to_pci_epf_group(item)->epf; \ + if (WARN_ON_ONCE(!epf->header)) \ + return -EINVAL; \ + return sprintf(page, "0x%04x\n", epf->header->_name); \ +} + +#define PCI_EPF_HEADER_W_u32(_name) \ +static ssize_t pci_epf_##_name##_store(struct config_item *item, \ + const char *page, size_t len) \ +{ \ + u32 val; \ + int ret; \ + struct pci_epf *epf = to_pci_epf_group(item)->epf; \ + if (WARN_ON_ONCE(!epf->header)) \ + return -EINVAL; \ + ret = kstrtou32(page, 0, &val); \ + if (ret) \ + return ret; \ + epf->header->_name = val; \ + return len; \ +} + +#define PCI_EPF_HEADER_W_u16(_name) \ +static ssize_t pci_epf_##_name##_store(struct config_item *item, \ + const char *page, size_t len) \ +{ \ + u16 val; \ + int ret; \ + struct pci_epf *epf = to_pci_epf_group(item)->epf; \ + if (WARN_ON_ONCE(!epf->header)) \ + return -EINVAL; \ + ret = kstrtou16(page, 0, &val); \ + if (ret) \ + return ret; \ + epf->header->_name = val; \ + return len; \ +} + +#define PCI_EPF_HEADER_W_u8(_name) \ +static ssize_t pci_epf_##_name##_store(struct config_item *item, \ + const char *page, size_t len) \ +{ \ + u8 val; \ + int ret; \ + struct pci_epf *epf = to_pci_epf_group(item)->epf; \ + if (WARN_ON_ONCE(!epf->header)) \ + return -EINVAL; \ + ret = kstrtou8(page, 0, &val); \ + if (ret) \ + return ret; \ + epf->header->_name = val; \ + return len; \ +} + +static ssize_t pci_epf_msi_interrupts_store(struct config_item *item, + const char *page, size_t len) +{ + u8 val; + int ret; + + ret = kstrtou8(page, 0, &val); + if (ret) + return ret; + + to_pci_epf_group(item)->epf->msi_interrupts = val; + + return len; +} + +static ssize_t pci_epf_msi_interrupts_show(struct config_item *item, + char *page) +{ + return sprintf(page, "%d\n", + to_pci_epf_group(item)->epf->msi_interrupts); +} + +PCI_EPF_HEADER_R(vendorid) +PCI_EPF_HEADER_W_u16(vendorid) + +PCI_EPF_HEADER_R(deviceid) +PCI_EPF_HEADER_W_u16(deviceid) + +PCI_EPF_HEADER_R(revid) +PCI_EPF_HEADER_W_u8(revid) + +PCI_EPF_HEADER_R(progif_code) +PCI_EPF_HEADER_W_u8(progif_code) + +PCI_EPF_HEADER_R(subclass_code) +PCI_EPF_HEADER_W_u8(subclass_code) + +PCI_EPF_HEADER_R(baseclass_code) +PCI_EPF_HEADER_W_u8(baseclass_code) + +PCI_EPF_HEADER_R(cache_line_size) +PCI_EPF_HEADER_W_u8(cache_line_size) + +PCI_EPF_HEADER_R(subsys_vendor_id) +PCI_EPF_HEADER_W_u16(subsys_vendor_id) + +PCI_EPF_HEADER_R(subsys_id) +PCI_EPF_HEADER_W_u16(subsys_id) + +PCI_EPF_HEADER_R(interrupt_pin) +PCI_EPF_HEADER_W_u8(interrupt_pin) + +CONFIGFS_ATTR(pci_epf_, vendorid); +CONFIGFS_ATTR(pci_epf_, deviceid); 
+CONFIGFS_ATTR(pci_epf_, revid); +CONFIGFS_ATTR(pci_epf_, progif_code); +CONFIGFS_ATTR(pci_epf_, subclass_code); +CONFIGFS_ATTR(pci_epf_, baseclass_code); +CONFIGFS_ATTR(pci_epf_, cache_line_size); +CONFIGFS_ATTR(pci_epf_, subsys_vendor_id); +CONFIGFS_ATTR(pci_epf_, subsys_id); +CONFIGFS_ATTR(pci_epf_, interrupt_pin); +CONFIGFS_ATTR(pci_epf_, msi_interrupts); + +static struct configfs_attribute *pci_epf_attrs[] = { + &pci_epf_attr_vendorid, + &pci_epf_attr_deviceid, + &pci_epf_attr_revid, + &pci_epf_attr_progif_code, + &pci_epf_attr_subclass_code, + &pci_epf_attr_baseclass_code, + &pci_epf_attr_cache_line_size, + &pci_epf_attr_subsys_vendor_id, + &pci_epf_attr_subsys_id, + &pci_epf_attr_interrupt_pin, + &pci_epf_attr_msi_interrupts, + NULL, +}; + +static void pci_epf_release(struct config_item *item) +{ + struct pci_epf_group *epf_group = to_pci_epf_group(item); + + pci_epf_destroy(epf_group->epf); + kfree(epf_group); +} + +static struct configfs_item_operations pci_epf_ops = { + .release = pci_epf_release, +}; + +static struct config_item_type pci_epf_type = { + .ct_item_ops = &pci_epf_ops, + .ct_attrs = pci_epf_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *pci_epf_make(struct config_group *group, + const char *name) +{ + struct pci_epf_group *epf_group; + struct pci_epf *epf; + + epf_group = kzalloc(sizeof(*epf_group), GFP_KERNEL); + if (!epf_group) + return ERR_PTR(-ENOMEM); + + config_group_init_type_name(&epf_group->group, name, &pci_epf_type); + + epf = pci_epf_create(group->cg_item.ci_name); + if (IS_ERR(epf)) { + pr_err("failed to create endpoint function device\n"); + return ERR_PTR(-EINVAL); + } + + epf_group->epf = epf; + + return &epf_group->group; +} + +static void pci_epf_drop(struct config_group *group, struct config_item *item) +{ + config_item_put(item); +} + +static struct configfs_group_operations pci_epf_group_ops = { + .make_group = &pci_epf_make, + .drop_item = &pci_epf_drop, +}; + +static struct config_item_type pci_epf_group_type = { + .ct_group_ops = &pci_epf_group_ops, + .ct_owner = THIS_MODULE, +}; + +struct config_group *pci_ep_cfs_add_epf_group(const char *name) +{ + struct config_group *group; + + group = configfs_register_default_group(functions_group, name, + &pci_epf_group_type); + if (IS_ERR(group)) + pr_err("failed to register configfs group for %s function\n", + name); + + return group; +} +EXPORT_SYMBOL(pci_ep_cfs_add_epf_group); + +void pci_ep_cfs_remove_epf_group(struct config_group *group) +{ + if (IS_ERR_OR_NULL(group)) + return; + + configfs_unregister_default_group(group); +} +EXPORT_SYMBOL(pci_ep_cfs_remove_epf_group); + +static struct config_item_type pci_functions_type = { + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type pci_controllers_type = { + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type pci_ep_type = { + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem pci_ep_cfs_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "pci_ep", + .ci_type = &pci_ep_type, + }, + }, + .su_mutex = __MUTEX_INITIALIZER(pci_ep_cfs_subsys.su_mutex), +}; + +static int __init pci_ep_cfs_init(void) +{ + int ret; + struct config_group *root = &pci_ep_cfs_subsys.su_group; + + config_group_init(root); + + ret = configfs_register_subsystem(&pci_ep_cfs_subsys); + if (ret) { + pr_err("Error %d while registering subsystem %s\n", + ret, root->cg_item.ci_namebuf); + goto err; + } + + functions_group = configfs_register_default_group(root, "functions", + &pci_functions_type); + if 
(IS_ERR(functions_group)) { + ret = PTR_ERR(functions_group); + pr_err("Error %d while registering functions group\n", + ret); + goto err_functions_group; + } + + controllers_group = + configfs_register_default_group(root, "controllers", + &pci_controllers_type); + if (IS_ERR(controllers_group)) { + ret = PTR_ERR(controllers_group); + pr_err("Error %d while registering controllers group\n", + ret); + goto err_controllers_group; + } + + return 0; + +err_controllers_group: + configfs_unregister_default_group(functions_group); + +err_functions_group: + configfs_unregister_subsystem(&pci_ep_cfs_subsys); + +err: + return ret; +} +module_init(pci_ep_cfs_init); + +static void __exit pci_ep_cfs_exit(void) +{ + configfs_unregister_default_group(controllers_group); + configfs_unregister_default_group(functions_group); + configfs_unregister_subsystem(&pci_ep_cfs_subsys); +} +module_exit(pci_ep_cfs_exit); + +MODULE_DESCRIPTION("PCI EP CONFIGFS"); +MODULE_AUTHOR("Kishon Vijay Abraham I "); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/pci-ep-cfs.h b/include/linux/pci-ep-cfs.h new file mode 100644 index 000000000000..263b89ea5705 --- /dev/null +++ b/include/linux/pci-ep-cfs.h @@ -0,0 +1,41 @@ +/** + * PCI Endpoint ConfigFS header file + * + * Copyright (C) 2017 Texas Instruments + * Author: Kishon Vijay Abraham I + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 of + * the License as published by the Free Software Foundation. + */ + +#ifndef __LINUX_PCI_EP_CFS_H +#define __LINUX_PCI_EP_CFS_H + +#include + +#ifdef CONFIG_PCI_ENDPOINT_CONFIGFS +struct config_group *pci_ep_cfs_add_epc_group(const char *name); +void pci_ep_cfs_remove_epc_group(struct config_group *group); +struct config_group *pci_ep_cfs_add_epf_group(const char *name); +void pci_ep_cfs_remove_epf_group(struct config_group *group); +#else +static inline struct config_group *pci_ep_cfs_add_epc_group(const char *name) +{ + return 0; +} + +static inline void pci_ep_cfs_remove_epc_group(struct config_group *group) +{ +} + +static inline struct config_group *pci_ep_cfs_add_epf_group(const char *name) +{ + return 0; +} + +static inline void pci_ep_cfs_remove_epf_group(struct config_group *group) +{ +} +#endif +#endif /* __LINUX_PCI_EP_CFS_H */ -- cgit v1.2.3 From 3a401a2ce1cb6f6e52b78f21aa82e5d90e35c430 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Mon, 27 Mar 2017 15:15:01 +0530 Subject: PCI: endpoint: Create configfs entry for EPC device and EPF driver Invoke APIs provided by pci-ep-cfs to create configfs entry for every EPC device and EPF driver to help users in creating EPF device and binding the EPF device to the EPC device. 
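For context, a hedged sketch (not part of this series) of the minimal function-driver shape that these configfs entries expect; the pci_epf_demo and demo_* names are invented for illustration. Registering such a driver is what, once the diff below is applied, creates /sys/kernel/config/pci_ep/functions/<driver-name>/, under which users mkdir an EPF device and symlink it into a controller directory to trigger pci_epc_epf_link():

	#include <linux/module.h>
	#include <linux/pci-epf.h>

	static int demo_bind(struct pci_epf *epf)
	{
		/* point epf->header at a pci_epf_header and set up BARs here */
		return 0;
	}

	static void demo_unbind(struct pci_epf *epf)
	{
	}

	static void demo_linkup(struct pci_epf *epf)
	{
	}

	static struct pci_epf_ops demo_epf_ops = {
		.bind	= demo_bind,
		.unbind	= demo_unbind,
		.linkup	= demo_linkup,
	};

	static int demo_probe(struct pci_epf *epf)
	{
		return 0;
	}

	static int demo_remove(struct pci_epf *epf)
	{
		/* pci_epf_device_remove() calls ->remove unconditionally */
		return 0;
	}

	static const struct pci_epf_device_id demo_ids[] = {
		{ .name = "pci_epf_demo" },
		{ },
	};

	static struct pci_epf_driver demo_driver = {
		.probe		= demo_probe,
		.remove		= demo_remove,
		.id_table	= demo_ids,
		.ops		= &demo_epf_ops,
		.driver		= {
			.name	= "pci_epf_demo",
		},
	};

	static int __init demo_init(void)
	{
		return pci_epf_register_driver(&demo_driver);
	}
	module_init(demo_init);

	static void __exit demo_exit(void)
	{
		pci_epf_unregister_driver(&demo_driver);
	}
	module_exit(demo_exit);

	MODULE_LICENSE("GPL v2");

remove() is stubbed because pci_epf_device_remove() in the EPF core shown earlier calls driver->remove() unconditionally; the other stubs are defensive.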
Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Bjorn Helgaas --- drivers/pci/endpoint/pci-epc-core.c | 4 ++++ drivers/pci/endpoint/pci-epf-core.c | 4 ++++ include/linux/pci-epc.h | 2 ++ include/linux/pci-epf.h | 2 ++ 4 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c index 7c71dd94721c..caa7be10e473 100644 --- a/drivers/pci/endpoint/pci-epc-core.c +++ b/drivers/pci/endpoint/pci-epc-core.c @@ -24,6 +24,7 @@ #include #include +#include static struct class *pci_epc_class; @@ -442,6 +443,7 @@ EXPORT_SYMBOL_GPL(pci_epc_linkup); */ void pci_epc_destroy(struct pci_epc *epc) { + pci_ep_cfs_remove_epc_group(epc->group); device_unregister(&epc->dev); kfree(epc); } @@ -508,6 +510,8 @@ __pci_epc_create(struct device *dev, const struct pci_epc_ops *ops, if (ret) goto put_dev; + epc->group = pci_ep_cfs_add_epc_group(dev_name(dev)); + return epc; put_dev: diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c index a281c599a504..6877d6a5bcc9 100644 --- a/drivers/pci/endpoint/pci-epf-core.c +++ b/drivers/pci/endpoint/pci-epf-core.c @@ -24,6 +24,7 @@ #include #include +#include static struct bus_type pci_epf_bus_type; static struct device_type pci_epf_type; @@ -149,6 +150,7 @@ EXPORT_SYMBOL_GPL(pci_epf_alloc_space); */ void pci_epf_unregister_driver(struct pci_epf_driver *driver) { + pci_ep_cfs_remove_epf_group(driver->group); driver_unregister(&driver->driver); } EXPORT_SYMBOL_GPL(pci_epf_unregister_driver); @@ -178,6 +180,8 @@ int __pci_epf_register_driver(struct pci_epf_driver *driver, if (ret) return ret; + driver->group = pci_ep_cfs_add_epf_group(driver->driver.name); + return 0; } EXPORT_SYMBOL_GPL(__pci_epf_register_driver); diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 8c63d3c37f76..af5edbf3eea3 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -77,6 +77,7 @@ struct pci_epc_mem { * @ops: function pointers for performing endpoint operations * @mem: address space of the endpoint controller * @max_functions: max number of functions that can be configured in this EPC + * @group: configfs group representing the PCI EPC device * @lock: spinlock to protect pci_epc ops */ struct pci_epc { @@ -85,6 +86,7 @@ struct pci_epc { const struct pci_epc_ops *ops; struct pci_epc_mem *mem; u8 max_functions; + struct config_group *group; /* spinlock to protect against concurrent access of EP controller */ spinlock_t lock; }; diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index 5628714f7bcf..0d529cb90143 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -82,6 +82,7 @@ struct pci_epf_ops { * @driver: PCI EPF driver * @ops: set of function pointers for performing EPF operations * @owner: the owner of the module that registers the PCI EPF driver + * @group: configfs group corresponding to the PCI EPF driver * @id_table: identifies EPF devices for probing */ struct pci_epf_driver { @@ -91,6 +92,7 @@ struct pci_epf_driver { struct device_driver driver; struct pci_epf_ops *ops; struct module *owner; + struct config_group *group; const struct pci_epf_device_id *id_table; }; -- cgit v1.2.3 From e0c34e900611a58c93adf463d096c5843eb1967a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 5 Apr 2017 10:23:12 -0300 Subject: usb: get rid of some ReST doc build errors We need a space before a numbered list to avoid those warnings: ./drivers/usb/core/message.c:478: ERROR: Unexpected indentation.
./drivers/usb/core/message.c:479: WARNING: Block quote ends without a blank line; unexpected unindent. ./include/linux/usb/composite.h:455: ERROR: Unexpected indentation. ./include/linux/usb/composite.h:456: WARNING: Block quote ends without a blank line; unexpected unindent. Signed-off-by: Mauro Carvalho Chehab Acked-by: Greg Kroah-Hartman Signed-off-by: Jonathan Corbet --- drivers/usb/core/message.c | 1 + include/linux/usb/composite.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index 2184ef40a82a..4c38ea41ae96 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -474,6 +474,7 @@ EXPORT_SYMBOL_GPL(usb_sg_init); * significantly improve USB throughput. * * There are three kinds of completion for this function. + * * (1) success, where io->status is zero. The number of io->bytes * transferred is as requested. * (2) error, where io->status is a negative errno value. The number diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index 4616a49a1c2e..30a063e98c19 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -451,6 +451,7 @@ static inline struct usb_composite_driver *to_cdriver( * sure doing that won't hurt too much. * * One notion for how to handle Wireless USB devices involves: + * * (a) a second gadget here, discovery mechanism TBD, but likely * needing separate "register/unregister WUSB gadget" calls; * (b) updates to usb_gadget to include flags "is it wireless", -- cgit v1.2.3 From 3e208a00cffd1358cecf79074e50b093212043c1 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 5 Apr 2017 10:23:13 -0300 Subject: usb: composite.h: fix two warnings when building docs By definition, we use /* private: */ tag when we won't be documenting a parameter. However, those two parameters are documented: ./include/linux/usb/composite.h:510: warning: Excess struct/union/enum/typedef member 'setup_pending' description in 'usb_composite_dev' ./include/linux/usb/composite.h:510: warning: Excess struct/union/enum/typedef member 'os_desc_pending' description in 'usb_composite_dev' So, we need to use /* public: */ to avoid a warning. Signed-off-by: Mauro Carvalho Chehab Acked-by: Greg Kroah-Hartman Signed-off-by: Jonathan Corbet --- include/linux/usb/composite.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index 30a063e98c19..f665d2ceac20 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -504,8 +504,9 @@ struct usb_composite_dev { /* protects deactivations and delayed_status counts*/ spinlock_t lock; - unsigned setup_pending:1; - unsigned os_desc_pending:1; + /* public: */ + unsigned int setup_pending:1; + unsigned int os_desc_pending:1; }; extern int usb_string_id(struct usb_composite_dev *c); -- cgit v1.2.3 From 0cb300623e3bb460fd9853bbde2fd1973e3bbcd8 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 5 Apr 2017 10:23:14 -0300 Subject: usb: gadget.h: be consistent at kernel doc macros There's one value that uses spaces instead of tabs to indent. That causes the following warning: ./include/linux/usb/gadget.h:193: ERROR: Unexpected indentation.
Signed-off-by: Mauro Carvalho Chehab Acked-by: Greg Kroah-Hartman Signed-off-by: Jonathan Corbet --- include/linux/usb/gadget.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index e4516e9ded0f..fbc22a39e7bc 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -188,7 +188,7 @@ struct usb_ep_caps { * @caps:The structure describing types and directions supported by endoint. * @maxpacket:The maximum packet size used on this endpoint. The initial * value can sometimes be reduced (hardware allowing), according to - * the endpoint descriptor used to configure the endpoint. + * the endpoint descriptor used to configure the endpoint. * @maxpacket_limit:The maximum packet size value which can be handled by this * endpoint. It's set once by UDC driver when endpoint is initialized, and * should not be changed. Should not be confused with maxpacket. -- cgit v1.2.3 From 3ea6b7001ef5da9f9816ee3c4fe731f4fe08b865 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Thu, 6 Apr 2017 13:19:35 +0900 Subject: PM / devfreq: Move struct devfreq_governor to devfreq directory This patch moves the struct devfreq_governor from the header file to the devfreq directory because this structure is private data and it has to be accessed only by the devfreq core. Signed-off-by: Chanwoo Choi Signed-off-by: MyungJoo Ham --- drivers/devfreq/governor.h | 29 +++++++++++++++++++++++++++++ include/linux/devfreq.h | 30 +----------------------------- 2 files changed, 30 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/devfreq/governor.h b/drivers/devfreq/governor.h index 71576b8bdfef..a4f2fa1091e4 100644 --- a/drivers/devfreq/governor.h +++ b/drivers/devfreq/governor.h @@ -25,6 +25,35 @@ #define DEVFREQ_GOV_SUSPEND 0x4 #define DEVFREQ_GOV_RESUME 0x5 +/** + * struct devfreq_governor - Devfreq policy governor + * @node: list node - contains registered devfreq governors + * @name: Governor's name + * @immutable: Immutable flag for governor. If the value is 1, + * this govenror is never changeable to other governor. + * @get_target_freq: Returns desired operating frequency for the device. + * Basically, get_target_freq will run + * devfreq_dev_profile.get_dev_status() to get the + * status of the device (load = busy_time / total_time). + * If no_central_polling is set, this callback is called + * only with update_devfreq() notified by OPP. + * @event_handler: Callback for devfreq core framework to notify events + * to governors. Events include per device governor + * init and exit, opp changes out of devfreq, suspend + * and resume of per device devfreq during device idle. + * + * Note that the callbacks are called with devfreq->lock locked by devfreq.
+ */ +struct devfreq_governor { + struct list_head node; + + const char name[DEVFREQ_NAME_LEN]; + const unsigned int immutable; + int (*get_target_freq)(struct devfreq *this, unsigned long *freq); + int (*event_handler)(struct devfreq *devfreq, + unsigned int event, void *data); +}; + /* Caution: devfreq->lock must be locked before calling update_devfreq */ extern int update_devfreq(struct devfreq *devfreq); diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h index e0acb0e5243b..6c220e4ebb6b 100644 --- a/include/linux/devfreq.h +++ b/include/linux/devfreq.h @@ -27,6 +27,7 @@ #define DEVFREQ_POSTCHANGE (1) struct devfreq; +struct devfreq_governor; /** * struct devfreq_dev_status - Data given from devfreq user device to @@ -100,35 +101,6 @@ struct devfreq_dev_profile { unsigned int max_state; }; -/** - * struct devfreq_governor - Devfreq policy governor - * @node: list node - contains registered devfreq governors - * @name: Governor's name - * @immutable: Immutable flag for governor. If the value is 1, - * this govenror is never changeable to other governor. - * @get_target_freq: Returns desired operating frequency for the device. - * Basically, get_target_freq will run - * devfreq_dev_profile.get_dev_status() to get the - * status of the device (load = busy_time / total_time). - * If no_central_polling is set, this callback is called - * only with update_devfreq() notified by OPP. - * @event_handler: Callback for devfreq core framework to notify events - * to governors. Events include per device governor - * init and exit, opp changes out of devfreq, suspend - * and resume of per device devfreq during device idle. - * - * Note that the callbacks are called with devfreq->lock locked by devfreq. - */ -struct devfreq_governor { - struct list_head node; - - const char name[DEVFREQ_NAME_LEN]; - const unsigned int immutable; - int (*get_target_freq)(struct devfreq *this, unsigned long *freq); - int (*event_handler)(struct devfreq *devfreq, - unsigned int event, void *data); -}; - /** * struct devfreq - Device devfreq structure * @node: list node - contains the devices with devfreq that have been -- cgit v1.2.3 From 5b3dc2f37d7daf76a679cd204492ec5dff06bb8a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 10 Apr 2017 11:11:17 +0300 Subject: net: neigh: make ->hh_len 32-bit Using 16-bit ->hh_len doesn't save any memory, save some .text instead: add/remove: 0/0 grow/shrink: 1/6 up/down: 2/-19 (-17) function old new delta neigh_update 2312 2314 +2 fwnet_header_cache 199 197 -2 eth_header_cache 101 99 -2 ip6_finish_output2 2371 2368 -3 vrf_finish_output6 1522 1518 -4 vrf_finish_output 1413 1409 -4 ip_finish_output2 1627 1623 -4 Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +-- include/net/neighbour.h | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cc07c3be2705..8ea8a8b70755 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -237,8 +237,7 @@ struct netdev_hw_addr_list { netdev_hw_addr_list_for_each(ha, &(dev)->mc) struct hh_cache { - u16 hh_len; - u16 __pad; + unsigned int hh_len; seqlock_t hh_lock; /* cached hardware header; allow for machine alignment needs. 
*/ diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 9496179c7b4e..e4dd3a214034 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -450,7 +450,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb) { unsigned int seq; - int hh_len; + unsigned int hh_len; do { seq = read_seqbegin(&hh->hh_lock); @@ -459,7 +459,7 @@ static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb /* this is inlined by gcc */ memcpy(skb->data - HH_DATA_MOD, hh->hh_data, HH_DATA_MOD); } else { - int hh_alen = HH_DATA_ALIGN(hh_len); + unsigned int hh_alen = HH_DATA_ALIGN(hh_len); memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); } -- cgit v1.2.3 From d92be7a41ef15463eb816a4a2d42bf094b56dfce Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 10 Apr 2017 11:25:26 +0300 Subject: net: make struct net_device::min_header_len 8-bit This field is never big enough to warrant 16-bitness. 8-bit accesses enjoy shorted encoding on i386/x86_64 than 16-bit accesses: add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-10 (-10) function old new delta loopback_setup 169 164 -5 ether_setup 148 143 -5 Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8ea8a8b70755..b0aa089ce67f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1715,7 +1715,7 @@ struct net_device { unsigned int max_mtu; unsigned short type; unsigned short hard_header_len; - unsigned short min_header_len; + unsigned char min_header_len; unsigned short needed_headroom; unsigned short needed_tailroom; -- cgit v1.2.3 From 993225adf4af20a0e50e37c3d4894b79c98e01c9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Apr 2017 10:50:33 +0200 Subject: KVM: x86: rename kvm_vcpu_request_scan_ioapic() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's rename it into a proper arch specific callback. 
Signed-off-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/ioapic.c | 2 +- include/linux/kvm_host.h | 4 ++-- virt/kvm/eventfd.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index dc29a2785b81..bdff437acbcb 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -266,7 +266,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) spin_unlock(&ioapic->lock); } -void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) +void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) { if (!ioapic_in_kernel(kvm)) return; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7e74ae4d99bb..397b7b5b1933 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -502,10 +502,10 @@ int __must_check vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_IOAPIC -void kvm_vcpu_request_scan_ioapic(struct kvm *kvm); +void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm); void kvm_arch_post_irq_routing_update(struct kvm *kvm); #else -static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) +static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) { } static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 4d28a9ddbee0..a8d540398bbd 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -490,7 +490,7 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm, mutex_lock(&kvm->irq_lock); hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); mutex_unlock(&kvm->irq_lock); - kvm_vcpu_request_scan_ioapic(kvm); + kvm_arch_post_irq_ack_notifier_list_update(kvm); } void kvm_unregister_irq_ack_notifier(struct kvm *kvm, @@ -500,7 +500,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, hlist_del_init_rcu(&kian->link); mutex_unlock(&kvm->irq_lock); synchronize_srcu(&kvm->irq_srcu); - kvm_vcpu_request_scan_ioapic(kvm); + kvm_arch_post_irq_ack_notifier_list_update(kvm); } #endif -- cgit v1.2.3 From b3f80c8f75efb2e6a817a0e48bf36cd30685a138 Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Tue, 28 Mar 2017 17:59:31 +0200 Subject: serdev: add serdev_device_wait_until_sent Add a method that waits until the transmission buffer has been sent. Note that the change in ttyport_write_wakeup is related, since tty_wait_until_sent would hang without that change.
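A hedged usage sketch, not part of this patch: a serdev client that must know its last command has fully left the UART before, say, cutting the device's power. example_send_cmd() and the 100 ms budget are invented; the two serdev calls are the ones declared in the serdev.h diff below and in the existing write_buf API, and error handling is elided for brevity.

	#include <linux/jiffies.h>
	#include <linux/serdev.h>

	static void example_send_cmd(struct serdev_device *serdev,
				     const unsigned char *cmd, size_t len)
	{
		serdev_device_write_buf(serdev, cmd, len);
		/* blocks in tty_wait_until_sent() via the new controller op;
		 * the timeout is in jiffies, as with tty_wait_until_sent() */
		serdev_device_wait_until_sent(serdev, msecs_to_jiffies(100));
	}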
Acked-by: Rob Herring Acked-by: Pavel Machek Signed-off-by: Sebastian Reichel Acked-by: Greg Kroah-Hartman Signed-off-by: Marcel Holtmann --- drivers/tty/serdev/core.c | 11 +++++++++++ drivers/tty/serdev/serdev-ttyport.c | 18 ++++++++++++++---- include/linux/serdev.h | 3 +++ 3 files changed, 28 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serdev/core.c b/drivers/tty/serdev/core.c index f4c6c90add78..a63b74031e22 100644 --- a/drivers/tty/serdev/core.c +++ b/drivers/tty/serdev/core.c @@ -173,6 +173,17 @@ void serdev_device_set_flow_control(struct serdev_device *serdev, bool enable) } EXPORT_SYMBOL_GPL(serdev_device_set_flow_control); +void serdev_device_wait_until_sent(struct serdev_device *serdev, long timeout) +{ + struct serdev_controller *ctrl = serdev->ctrl; + + if (!ctrl || !ctrl->ops->wait_until_sent) + return; + + ctrl->ops->wait_until_sent(ctrl, timeout); +} +EXPORT_SYMBOL_GPL(serdev_device_wait_until_sent); + static int serdev_drv_probe(struct device *dev) { const struct serdev_device_driver *sdrv = to_serdev_device_driver(dev->driver); diff --git a/drivers/tty/serdev/serdev-ttyport.c b/drivers/tty/serdev/serdev-ttyport.c index d05393594f15..50dc75c4d204 100644 --- a/drivers/tty/serdev/serdev-ttyport.c +++ b/drivers/tty/serdev/serdev-ttyport.c @@ -14,6 +14,7 @@ #include #include #include +#include #define SERPORT_ACTIVE 1 @@ -46,11 +47,11 @@ static void ttyport_write_wakeup(struct tty_port *port) struct serdev_controller *ctrl = port->client_data; struct serport *serport = serdev_controller_get_drvdata(ctrl); - if (!test_and_clear_bit(TTY_DO_WRITE_WAKEUP, &port->tty->flags)) - return; - - if (test_bit(SERPORT_ACTIVE, &serport->flags)) + if (test_and_clear_bit(TTY_DO_WRITE_WAKEUP, &port->tty->flags) && + test_bit(SERPORT_ACTIVE, &serport->flags)) serdev_controller_write_wakeup(ctrl); + + wake_up_interruptible_poll(&port->tty->write_wait, POLLOUT); } static const struct tty_port_client_operations client_ops = { @@ -167,6 +168,14 @@ static void ttyport_set_flow_control(struct serdev_controller *ctrl, bool enable tty_set_termios(tty, &ktermios); } +static void ttyport_wait_until_sent(struct serdev_controller *ctrl, long timeout) +{ + struct serport *serport = serdev_controller_get_drvdata(ctrl); + struct tty_struct *tty = serport->tty; + + tty_wait_until_sent(tty, timeout); +} + static const struct serdev_controller_ops ctrl_ops = { .write_buf = ttyport_write_buf, .write_flush = ttyport_write_flush, @@ -175,6 +184,7 @@ static const struct serdev_controller_ops ctrl_ops = { .close = ttyport_close, .set_flow_control = ttyport_set_flow_control, .set_baudrate = ttyport_set_baudrate, + .wait_until_sent = ttyport_wait_until_sent, }; struct device *serdev_tty_port_register(struct tty_port *port, diff --git a/include/linux/serdev.h b/include/linux/serdev.h index 9519da6253a8..a308b206d204 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -81,6 +81,7 @@ struct serdev_controller_ops { void (*close)(struct serdev_controller *); void (*set_flow_control)(struct serdev_controller *, bool); unsigned int (*set_baudrate)(struct serdev_controller *, unsigned int); + void (*wait_until_sent)(struct serdev_controller *, long); }; /** @@ -186,6 +187,7 @@ int serdev_device_open(struct serdev_device *); void serdev_device_close(struct serdev_device *); unsigned int serdev_device_set_baudrate(struct serdev_device *, unsigned int); void serdev_device_set_flow_control(struct serdev_device *, bool); +void serdev_device_wait_until_sent(struct serdev_device 
*, long); int serdev_device_write_buf(struct serdev_device *, const unsigned char *, size_t); void serdev_device_write_flush(struct serdev_device *); int serdev_device_write_room(struct serdev_device *); @@ -223,6 +225,7 @@ static inline unsigned int serdev_device_set_baudrate(struct serdev_device *sdev return 0; } static inline void serdev_device_set_flow_control(struct serdev_device *sdev, bool enable) {} +static inline void serdev_device_wait_until_sent(struct serdev_device *sdev, long timeout) {} static inline int serdev_device_write_buf(struct serdev_device *sdev, const unsigned char *buf, size_t count) { return -ENODEV; -- cgit v1.2.3 From 5659dab26f09a60db8bd1600e1ce89802fab1c7f Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Tue, 28 Mar 2017 17:59:32 +0200 Subject: serdev: implement get/set tiocm Add method for getting and setting tiocm. Acked-by: Pavel Machek Acked-by: Rob Herring Signed-off-by: Sebastian Reichel Acked-by: Greg Kroah-Hartman Signed-off-by: Marcel Holtmann --- drivers/tty/serdev/core.c | 22 ++++++++++++++++++++++ drivers/tty/serdev/serdev-ttyport.c | 24 ++++++++++++++++++++++++ include/linux/serdev.h | 13 +++++++++++++ 3 files changed, 59 insertions(+) (limited to 'include/linux') diff --git a/drivers/tty/serdev/core.c b/drivers/tty/serdev/core.c index a63b74031e22..1e1cbae3a0ea 100644 --- a/drivers/tty/serdev/core.c +++ b/drivers/tty/serdev/core.c @@ -184,6 +184,28 @@ void serdev_device_wait_until_sent(struct serdev_device *serdev, long timeout) } EXPORT_SYMBOL_GPL(serdev_device_wait_until_sent); +int serdev_device_get_tiocm(struct serdev_device *serdev) +{ + struct serdev_controller *ctrl = serdev->ctrl; + + if (!ctrl || !ctrl->ops->get_tiocm) + return -ENOTSUPP; + + return ctrl->ops->get_tiocm(ctrl); +} +EXPORT_SYMBOL_GPL(serdev_device_get_tiocm); + +int serdev_device_set_tiocm(struct serdev_device *serdev, int set, int clear) +{ + struct serdev_controller *ctrl = serdev->ctrl; + + if (!ctrl || !ctrl->ops->set_tiocm) + return -ENOTSUPP; + + return ctrl->ops->set_tiocm(ctrl, set, clear); +} +EXPORT_SYMBOL_GPL(serdev_device_set_tiocm); + static int serdev_drv_probe(struct device *dev) { const struct serdev_device_driver *sdrv = to_serdev_device_driver(dev->driver); diff --git a/drivers/tty/serdev/serdev-ttyport.c b/drivers/tty/serdev/serdev-ttyport.c index 50dc75c4d204..487c88f6aa0e 100644 --- a/drivers/tty/serdev/serdev-ttyport.c +++ b/drivers/tty/serdev/serdev-ttyport.c @@ -176,6 +176,28 @@ static void ttyport_wait_until_sent(struct serdev_controller *ctrl, long timeout tty_wait_until_sent(tty, timeout); } +static int ttyport_get_tiocm(struct serdev_controller *ctrl) +{ + struct serport *serport = serdev_controller_get_drvdata(ctrl); + struct tty_struct *tty = serport->tty; + + if (!tty->ops->tiocmget) + return -ENOTSUPP; + + return tty->driver->ops->tiocmget(tty); +} + +static int ttyport_set_tiocm(struct serdev_controller *ctrl, unsigned int set, unsigned int clear) +{ + struct serport *serport = serdev_controller_get_drvdata(ctrl); + struct tty_struct *tty = serport->tty; + + if (!tty->ops->tiocmset) + return -ENOTSUPP; + + return tty->driver->ops->tiocmset(tty, set, clear); +} + static const struct serdev_controller_ops ctrl_ops = { .write_buf = ttyport_write_buf, .write_flush = ttyport_write_flush, @@ -185,6 +207,8 @@ static const struct serdev_controller_ops ctrl_ops = { .set_flow_control = ttyport_set_flow_control, .set_baudrate = ttyport_set_baudrate, .wait_until_sent = ttyport_wait_until_sent, + .get_tiocm = ttyport_get_tiocm, + .set_tiocm = 
ttyport_set_tiocm, }; struct device *serdev_tty_port_register(struct tty_port *port, diff --git a/include/linux/serdev.h b/include/linux/serdev.h index a308b206d204..e29a270f603c 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -15,6 +15,7 @@ #include #include +#include struct serdev_controller; struct serdev_device; @@ -82,6 +83,8 @@ struct serdev_controller_ops { void (*set_flow_control)(struct serdev_controller *, bool); unsigned int (*set_baudrate)(struct serdev_controller *, unsigned int); void (*wait_until_sent)(struct serdev_controller *, long); + int (*get_tiocm)(struct serdev_controller *); + int (*set_tiocm)(struct serdev_controller *, unsigned int, unsigned int); }; /** @@ -188,6 +191,8 @@ void serdev_device_close(struct serdev_device *); unsigned int serdev_device_set_baudrate(struct serdev_device *, unsigned int); void serdev_device_set_flow_control(struct serdev_device *, bool); void serdev_device_wait_until_sent(struct serdev_device *, long); +int serdev_device_get_tiocm(struct serdev_device *); +int serdev_device_set_tiocm(struct serdev_device *, int, int); int serdev_device_write_buf(struct serdev_device *, const unsigned char *, size_t); void serdev_device_write_flush(struct serdev_device *); int serdev_device_write_room(struct serdev_device *); @@ -226,6 +231,14 @@ static inline unsigned int serdev_device_set_baudrate(struct serdev_device *sdev } static inline void serdev_device_set_flow_control(struct serdev_device *sdev, bool enable) {} static inline void serdev_device_wait_until_sent(struct serdev_device *sdev, long timeout) {} +static inline int serdev_device_get_tiocm(struct serdev_device *serdev) +{ + return -ENOTSUPP; +} +static inline int serdev_device_set_tiocm(struct serdev_device *serdev, int set, int clear) +{ + return -ENOTSUPP; +} static inline int serdev_device_write_buf(struct serdev_device *sdev, const unsigned char *buf, size_t count) { return -ENODEV; -- cgit v1.2.3 From 756db778748949f6403b727fc6251674dbfcb1a2 Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Tue, 28 Mar 2017 17:59:33 +0200 Subject: serdev: add helpers for cts and rts handling Add serdev helper functions for handling of cts and rts lines using the serdev's tiocm functions. 
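A hedged usage sketch, not part of this patch: using the new helpers for a manual handshake at open time, asserting RTS and then polling the peer's CTS. example_handshake() is invented; serdev_device_set_rts() and serdev_device_wait_for_cts() are the helpers added in the diff below.

	#include <linux/serdev.h>

	static int example_handshake(struct serdev_device *serdev)
	{
		int ret;

		/* raise RTS to tell the peer we are ready ... */
		ret = serdev_device_set_rts(serdev, true);
		if (ret)
			return ret;

		/* ... then poll CTS via TIOCM, for at most 100 ms */
		return serdev_device_wait_for_cts(serdev, true, 100);
	}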
Acked-by: Rob Herring Signed-off-by: Sebastian Reichel Acked-by: Greg Kroah-Hartman Signed-off-by: Marcel Holtmann --- include/linux/serdev.h | 31 +++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index e29a270f603c..37395b8eb8f1 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -16,6 +16,7 @@ #include #include #include +#include struct serdev_controller; struct serdev_device; @@ -254,6 +255,36 @@ static inline int serdev_device_write_room(struct serdev_device *sdev) #endif /* CONFIG_SERIAL_DEV_BUS */ +static inline bool serdev_device_get_cts(struct serdev_device *serdev) +{ + int status = serdev_device_get_tiocm(serdev); + return !!(status & TIOCM_CTS); +} + +static inline int serdev_device_wait_for_cts(struct serdev_device *serdev, bool state, int timeout_ms) +{ + unsigned long timeout; + bool signal; + + timeout = jiffies + msecs_to_jiffies(timeout_ms); + while (time_is_after_jiffies(timeout)) { + signal = serdev_device_get_cts(serdev); + if (signal == state) + return 0; + usleep_range(1000, 2000); + } + + return -ETIMEDOUT; +} + +static inline int serdev_device_set_rts(struct serdev_device *serdev, bool enable) +{ + if (enable) + return serdev_device_set_tiocm(serdev, TIOCM_RTS, 0); + else + return serdev_device_set_tiocm(serdev, 0, TIOCM_RTS); +} + /* * serdev hooks into TTY core */ -- cgit v1.2.3 From 006358b35c73ab75544fb4509483a81ef1a9c0b2 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 7 Apr 2017 15:33:31 -0700 Subject: libnvdimm: add support for clear poison list and badblocks for device dax Provide a mechanism to clear the poison list via the ndctl ND_CMD_CLEAR_ERROR call. We will update the poison list and also the badblocks at region level if the region is in dax mode or in pmem mode and not active. In other words, we force badblocks to be cleared through write requests if the address is currently accessed through a block device, otherwise it can only be done via the ioctl+dsm path.
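To make the two pieces of bookkeeping concrete, here is a hedged, bus-internal sketch of the sequence the ioctl path in this patch performs. example_clear_range() is invented, and it assumes the drivers/nvdimm-private nd-core.h declarations of nvdimm_bus_lock()/nvdimm_bus_unlock(), which take the reconfig_mutex that both double-underscore helpers assert.

	static void example_clear_range(struct nvdimm_bus *nvdimm_bus,
					phys_addr_t addr, unsigned int len)
	{
		struct resource res = {
			.start = addr,
			.end = addr + len - 1,
		};

		nvdimm_bus_lock(&nvdimm_bus->dev);
		/* 1) drop the range from the bus-level poison list */
		__nvdimm_forget_poison(nvdimm_bus, addr, len);
		/* 2) clear it from the badblocks of any pmem region covering it */
		__nvdimm_bus_badblocks_clear(nvdimm_bus, &res);
		nvdimm_bus_unlock(&nvdimm_bus->dev);
	}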
Signed-off-by: Dave Jiang Reviewed-by: Johannes Thumshirn Signed-off-by: Dan Williams --- drivers/nvdimm/bus.c | 83 ++++++++++++++++++++++++++++++++++++++++++----- drivers/nvdimm/core.c | 17 +++++++--- drivers/nvdimm/region.c | 25 ++++++++++++++ include/linux/libnvdimm.h | 7 +++- 4 files changed, 118 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 351bac8f6503..5ad2e5909e1a 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -27,6 +27,7 @@ #include #include "nd-core.h" #include "nd.h" +#include "pfn.h" int nvdimm_major; static int nvdimm_bus_major; @@ -218,11 +219,20 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, if (cmd_rc < 0) return cmd_rc; - nvdimm_clear_from_poison_list(nvdimm_bus, phys, len); + nvdimm_forget_poison(nvdimm_bus, phys, len); return clear_err.cleared; } EXPORT_SYMBOL_GPL(nvdimm_clear_poison); +void __nvdimm_bus_badblocks_clear(struct nvdimm_bus *nvdimm_bus, + struct resource *res) +{ + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); + device_for_each_child(&nvdimm_bus->dev, (void *)res, + nvdimm_region_badblocks_clear); +} +EXPORT_SYMBOL_GPL(__nvdimm_bus_badblocks_clear); + static int nvdimm_bus_match(struct device *dev, struct device_driver *drv); static struct bus_type nvdimm_bus_type = { @@ -769,16 +779,55 @@ void wait_nvdimm_bus_probe_idle(struct device *dev) } while (true); } -static int pmem_active(struct device *dev, void *data) +static int nd_pmem_forget_poison_check(struct device *dev, void *data) { - if (is_nd_pmem(dev) && dev->driver) + struct nd_cmd_clear_error *clear_err = + (struct nd_cmd_clear_error *)data; + struct nd_btt *nd_btt = is_nd_btt(dev) ? to_nd_btt(dev) : NULL; + struct nd_pfn *nd_pfn = is_nd_pfn(dev) ? to_nd_pfn(dev) : NULL; + struct nd_dax *nd_dax = is_nd_dax(dev) ? 
to_nd_dax(dev) : NULL; + struct nd_namespace_common *ndns = NULL; + struct nd_namespace_io *nsio; + resource_size_t offset = 0, end_trunc = 0, start, end, pstart, pend; + + if (nd_dax || !dev->driver) + return 0; + + start = clear_err->address; + end = clear_err->address + clear_err->cleared - 1; + + if (nd_btt || nd_pfn || nd_dax) { + if (nd_btt) + ndns = nd_btt->ndns; + else if (nd_pfn) + ndns = nd_pfn->ndns; + else if (nd_dax) + ndns = nd_dax->nd_pfn.ndns; + + if (!ndns) + return 0; + } else + ndns = to_ndns(dev); + + nsio = to_nd_namespace_io(&ndns->dev); + pstart = nsio->res.start + offset; + pend = nsio->res.end - end_trunc; + + if ((pstart >= start) && (pend <= end)) return -EBUSY; + return 0; + +} + +static int nd_ns_forget_poison_check(struct device *dev, void *data) +{ + return device_for_each_child(dev, data, nd_pmem_forget_poison_check); } /* set_config requires an idle interleave set */ static int nd_cmd_clear_to_send(struct nvdimm_bus *nvdimm_bus, - struct nvdimm *nvdimm, unsigned int cmd) + struct nvdimm *nvdimm, unsigned int cmd, void *data) { struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; @@ -792,8 +841,8 @@ static int nd_cmd_clear_to_send(struct nvdimm_bus *nvdimm_bus, /* require clear error to go through the pmem driver */ if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR) - return device_for_each_child(&nvdimm_bus->dev, NULL, - pmem_active); + return device_for_each_child(&nvdimm_bus->dev, data, + nd_ns_forget_poison_check); if (!nvdimm || cmd != ND_CMD_SET_CONFIG_DATA) return 0; @@ -820,7 +869,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, const char *cmd_name, *dimm_name; unsigned long cmd_mask; void *buf; - int rc, i; + int rc, i, cmd_rc; if (nvdimm) { desc = nd_cmd_dimm_desc(cmd); @@ -927,13 +976,29 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, } nvdimm_bus_lock(&nvdimm_bus->dev); - rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, cmd); + rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, cmd, buf); if (rc) goto out_unlock; - rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len, NULL); + rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len, &cmd_rc); if (rc < 0) goto out_unlock; + + if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR && cmd_rc >= 0) { + struct nd_cmd_clear_error *clear_err = buf; + struct resource res; + + if (clear_err->cleared) { + /* clearing the poison list we keep track of */ + __nvdimm_forget_poison(nvdimm_bus, clear_err->address, + clear_err->cleared); + + /* now sync the badblocks lists */ + res.start = clear_err->address; + res.end = clear_err->address + clear_err->cleared - 1; + __nvdimm_bus_badblocks_clear(nvdimm_bus, &res); + } + } nvdimm_bus_unlock(&nvdimm_bus->dev); if (copy_to_user(p, buf, buf_len)) diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 9303cfeb8bee..40a3da088fd2 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -574,14 +574,15 @@ int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) } EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison); -void nvdimm_clear_from_poison_list(struct nvdimm_bus *nvdimm_bus, - phys_addr_t start, unsigned int len) +void __nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start, + unsigned int len) { struct list_head *poison_list = &nvdimm_bus->poison_list; u64 clr_end = start + len - 1; struct nd_poison *pl, *next; - nvdimm_bus_lock(&nvdimm_bus->dev); + lockdep_assert_held(&nvdimm_bus->reconfig_mutex); + WARN_ON_ONCE(list_empty(poison_list)); /* @@ -634,9 +635,17 @@ void 
nvdimm_clear_from_poison_list(struct nvdimm_bus *nvdimm_bus, continue; } } +} +EXPORT_SYMBOL_GPL(__nvdimm_forget_poison); + +void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, + phys_addr_t start, unsigned int len) +{ + nvdimm_bus_lock(&nvdimm_bus->dev); + __nvdimm_forget_poison(nvdimm_bus, start, len); nvdimm_bus_unlock(&nvdimm_bus->dev); } -EXPORT_SYMBOL_GPL(nvdimm_clear_from_poison_list); +EXPORT_SYMBOL_GPL(nvdimm_forget_poison); #ifdef CONFIG_BLK_DEV_INTEGRITY int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index 869a886c292e..23c4307d254c 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -131,6 +131,31 @@ static void nd_region_notify(struct device *dev, enum nvdimm_event event) device_for_each_child(dev, &event, child_notify); } +int nvdimm_region_badblocks_clear(struct device *dev, void *data) +{ + struct resource *res = (struct resource *)data; + struct nd_region *nd_region; + resource_size_t ndr_end; + sector_t sector; + + /* make sure device is a region */ + if (!is_nd_pmem(dev)) + return 0; + + nd_region = to_nd_region(dev); + ndr_end = nd_region->ndr_start + nd_region->ndr_size - 1; + + /* make sure we are in the region */ + if (res->start < nd_region->ndr_start || res->end > ndr_end) + return 0; + + sector = (res->start - nd_region->ndr_start) >> 9; + badblocks_clear(&nd_region->bb, sector, resource_size(res) >> 9); + + return 0; +} +EXPORT_SYMBOL_GPL(nvdimm_region_badblocks_clear); + static struct nd_device_driver nd_region_driver = { .probe = nd_region_probe, .remove = nd_region_remove, diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 77e7af32543f..1c609e89048a 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -120,7 +120,9 @@ static inline struct nd_blk_region_desc *to_blk_region_desc( } int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length); -void nvdimm_clear_from_poison_list(struct nvdimm_bus *nvdimm_bus, +void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, + phys_addr_t start, unsigned int len); +void __nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start, unsigned int len); struct nvdimm_bus *nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc); @@ -162,4 +164,7 @@ void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane); u64 nd_fletcher64(void *addr, size_t len, bool le); void nvdimm_flush(struct nd_region *nd_region); int nvdimm_has_flush(struct nd_region *nd_region); +int nvdimm_region_badblocks_clear(struct device *dev, void *data); +void __nvdimm_bus_badblocks_clear(struct nvdimm_bus *nvdimm_bus, + struct resource *res); #endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From 7b6be8444e0f0dd675b54d059793423d3c9b4c03 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 11 Apr 2017 09:49:49 -0700 Subject: dax: refactor dax-fs into a generic provider of 'struct dax_device' instances We want dax capable drivers to be able to publish a set of dax operations [1]. However, we do not want to further abuse block_devices to advertise these operations. Instead we will attach these operations to a dax device and add a lookup mechanism to go from block device path to a dax device. A dax capable driver like pmem or brd is responsible for registering a dax device, alongside a block device, and then a dax capable filesystem is responsible for retrieving the dax device by path name if it wants to call dax_operations. 
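As a rough sketch of the resulting calling convention, using the dax_read_lock()/dax_alive() helpers introduced below (do_dax_operation() is a hypothetical callback, not part of this patch), a holder would bracket its operations like so:

	/* sketch: operations run under the srcu read lock so that
	 * kill_dax() can drain them before the device is torn down */
	int id = dax_read_lock();

	if (dax_alive(dax_dev))
		rc = do_dax_operation(dax_get_private(dax_dev));
	dax_read_unlock(id);
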
For now, we refactor the dax pseudo-fs to be a generic facility, rather than an implementation detail, of the device-dax use case. Where a "dax device" is just an inode + dax infrastructure, and "Device DAX" is a mapping service layered on top of that base 'struct dax_device'. "Filesystem DAX" is then a mapping service that layers a filesystem on top of that same base device. Filesystem DAX is associated with a block_device for now, but perhaps directly to a dax device in the future, or for new pmem-only filesystems. [1]: https://lkml.org/lkml/2017/1/19/880 Suggested-by: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/Makefile | 2 +- drivers/dax/Kconfig | 10 +- drivers/dax/Makefile | 5 +- drivers/dax/dax.c | 864 -------------------------------------------- drivers/dax/dax.h | 20 +- drivers/dax/device-dax.h | 25 ++ drivers/dax/device.c | 709 ++++++++++++++++++++++++++++++++++++ drivers/dax/pmem.c | 2 +- drivers/dax/super.c | 303 ++++++++++++++++ include/linux/dax.h | 3 + tools/testing/nvdimm/Kbuild | 10 +- 11 files changed, 1070 insertions(+), 883 deletions(-) delete mode 100644 drivers/dax/dax.c create mode 100644 drivers/dax/device-dax.h create mode 100644 drivers/dax/device.c create mode 100644 drivers/dax/super.c (limited to 'include/linux') diff --git a/drivers/Makefile b/drivers/Makefile index 2eced9afba53..0442e982cf35 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -71,7 +71,7 @@ obj-$(CONFIG_PARPORT) += parport/ obj-$(CONFIG_NVM) += lightnvm/ obj-y += base/ block/ misc/ mfd/ nfc/ obj-$(CONFIG_LIBNVDIMM) += nvdimm/ -obj-$(CONFIG_DEV_DAX) += dax/ +obj-$(CONFIG_DAX) += dax/ obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/ obj-$(CONFIG_NUBUS) += nubus/ obj-y += macintosh/ diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index 9e95bf94eb13..b7053eafd88e 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -1,8 +1,13 @@ -menuconfig DEV_DAX +menuconfig DAX tristate "DAX: direct access to differentiated memory" + select SRCU default m if NVDIMM_DAX + +if DAX + +config DEV_DAX + tristate "Device DAX: direct access mapping device" depends on TRANSPARENT_HUGEPAGE - select SRCU help Support raw access to differentiated (persistence, bandwidth, latency...) memory via an mmap(2) capable character @@ -11,7 +16,6 @@ menuconfig DEV_DAX baseline memory pool. Mappings of a /dev/daxX.Y device impose restrictions that make the mapping behavior deterministic. -if DEV_DAX config DEV_DAX_PMEM tristate "PMEM DAX: direct access to persistent memory" diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile index 27c54e38478a..dc7422530462 100644 --- a/drivers/dax/Makefile +++ b/drivers/dax/Makefile @@ -1,4 +1,7 @@ -obj-$(CONFIG_DEV_DAX) += dax.o +obj-$(CONFIG_DAX) += dax.o +obj-$(CONFIG_DEV_DAX) += device_dax.o obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o +dax-y := super.o dax_pmem-y := pmem.o +device_dax-y := device.o diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c deleted file mode 100644 index 376fdd353aea..000000000000 --- a/drivers/dax/dax.c +++ /dev/null @@ -1,864 +0,0 @@ -/* - * Copyright(c) 2016 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "dax.h" - -static dev_t dax_devt; -DEFINE_STATIC_SRCU(dax_srcu); -static struct class *dax_class; -static DEFINE_IDA(dax_minor_ida); -static int nr_dax = CONFIG_NR_DEV_DAX; -module_param(nr_dax, int, S_IRUGO); -static struct vfsmount *dax_mnt; -static struct kmem_cache *dax_cache __read_mostly; -static struct super_block *dax_superblock __read_mostly; -MODULE_PARM_DESC(nr_dax, "max number of device-dax instances"); - -/** - * struct dax_region - mapping infrastructure for dax devices - * @id: kernel-wide unique region for a memory range - * @base: linear address corresponding to @res - * @kref: to pin while other agents have a need to do lookups - * @dev: parent device backing this region - * @align: allocation and mapping alignment for child dax devices - * @res: physical address range of the region - * @pfn_flags: identify whether the pfns are paged back or not - */ -struct dax_region { - int id; - struct ida ida; - void *base; - struct kref kref; - struct device *dev; - unsigned int align; - struct resource res; - unsigned long pfn_flags; -}; - -/** - * struct dev_dax - instance data for a subdivision of a dax region - * @region - parent region - * @dev - device backing the character device - * @cdev - core chardev data - * @alive - !alive + srcu grace period == no new mappings can be established - * @id - child id in the region - * @num_resources - number of physical address extents in this device - * @res - array of physical address ranges - */ -struct dev_dax { - struct dax_region *region; - struct inode *inode; - struct device dev; - struct cdev cdev; - bool alive; - int id; - int num_resources; - struct resource res[0]; -}; - -static ssize_t id_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dax_region *dax_region; - ssize_t rc = -ENXIO; - - device_lock(dev); - dax_region = dev_get_drvdata(dev); - if (dax_region) - rc = sprintf(buf, "%d\n", dax_region->id); - device_unlock(dev); - - return rc; -} -static DEVICE_ATTR_RO(id); - -static ssize_t region_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dax_region *dax_region; - ssize_t rc = -ENXIO; - - device_lock(dev); - dax_region = dev_get_drvdata(dev); - if (dax_region) - rc = sprintf(buf, "%llu\n", (unsigned long long) - resource_size(&dax_region->res)); - device_unlock(dev); - - return rc; -} -static struct device_attribute dev_attr_region_size = __ATTR(size, 0444, - region_size_show, NULL); - -static ssize_t align_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dax_region *dax_region; - ssize_t rc = -ENXIO; - - device_lock(dev); - dax_region = dev_get_drvdata(dev); - if (dax_region) - rc = sprintf(buf, "%u\n", dax_region->align); - device_unlock(dev); - - return rc; -} -static DEVICE_ATTR_RO(align); - -static struct attribute *dax_region_attributes[] = { - &dev_attr_region_size.attr, - &dev_attr_align.attr, - &dev_attr_id.attr, - NULL, -}; - -static const struct attribute_group dax_region_attribute_group = { - .name = "dax_region", - .attrs = dax_region_attributes, -}; - -static const struct attribute_group *dax_region_attribute_groups[] = { - &dax_region_attribute_group, - NULL, -}; - -static struct inode *dax_alloc_inode(struct super_block *sb) -{ - return kmem_cache_alloc(dax_cache, GFP_KERNEL); -} - -static void dax_i_callback(struct 
rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - - kmem_cache_free(dax_cache, inode); -} - -static void dax_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, dax_i_callback); -} - -static const struct super_operations dax_sops = { - .statfs = simple_statfs, - .alloc_inode = dax_alloc_inode, - .destroy_inode = dax_destroy_inode, - .drop_inode = generic_delete_inode, -}; - -static struct dentry *dax_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC); -} - -static struct file_system_type dax_type = { - .name = "dax", - .mount = dax_mount, - .kill_sb = kill_anon_super, -}; - -static int dax_test(struct inode *inode, void *data) -{ - return inode->i_cdev == data; -} - -static int dax_set(struct inode *inode, void *data) -{ - inode->i_cdev = data; - return 0; -} - -static struct inode *dax_inode_get(struct cdev *cdev, dev_t devt) -{ - struct inode *inode; - - inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), - dax_test, dax_set, cdev); - - if (!inode) - return NULL; - - if (inode->i_state & I_NEW) { - inode->i_mode = S_IFCHR; - inode->i_flags = S_DAX; - inode->i_rdev = devt; - mapping_set_gfp_mask(&inode->i_data, GFP_USER); - unlock_new_inode(inode); - } - return inode; -} - -static void init_once(void *inode) -{ - inode_init_once(inode); -} - -static int dax_inode_init(void) -{ - int rc; - - dax_cache = kmem_cache_create("dax_cache", sizeof(struct inode), 0, - (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), - init_once); - if (!dax_cache) - return -ENOMEM; - - rc = register_filesystem(&dax_type); - if (rc) - goto err_register_fs; - - dax_mnt = kern_mount(&dax_type); - if (IS_ERR(dax_mnt)) { - rc = PTR_ERR(dax_mnt); - goto err_mount; - } - dax_superblock = dax_mnt->mnt_sb; - - return 0; - - err_mount: - unregister_filesystem(&dax_type); - err_register_fs: - kmem_cache_destroy(dax_cache); - - return rc; -} - -static void dax_inode_exit(void) -{ - kern_unmount(dax_mnt); - unregister_filesystem(&dax_type); - kmem_cache_destroy(dax_cache); -} - -static void dax_region_free(struct kref *kref) -{ - struct dax_region *dax_region; - - dax_region = container_of(kref, struct dax_region, kref); - kfree(dax_region); -} - -void dax_region_put(struct dax_region *dax_region) -{ - kref_put(&dax_region->kref, dax_region_free); -} -EXPORT_SYMBOL_GPL(dax_region_put); - -static void dax_region_unregister(void *region) -{ - struct dax_region *dax_region = region; - - sysfs_remove_groups(&dax_region->dev->kobj, - dax_region_attribute_groups); - dax_region_put(dax_region); -} - -struct dax_region *alloc_dax_region(struct device *parent, int region_id, - struct resource *res, unsigned int align, void *addr, - unsigned long pfn_flags) -{ - struct dax_region *dax_region; - - /* - * The DAX core assumes that it can store its private data in - * parent->driver_data. This WARN is a reminder / safeguard for - * developers of device-dax drivers. 
- */ - if (dev_get_drvdata(parent)) { - dev_WARN(parent, "dax core failed to setup private data\n"); - return NULL; - } - - if (!IS_ALIGNED(res->start, align) - || !IS_ALIGNED(resource_size(res), align)) - return NULL; - - dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); - if (!dax_region) - return NULL; - - dev_set_drvdata(parent, dax_region); - memcpy(&dax_region->res, res, sizeof(*res)); - dax_region->pfn_flags = pfn_flags; - kref_init(&dax_region->kref); - dax_region->id = region_id; - ida_init(&dax_region->ida); - dax_region->align = align; - dax_region->dev = parent; - dax_region->base = addr; - if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { - kfree(dax_region); - return NULL;; - } - - kref_get(&dax_region->kref); - if (devm_add_action_or_reset(parent, dax_region_unregister, dax_region)) - return NULL; - return dax_region; -} -EXPORT_SYMBOL_GPL(alloc_dax_region); - -static struct dev_dax *to_dev_dax(struct device *dev) -{ - return container_of(dev, struct dev_dax, dev); -} - -static ssize_t size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct dev_dax *dev_dax = to_dev_dax(dev); - unsigned long long size = 0; - int i; - - for (i = 0; i < dev_dax->num_resources; i++) - size += resource_size(&dev_dax->res[i]); - - return sprintf(buf, "%llu\n", size); -} -static DEVICE_ATTR_RO(size); - -static struct attribute *dev_dax_attributes[] = { - &dev_attr_size.attr, - NULL, -}; - -static const struct attribute_group dev_dax_attribute_group = { - .attrs = dev_dax_attributes, -}; - -static const struct attribute_group *dax_attribute_groups[] = { - &dev_dax_attribute_group, - NULL, -}; - -static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, - const char *func) -{ - struct dax_region *dax_region = dev_dax->region; - struct device *dev = &dev_dax->dev; - unsigned long mask; - - if (!dev_dax->alive) - return -ENXIO; - - /* prevent private mappings from being established */ - if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) { - dev_info(dev, "%s: %s: fail, attempted private mapping\n", - current->comm, func); - return -EINVAL; - } - - mask = dax_region->align - 1; - if (vma->vm_start & mask || vma->vm_end & mask) { - dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n", - current->comm, func, vma->vm_start, vma->vm_end, - mask); - return -EINVAL; - } - - if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV - && (vma->vm_flags & VM_DONTCOPY) == 0) { - dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n", - current->comm, func); - return -EINVAL; - } - - if (!vma_is_dax(vma)) { - dev_info(dev, "%s: %s: fail, vma is not DAX capable\n", - current->comm, func); - return -EINVAL; - } - - return 0; -} - -static phys_addr_t pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, - unsigned long size) -{ - struct resource *res; - phys_addr_t phys; - int i; - - for (i = 0; i < dev_dax->num_resources; i++) { - res = &dev_dax->res[i]; - phys = pgoff * PAGE_SIZE + res->start; - if (phys >= res->start && phys <= res->end) - break; - pgoff -= PHYS_PFN(resource_size(res)); - } - - if (i < dev_dax->num_resources) { - res = &dev_dax->res[i]; - if (phys + size - 1 <= res->end) - return phys; - } - - return -1; -} - -static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) -{ - struct device *dev = &dev_dax->dev; - struct dax_region *dax_region; - int rc = VM_FAULT_SIGBUS; - phys_addr_t phys; - pfn_t pfn; - unsigned int fault_size = PAGE_SIZE; - - if (check_vma(dev_dax, vmf->vma, 
__func__)) - return VM_FAULT_SIGBUS; - - dax_region = dev_dax->region; - if (dax_region->align > PAGE_SIZE) { - dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", - __func__, dax_region->align, fault_size); - return VM_FAULT_SIGBUS; - } - - if (fault_size != dax_region->align) - return VM_FAULT_SIGBUS; - - phys = pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE); - if (phys == -1) { - dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, - vmf->pgoff); - return VM_FAULT_SIGBUS; - } - - pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); - - rc = vm_insert_mixed(vmf->vma, vmf->address, pfn); - - if (rc == -ENOMEM) - return VM_FAULT_OOM; - if (rc < 0 && rc != -EBUSY) - return VM_FAULT_SIGBUS; - - return VM_FAULT_NOPAGE; -} - -static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) -{ - unsigned long pmd_addr = vmf->address & PMD_MASK; - struct device *dev = &dev_dax->dev; - struct dax_region *dax_region; - phys_addr_t phys; - pgoff_t pgoff; - pfn_t pfn; - unsigned int fault_size = PMD_SIZE; - - if (check_vma(dev_dax, vmf->vma, __func__)) - return VM_FAULT_SIGBUS; - - dax_region = dev_dax->region; - if (dax_region->align > PMD_SIZE) { - dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", - __func__, dax_region->align, fault_size); - return VM_FAULT_SIGBUS; - } - - /* dax pmd mappings require pfn_t_devmap() */ - if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { - dev_dbg(dev, "%s: region lacks devmap flags\n", __func__); - return VM_FAULT_SIGBUS; - } - - if (fault_size < dax_region->align) - return VM_FAULT_SIGBUS; - else if (fault_size > dax_region->align) - return VM_FAULT_FALLBACK; - - /* if we are outside of the VMA */ - if (pmd_addr < vmf->vma->vm_start || - (pmd_addr + PMD_SIZE) > vmf->vma->vm_end) - return VM_FAULT_SIGBUS; - - pgoff = linear_page_index(vmf->vma, pmd_addr); - phys = pgoff_to_phys(dev_dax, pgoff, PMD_SIZE); - if (phys == -1) { - dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, - pgoff); - return VM_FAULT_SIGBUS; - } - - pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); - - return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, - vmf->flags & FAULT_FLAG_WRITE); -} - -#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) -{ - unsigned long pud_addr = vmf->address & PUD_MASK; - struct device *dev = &dev_dax->dev; - struct dax_region *dax_region; - phys_addr_t phys; - pgoff_t pgoff; - pfn_t pfn; - unsigned int fault_size = PUD_SIZE; - - - if (check_vma(dev_dax, vmf->vma, __func__)) - return VM_FAULT_SIGBUS; - - dax_region = dev_dax->region; - if (dax_region->align > PUD_SIZE) { - dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", - __func__, dax_region->align, fault_size); - return VM_FAULT_SIGBUS; - } - - /* dax pud mappings require pfn_t_devmap() */ - if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { - dev_dbg(dev, "%s: region lacks devmap flags\n", __func__); - return VM_FAULT_SIGBUS; - } - - if (fault_size < dax_region->align) - return VM_FAULT_SIGBUS; - else if (fault_size > dax_region->align) - return VM_FAULT_FALLBACK; - - /* if we are outside of the VMA */ - if (pud_addr < vmf->vma->vm_start || - (pud_addr + PUD_SIZE) > vmf->vma->vm_end) - return VM_FAULT_SIGBUS; - - pgoff = linear_page_index(vmf->vma, pud_addr); - phys = pgoff_to_phys(dev_dax, pgoff, PUD_SIZE); - if (phys == -1) { - dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, - pgoff); - return VM_FAULT_SIGBUS; - } - - pfn = 
phys_to_pfn_t(phys, dax_region->pfn_flags); - - return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, pfn, - vmf->flags & FAULT_FLAG_WRITE); -} -#else -static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) -{ - return VM_FAULT_FALLBACK; -} -#endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ - -static int dev_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) -{ - int rc, id; - struct file *filp = vmf->vma->vm_file; - struct dev_dax *dev_dax = filp->private_data; - - dev_dbg(&dev_dax->dev, "%s: %s: %s (%#lx - %#lx) size = %d\n", __func__, - current->comm, (vmf->flags & FAULT_FLAG_WRITE) - ? "write" : "read", - vmf->vma->vm_start, vmf->vma->vm_end, pe_size); - - id = srcu_read_lock(&dax_srcu); - switch (pe_size) { - case PE_SIZE_PTE: - rc = __dev_dax_pte_fault(dev_dax, vmf); - break; - case PE_SIZE_PMD: - rc = __dev_dax_pmd_fault(dev_dax, vmf); - break; - case PE_SIZE_PUD: - rc = __dev_dax_pud_fault(dev_dax, vmf); - break; - default: - rc = VM_FAULT_SIGBUS; - } - srcu_read_unlock(&dax_srcu, id); - - return rc; -} - -static int dev_dax_fault(struct vm_fault *vmf) -{ - return dev_dax_huge_fault(vmf, PE_SIZE_PTE); -} - -static const struct vm_operations_struct dax_vm_ops = { - .fault = dev_dax_fault, - .huge_fault = dev_dax_huge_fault, -}; - -static int dax_mmap(struct file *filp, struct vm_area_struct *vma) -{ - struct dev_dax *dev_dax = filp->private_data; - int rc; - - dev_dbg(&dev_dax->dev, "%s\n", __func__); - - rc = check_vma(dev_dax, vma, __func__); - if (rc) - return rc; - - vma->vm_ops = &dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; - return 0; -} - -/* return an unmapped area aligned to the dax region specified alignment */ -static unsigned long dax_get_unmapped_area(struct file *filp, - unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - unsigned long off, off_end, off_align, len_align, addr_align, align; - struct dev_dax *dev_dax = filp ? 
filp->private_data : NULL; - struct dax_region *dax_region; - - if (!dev_dax || addr) - goto out; - - dax_region = dev_dax->region; - align = dax_region->align; - off = pgoff << PAGE_SHIFT; - off_end = off + len; - off_align = round_up(off, align); - - if ((off_end <= off_align) || ((off_end - off_align) < align)) - goto out; - - len_align = len + align; - if ((off + len_align) < off) - goto out; - - addr_align = current->mm->get_unmapped_area(filp, addr, len_align, - pgoff, flags); - if (!IS_ERR_VALUE(addr_align)) { - addr_align += (off - addr_align) & (align - 1); - return addr_align; - } - out: - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); -} - -static int dax_open(struct inode *inode, struct file *filp) -{ - struct dev_dax *dev_dax; - - dev_dax = container_of(inode->i_cdev, struct dev_dax, cdev); - dev_dbg(&dev_dax->dev, "%s\n", __func__); - inode->i_mapping = dev_dax->inode->i_mapping; - inode->i_mapping->host = dev_dax->inode; - filp->f_mapping = inode->i_mapping; - filp->private_data = dev_dax; - inode->i_flags = S_DAX; - - return 0; -} - -static int dax_release(struct inode *inode, struct file *filp) -{ - struct dev_dax *dev_dax = filp->private_data; - - dev_dbg(&dev_dax->dev, "%s\n", __func__); - return 0; -} - -static const struct file_operations dax_fops = { - .llseek = noop_llseek, - .owner = THIS_MODULE, - .open = dax_open, - .release = dax_release, - .get_unmapped_area = dax_get_unmapped_area, - .mmap = dax_mmap, -}; - -static void dev_dax_release(struct device *dev) -{ - struct dev_dax *dev_dax = to_dev_dax(dev); - struct dax_region *dax_region = dev_dax->region; - - ida_simple_remove(&dax_region->ida, dev_dax->id); - ida_simple_remove(&dax_minor_ida, MINOR(dev->devt)); - dax_region_put(dax_region); - iput(dev_dax->inode); - kfree(dev_dax); -} - -static void kill_dev_dax(struct dev_dax *dev_dax) -{ - /* - * Note, rcu is not protecting the liveness of dev_dax, rcu is - * ensuring that any fault handlers that might have seen - * dev_dax->alive == true, have completed. Any fault handlers - * that start after synchronize_srcu() has started will abort - * upon seeing dev_dax->alive == false. 
- */ - dev_dax->alive = false; - synchronize_srcu(&dax_srcu); - unmap_mapping_range(dev_dax->inode->i_mapping, 0, 0, 1); -} - -static void unregister_dev_dax(void *dev) -{ - struct dev_dax *dev_dax = to_dev_dax(dev); - - dev_dbg(dev, "%s\n", __func__); - - kill_dev_dax(dev_dax); - cdev_device_del(&dev_dax->cdev, dev); - put_device(dev); -} - -struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, - struct resource *res, int count) -{ - struct device *parent = dax_region->dev; - struct dev_dax *dev_dax; - int rc = 0, minor, i; - struct device *dev; - struct cdev *cdev; - dev_t dev_t; - - dev_dax = kzalloc(sizeof(*dev_dax) + sizeof(*res) * count, GFP_KERNEL); - if (!dev_dax) - return ERR_PTR(-ENOMEM); - - for (i = 0; i < count; i++) { - if (!IS_ALIGNED(res[i].start, dax_region->align) - || !IS_ALIGNED(resource_size(&res[i]), - dax_region->align)) { - rc = -EINVAL; - break; - } - dev_dax->res[i].start = res[i].start; - dev_dax->res[i].end = res[i].end; - } - - if (i < count) - goto err_id; - - dev_dax->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); - if (dev_dax->id < 0) { - rc = dev_dax->id; - goto err_id; - } - - minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL); - if (minor < 0) { - rc = minor; - goto err_minor; - } - - dev_t = MKDEV(MAJOR(dax_devt), minor); - dev = &dev_dax->dev; - dev_dax->inode = dax_inode_get(&dev_dax->cdev, dev_t); - if (!dev_dax->inode) { - rc = -ENOMEM; - goto err_inode; - } - - /* from here on we're committed to teardown via dev_dax_release() */ - device_initialize(dev); - - cdev = &dev_dax->cdev; - cdev_init(cdev, &dax_fops); - cdev->owner = parent->driver->owner; - - dev_dax->num_resources = count; - dev_dax->alive = true; - dev_dax->region = dax_region; - kref_get(&dax_region->kref); - - dev->devt = dev_t; - dev->class = dax_class; - dev->parent = parent; - dev->groups = dax_attribute_groups; - dev->release = dev_dax_release; - dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id); - - rc = cdev_device_add(cdev, dev); - if (rc) { - kill_dev_dax(dev_dax); - put_device(dev); - return ERR_PTR(rc); - } - - rc = devm_add_action_or_reset(dax_region->dev, unregister_dev_dax, dev); - if (rc) - return ERR_PTR(rc); - - return dev_dax; - - err_inode: - ida_simple_remove(&dax_minor_ida, minor); - err_minor: - ida_simple_remove(&dax_region->ida, dev_dax->id); - err_id: - kfree(dev_dax); - - return ERR_PTR(rc); -} -EXPORT_SYMBOL_GPL(devm_create_dev_dax); - -static int __init dax_init(void) -{ - int rc; - - rc = dax_inode_init(); - if (rc) - return rc; - - nr_dax = max(nr_dax, 256); - rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax"); - if (rc) - goto err_chrdev; - - dax_class = class_create(THIS_MODULE, "dax"); - if (IS_ERR(dax_class)) { - rc = PTR_ERR(dax_class); - goto err_class; - } - - return 0; - - err_class: - unregister_chrdev_region(dax_devt, nr_dax); - err_chrdev: - dax_inode_exit(); - return rc; -} - -static void __exit dax_exit(void) -{ - class_destroy(dax_class); - unregister_chrdev_region(dax_devt, nr_dax); - ida_destroy(&dax_minor_ida); - dax_inode_exit(); -} - -MODULE_AUTHOR("Intel Corporation"); -MODULE_LICENSE("GPL v2"); -subsys_initcall(dax_init); -module_exit(dax_exit); diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h index ea176d875d60..2472d9da96db 100644 --- a/drivers/dax/dax.h +++ b/drivers/dax/dax.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. All rights reserved. + * Copyright(c) 2016 - 2017 Intel Corporation. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -12,14 +12,12 @@ */ #ifndef __DAX_H__ #define __DAX_H__ -struct device; -struct dev_dax; -struct resource; -struct dax_region; -void dax_region_put(struct dax_region *dax_region); -struct dax_region *alloc_dax_region(struct device *parent, - int region_id, struct resource *res, unsigned int align, - void *addr, unsigned long flags); -struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, - struct resource *res, int count); +struct dax_device; +struct dax_device *alloc_dax(void *private); +void put_dax(struct dax_device *dax_dev); +bool dax_alive(struct dax_device *dax_dev); +void kill_dax(struct dax_device *dax_dev); +struct dax_device *inode_dax(struct inode *inode); +struct inode *dax_inode(struct dax_device *dax_dev); +void *dax_get_private(struct dax_device *dax_dev); #endif /* __DAX_H__ */ diff --git a/drivers/dax/device-dax.h b/drivers/dax/device-dax.h new file mode 100644 index 000000000000..fdcd9769ffde --- /dev/null +++ b/drivers/dax/device-dax.h @@ -0,0 +1,25 @@ +/* + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __DEVICE_DAX_H__ +#define __DEVICE_DAX_H__ +struct device; +struct dev_dax; +struct resource; +struct dax_region; +void dax_region_put(struct dax_region *dax_region); +struct dax_region *alloc_dax_region(struct device *parent, + int region_id, struct resource *res, unsigned int align, + void *addr, unsigned long flags); +struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, + struct resource *res, int count); +#endif /* __DEVICE_DAX_H__ */ diff --git a/drivers/dax/device.c b/drivers/dax/device.c new file mode 100644 index 000000000000..19a42edbfa03 --- /dev/null +++ b/drivers/dax/device.c @@ -0,0 +1,709 @@ +/* + * Copyright(c) 2016 - 2017 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dax.h" + +static struct class *dax_class; + +/** + * struct dax_region - mapping infrastructure for dax devices + * @id: kernel-wide unique region for a memory range + * @base: linear address corresponding to @res + * @kref: to pin while other agents have a need to do lookups + * @dev: parent device backing this region + * @align: allocation and mapping alignment for child dax devices + * @res: physical address range of the region + * @pfn_flags: identify whether the pfns are paged back or not + */ +struct dax_region { + int id; + struct ida ida; + void *base; + struct kref kref; + struct device *dev; + unsigned int align; + struct resource res; + unsigned long pfn_flags; +}; + +/** + * struct dev_dax - instance data for a subdivision of a dax region + * @region - parent region + * @dax_dev - core dax functionality + * @dev - device core + * @id - child id in the region + * @num_resources - number of physical address extents in this device + * @res - array of physical address ranges + */ +struct dev_dax { + struct dax_region *region; + struct dax_device *dax_dev; + struct device dev; + int id; + int num_resources; + struct resource res[0]; +}; + +static ssize_t id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region; + ssize_t rc = -ENXIO; + + device_lock(dev); + dax_region = dev_get_drvdata(dev); + if (dax_region) + rc = sprintf(buf, "%d\n", dax_region->id); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(id); + +static ssize_t region_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region; + ssize_t rc = -ENXIO; + + device_lock(dev); + dax_region = dev_get_drvdata(dev); + if (dax_region) + rc = sprintf(buf, "%llu\n", (unsigned long long) + resource_size(&dax_region->res)); + device_unlock(dev); + + return rc; +} +static struct device_attribute dev_attr_region_size = __ATTR(size, 0444, + region_size_show, NULL); + +static ssize_t align_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dax_region *dax_region; + ssize_t rc = -ENXIO; + + device_lock(dev); + dax_region = dev_get_drvdata(dev); + if (dax_region) + rc = sprintf(buf, "%u\n", dax_region->align); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(align); + +static struct attribute *dax_region_attributes[] = { + &dev_attr_region_size.attr, + &dev_attr_align.attr, + &dev_attr_id.attr, + NULL, +}; + +static const struct attribute_group dax_region_attribute_group = { + .name = "dax_region", + .attrs = dax_region_attributes, +}; + +static const struct attribute_group *dax_region_attribute_groups[] = { + &dax_region_attribute_group, + NULL, +}; + +static void dax_region_free(struct kref *kref) +{ + struct dax_region *dax_region; + + dax_region = container_of(kref, struct dax_region, kref); + kfree(dax_region); +} + +void dax_region_put(struct dax_region *dax_region) +{ + kref_put(&dax_region->kref, dax_region_free); +} +EXPORT_SYMBOL_GPL(dax_region_put); + +static void dax_region_unregister(void *region) +{ + struct dax_region *dax_region = region; + + sysfs_remove_groups(&dax_region->dev->kobj, + dax_region_attribute_groups); + dax_region_put(dax_region); +} + +struct dax_region *alloc_dax_region(struct device *parent, int region_id, + struct resource *res, unsigned int align, void *addr, + unsigned long pfn_flags) +{ + struct dax_region *dax_region; + + /* + * The DAX core 
assumes that it can store its private data in + * parent->driver_data. This WARN is a reminder / safeguard for + * developers of device-dax drivers. + */ + if (dev_get_drvdata(parent)) { + dev_WARN(parent, "dax core failed to setup private data\n"); + return NULL; + } + + if (!IS_ALIGNED(res->start, align) + || !IS_ALIGNED(resource_size(res), align)) + return NULL; + + dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL); + if (!dax_region) + return NULL; + + dev_set_drvdata(parent, dax_region); + memcpy(&dax_region->res, res, sizeof(*res)); + dax_region->pfn_flags = pfn_flags; + kref_init(&dax_region->kref); + dax_region->id = region_id; + ida_init(&dax_region->ida); + dax_region->align = align; + dax_region->dev = parent; + dax_region->base = addr; + if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) { + kfree(dax_region); + return NULL;; + } + + kref_get(&dax_region->kref); + if (devm_add_action_or_reset(parent, dax_region_unregister, dax_region)) + return NULL; + return dax_region; +} +EXPORT_SYMBOL_GPL(alloc_dax_region); + +static struct dev_dax *to_dev_dax(struct device *dev) +{ + return container_of(dev, struct dev_dax, dev); +} + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + unsigned long long size = 0; + int i; + + for (i = 0; i < dev_dax->num_resources; i++) + size += resource_size(&dev_dax->res[i]); + + return sprintf(buf, "%llu\n", size); +} +static DEVICE_ATTR_RO(size); + +static struct attribute *dev_dax_attributes[] = { + &dev_attr_size.attr, + NULL, +}; + +static const struct attribute_group dev_dax_attribute_group = { + .attrs = dev_dax_attributes, +}; + +static const struct attribute_group *dax_attribute_groups[] = { + &dev_dax_attribute_group, + NULL, +}; + +static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, + const char *func) +{ + struct dax_region *dax_region = dev_dax->region; + struct device *dev = &dev_dax->dev; + unsigned long mask; + + if (!dax_alive(dev_dax->dax_dev)) + return -ENXIO; + + /* prevent private mappings from being established */ + if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) { + dev_info(dev, "%s: %s: fail, attempted private mapping\n", + current->comm, func); + return -EINVAL; + } + + mask = dax_region->align - 1; + if (vma->vm_start & mask || vma->vm_end & mask) { + dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n", + current->comm, func, vma->vm_start, vma->vm_end, + mask); + return -EINVAL; + } + + if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV + && (vma->vm_flags & VM_DONTCOPY) == 0) { + dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n", + current->comm, func); + return -EINVAL; + } + + if (!vma_is_dax(vma)) { + dev_info(dev, "%s: %s: fail, vma is not DAX capable\n", + current->comm, func); + return -EINVAL; + } + + return 0; +} + +static phys_addr_t pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff, + unsigned long size) +{ + struct resource *res; + phys_addr_t phys; + int i; + + for (i = 0; i < dev_dax->num_resources; i++) { + res = &dev_dax->res[i]; + phys = pgoff * PAGE_SIZE + res->start; + if (phys >= res->start && phys <= res->end) + break; + pgoff -= PHYS_PFN(resource_size(res)); + } + + if (i < dev_dax->num_resources) { + res = &dev_dax->res[i]; + if (phys + size - 1 <= res->end) + return phys; + } + + return -1; +} + +static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) +{ + struct device *dev = &dev_dax->dev; + struct 
dax_region *dax_region; + int rc = VM_FAULT_SIGBUS; + phys_addr_t phys; + pfn_t pfn; + unsigned int fault_size = PAGE_SIZE; + + if (check_vma(dev_dax, vmf->vma, __func__)) + return VM_FAULT_SIGBUS; + + dax_region = dev_dax->region; + if (dax_region->align > PAGE_SIZE) { + dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", + __func__, dax_region->align, fault_size); + return VM_FAULT_SIGBUS; + } + + if (fault_size != dax_region->align) + return VM_FAULT_SIGBUS; + + phys = pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE); + if (phys == -1) { + dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, + vmf->pgoff); + return VM_FAULT_SIGBUS; + } + + pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); + + rc = vm_insert_mixed(vmf->vma, vmf->address, pfn); + + if (rc == -ENOMEM) + return VM_FAULT_OOM; + if (rc < 0 && rc != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; +} + +static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) +{ + unsigned long pmd_addr = vmf->address & PMD_MASK; + struct device *dev = &dev_dax->dev; + struct dax_region *dax_region; + phys_addr_t phys; + pgoff_t pgoff; + pfn_t pfn; + unsigned int fault_size = PMD_SIZE; + + if (check_vma(dev_dax, vmf->vma, __func__)) + return VM_FAULT_SIGBUS; + + dax_region = dev_dax->region; + if (dax_region->align > PMD_SIZE) { + dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", + __func__, dax_region->align, fault_size); + return VM_FAULT_SIGBUS; + } + + /* dax pmd mappings require pfn_t_devmap() */ + if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { + dev_dbg(dev, "%s: region lacks devmap flags\n", __func__); + return VM_FAULT_SIGBUS; + } + + if (fault_size < dax_region->align) + return VM_FAULT_SIGBUS; + else if (fault_size > dax_region->align) + return VM_FAULT_FALLBACK; + + /* if we are outside of the VMA */ + if (pmd_addr < vmf->vma->vm_start || + (pmd_addr + PMD_SIZE) > vmf->vma->vm_end) + return VM_FAULT_SIGBUS; + + pgoff = linear_page_index(vmf->vma, pmd_addr); + phys = pgoff_to_phys(dev_dax, pgoff, PMD_SIZE); + if (phys == -1) { + dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, + pgoff); + return VM_FAULT_SIGBUS; + } + + pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); + + return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, + vmf->flags & FAULT_FLAG_WRITE); +} + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) +{ + unsigned long pud_addr = vmf->address & PUD_MASK; + struct device *dev = &dev_dax->dev; + struct dax_region *dax_region; + phys_addr_t phys; + pgoff_t pgoff; + pfn_t pfn; + unsigned int fault_size = PUD_SIZE; + + + if (check_vma(dev_dax, vmf->vma, __func__)) + return VM_FAULT_SIGBUS; + + dax_region = dev_dax->region; + if (dax_region->align > PUD_SIZE) { + dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n", + __func__, dax_region->align, fault_size); + return VM_FAULT_SIGBUS; + } + + /* dax pud mappings require pfn_t_devmap() */ + if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { + dev_dbg(dev, "%s: region lacks devmap flags\n", __func__); + return VM_FAULT_SIGBUS; + } + + if (fault_size < dax_region->align) + return VM_FAULT_SIGBUS; + else if (fault_size > dax_region->align) + return VM_FAULT_FALLBACK; + + /* if we are outside of the VMA */ + if (pud_addr < vmf->vma->vm_start || + (pud_addr + PUD_SIZE) > vmf->vma->vm_end) + return VM_FAULT_SIGBUS; + + pgoff = linear_page_index(vmf->vma, pud_addr); + phys = 
pgoff_to_phys(dev_dax, pgoff, PUD_SIZE); + if (phys == -1) { + dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__, + pgoff); + return VM_FAULT_SIGBUS; + } + + pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); + + return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, pfn, + vmf->flags & FAULT_FLAG_WRITE); +} +#else +static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf) +{ + return VM_FAULT_FALLBACK; +} +#endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ + +static int dev_dax_huge_fault(struct vm_fault *vmf, + enum page_entry_size pe_size) +{ + int rc, id; + struct file *filp = vmf->vma->vm_file; + struct dev_dax *dev_dax = filp->private_data; + + dev_dbg(&dev_dax->dev, "%s: %s: %s (%#lx - %#lx) size = %d\n", __func__, + current->comm, (vmf->flags & FAULT_FLAG_WRITE) + ? "write" : "read", + vmf->vma->vm_start, vmf->vma->vm_end, pe_size); + + id = dax_read_lock(); + switch (pe_size) { + case PE_SIZE_PTE: + rc = __dev_dax_pte_fault(dev_dax, vmf); + break; + case PE_SIZE_PMD: + rc = __dev_dax_pmd_fault(dev_dax, vmf); + break; + case PE_SIZE_PUD: + rc = __dev_dax_pud_fault(dev_dax, vmf); + break; + default: + rc = VM_FAULT_SIGBUS; + } + dax_read_unlock(id); + + return rc; +} + +static int dev_dax_fault(struct vm_fault *vmf) +{ + return dev_dax_huge_fault(vmf, PE_SIZE_PTE); +} + +static const struct vm_operations_struct dax_vm_ops = { + .fault = dev_dax_fault, + .huge_fault = dev_dax_huge_fault, +}; + +static int dax_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct dev_dax *dev_dax = filp->private_data; + int rc, id; + + dev_dbg(&dev_dax->dev, "%s\n", __func__); + + /* + * We lock to check dax_dev liveness and will re-check at + * fault time. + */ + id = dax_read_lock(); + rc = check_vma(dev_dax, vma, __func__); + dax_read_unlock(id); + if (rc) + return rc; + + vma->vm_ops = &dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + return 0; +} + +/* return an unmapped area aligned to the dax region specified alignment */ +static unsigned long dax_get_unmapped_area(struct file *filp, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + unsigned long off, off_end, off_align, len_align, addr_align, align; + struct dev_dax *dev_dax = filp ? 
filp->private_data : NULL; + struct dax_region *dax_region; + + if (!dev_dax || addr) + goto out; + + dax_region = dev_dax->region; + align = dax_region->align; + off = pgoff << PAGE_SHIFT; + off_end = off + len; + off_align = round_up(off, align); + + if ((off_end <= off_align) || ((off_end - off_align) < align)) + goto out; + + len_align = len + align; + if ((off + len_align) < off) + goto out; + + addr_align = current->mm->get_unmapped_area(filp, addr, len_align, + pgoff, flags); + if (!IS_ERR_VALUE(addr_align)) { + addr_align += (off - addr_align) & (align - 1); + return addr_align; + } + out: + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); +} + +static int dax_open(struct inode *inode, struct file *filp) +{ + struct dax_device *dax_dev = inode_dax(inode); + struct inode *__dax_inode = dax_inode(dax_dev); + struct dev_dax *dev_dax = dax_get_private(dax_dev); + + dev_dbg(&dev_dax->dev, "%s\n", __func__); + inode->i_mapping = __dax_inode->i_mapping; + inode->i_mapping->host = __dax_inode; + filp->f_mapping = inode->i_mapping; + filp->private_data = dev_dax; + inode->i_flags = S_DAX; + + return 0; +} + +static int dax_release(struct inode *inode, struct file *filp) +{ + struct dev_dax *dev_dax = filp->private_data; + + dev_dbg(&dev_dax->dev, "%s\n", __func__); + return 0; +} + +static const struct file_operations dax_fops = { + .llseek = noop_llseek, + .owner = THIS_MODULE, + .open = dax_open, + .release = dax_release, + .get_unmapped_area = dax_get_unmapped_area, + .mmap = dax_mmap, +}; + +static void dev_dax_release(struct device *dev) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_region *dax_region = dev_dax->region; + struct dax_device *dax_dev = dev_dax->dax_dev; + + ida_simple_remove(&dax_region->ida, dev_dax->id); + dax_region_put(dax_region); + put_dax(dax_dev); + kfree(dev_dax); +} + +static void kill_dev_dax(struct dev_dax *dev_dax) +{ + struct dax_device *dax_dev = dev_dax->dax_dev; + struct inode *inode = dax_inode(dax_dev); + + kill_dax(dax_dev); + unmap_mapping_range(inode->i_mapping, 0, 0, 1); +} + +static void unregister_dev_dax(void *dev) +{ + struct dev_dax *dev_dax = to_dev_dax(dev); + struct dax_device *dax_dev = dev_dax->dax_dev; + struct inode *inode = dax_inode(dax_dev); + struct cdev *cdev = inode->i_cdev; + + dev_dbg(dev, "%s\n", __func__); + + kill_dev_dax(dev_dax); + cdev_device_del(cdev, dev); + put_device(dev); +} + +struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, + struct resource *res, int count) +{ + struct device *parent = dax_region->dev; + struct dax_device *dax_dev; + struct dev_dax *dev_dax; + struct inode *inode; + struct device *dev; + struct cdev *cdev; + int rc = 0, i; + + dev_dax = kzalloc(sizeof(*dev_dax) + sizeof(*res) * count, GFP_KERNEL); + if (!dev_dax) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < count; i++) { + if (!IS_ALIGNED(res[i].start, dax_region->align) + || !IS_ALIGNED(resource_size(&res[i]), + dax_region->align)) { + rc = -EINVAL; + break; + } + dev_dax->res[i].start = res[i].start; + dev_dax->res[i].end = res[i].end; + } + + if (i < count) + goto err_id; + + dev_dax->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL); + if (dev_dax->id < 0) { + rc = dev_dax->id; + goto err_id; + } + + dax_dev = alloc_dax(dev_dax); + if (!dax_dev) + goto err_dax; + + /* from here on we're committed to teardown via dax_dev_release() */ + dev = &dev_dax->dev; + device_initialize(dev); + + inode = dax_inode(dax_dev); + cdev = inode->i_cdev; + cdev_init(cdev, &dax_fops); + cdev->owner = 
parent->driver->owner; + + dev_dax->num_resources = count; + dev_dax->dax_dev = dax_dev; + dev_dax->region = dax_region; + kref_get(&dax_region->kref); + + dev->devt = inode->i_rdev; + dev->class = dax_class; + dev->parent = parent; + dev->groups = dax_attribute_groups; + dev->release = dev_dax_release; + dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id); + + rc = cdev_device_add(cdev, dev); + if (rc) { + kill_dev_dax(dev_dax); + put_device(dev); + return ERR_PTR(rc); + } + + rc = devm_add_action_or_reset(dax_region->dev, unregister_dev_dax, dev); + if (rc) + return ERR_PTR(rc); + + return dev_dax; + + err_dax: + ida_simple_remove(&dax_region->ida, dev_dax->id); + err_id: + kfree(dev_dax); + + return ERR_PTR(rc); +} +EXPORT_SYMBOL_GPL(devm_create_dev_dax); + +static int __init dax_init(void) +{ + dax_class = class_create(THIS_MODULE, "dax"); + return PTR_ERR_OR_ZERO(dax_class); +} + +static void __exit dax_exit(void) +{ + class_destroy(dax_class); +} + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL v2"); +subsys_initcall(dax_init); +module_exit(dax_exit); diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c index 2c736fc4508b..d4ca19bd74eb 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem.c @@ -16,7 +16,7 @@ #include #include "../nvdimm/pfn.h" #include "../nvdimm/nd.h" -#include "dax.h" +#include "device-dax.h" struct dax_pmem { struct device *dev; diff --git a/drivers/dax/super.c b/drivers/dax/super.c new file mode 100644 index 000000000000..c9f85f1c086e --- /dev/null +++ b/drivers/dax/super.c @@ -0,0 +1,303 @@ +/* + * Copyright(c) 2017 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +static int nr_dax = CONFIG_NR_DEV_DAX; +module_param(nr_dax, int, S_IRUGO); +MODULE_PARM_DESC(nr_dax, "max number of dax device instances"); + +static dev_t dax_devt; +DEFINE_STATIC_SRCU(dax_srcu); +static struct vfsmount *dax_mnt; +static DEFINE_IDA(dax_minor_ida); +static struct kmem_cache *dax_cache __read_mostly; +static struct super_block *dax_superblock __read_mostly; + +int dax_read_lock(void) +{ + return srcu_read_lock(&dax_srcu); +} +EXPORT_SYMBOL_GPL(dax_read_lock); + +void dax_read_unlock(int id) +{ + srcu_read_unlock(&dax_srcu, id); +} +EXPORT_SYMBOL_GPL(dax_read_unlock); + +/** + * struct dax_device - anchor object for dax services + * @inode: core vfs + * @cdev: optional character interface for "device dax" + * @private: dax driver private data + * @alive: !alive + rcu grace period == no new operations / mappings + */ +struct dax_device { + struct inode inode; + struct cdev cdev; + void *private; + bool alive; +}; + +bool dax_alive(struct dax_device *dax_dev) +{ + lockdep_assert_held(&dax_srcu); + return dax_dev->alive; +} +EXPORT_SYMBOL_GPL(dax_alive); + +/* + * Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring + * that any fault handlers or operations that might have seen + * dax_alive(), have completed. Any operations that start after + * synchronize_srcu() has run will abort upon seeing !dax_alive(). 
+ */ +void kill_dax(struct dax_device *dax_dev) +{ + if (!dax_dev) + return; + + dax_dev->alive = false; + synchronize_srcu(&dax_srcu); + dax_dev->private = NULL; +} +EXPORT_SYMBOL_GPL(kill_dax); + +static struct inode *dax_alloc_inode(struct super_block *sb) +{ + struct dax_device *dax_dev; + + dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL); + return &dax_dev->inode; +} + +static struct dax_device *to_dax_dev(struct inode *inode) +{ + return container_of(inode, struct dax_device, inode); +} + +static void dax_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct dax_device *dax_dev = to_dax_dev(inode); + + ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev)); + kmem_cache_free(dax_cache, dax_dev); +} + +static void dax_destroy_inode(struct inode *inode) +{ + struct dax_device *dax_dev = to_dax_dev(inode); + + WARN_ONCE(dax_dev->alive, + "kill_dax() must be called before final iput()\n"); + call_rcu(&inode->i_rcu, dax_i_callback); +} + +static const struct super_operations dax_sops = { + .statfs = simple_statfs, + .alloc_inode = dax_alloc_inode, + .destroy_inode = dax_destroy_inode, + .drop_inode = generic_delete_inode, +}; + +static struct dentry *dax_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC); +} + +static struct file_system_type dax_fs_type = { + .name = "dax", + .mount = dax_mount, + .kill_sb = kill_anon_super, +}; + +static int dax_test(struct inode *inode, void *data) +{ + dev_t devt = *(dev_t *) data; + + return inode->i_rdev == devt; +} + +static int dax_set(struct inode *inode, void *data) +{ + dev_t devt = *(dev_t *) data; + + inode->i_rdev = devt; + return 0; +} + +static struct dax_device *dax_dev_get(dev_t devt) +{ + struct dax_device *dax_dev; + struct inode *inode; + + inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), + dax_test, dax_set, &devt); + + if (!inode) + return NULL; + + dax_dev = to_dax_dev(inode); + if (inode->i_state & I_NEW) { + dax_dev->alive = true; + inode->i_cdev = &dax_dev->cdev; + inode->i_mode = S_IFCHR; + inode->i_flags = S_DAX; + mapping_set_gfp_mask(&inode->i_data, GFP_USER); + unlock_new_inode(inode); + } + + return dax_dev; +} + +struct dax_device *alloc_dax(void *private) +{ + struct dax_device *dax_dev; + dev_t devt; + int minor; + + minor = ida_simple_get(&dax_minor_ida, 0, nr_dax, GFP_KERNEL); + if (minor < 0) + return NULL; + + devt = MKDEV(MAJOR(dax_devt), minor); + dax_dev = dax_dev_get(devt); + if (!dax_dev) + goto err_inode; + + dax_dev->private = private; + return dax_dev; + + err_inode: + ida_simple_remove(&dax_minor_ida, minor); + return NULL; +} +EXPORT_SYMBOL_GPL(alloc_dax); + +void put_dax(struct dax_device *dax_dev) +{ + if (!dax_dev) + return; + iput(&dax_dev->inode); +} +EXPORT_SYMBOL_GPL(put_dax); + +/** + * inode_dax: convert a public inode into its dax_dev + * @inode: An inode with i_cdev pointing to a dax_dev + * + * Note this is not equivalent to to_dax_dev() which is for private + * internal use where we know the inode filesystem type == dax_fs_type. 
+ */ +struct dax_device *inode_dax(struct inode *inode) +{ + struct cdev *cdev = inode->i_cdev; + + return container_of(cdev, struct dax_device, cdev); +} +EXPORT_SYMBOL_GPL(inode_dax); + +struct inode *dax_inode(struct dax_device *dax_dev) +{ + return &dax_dev->inode; +} +EXPORT_SYMBOL_GPL(dax_inode); + +void *dax_get_private(struct dax_device *dax_dev) +{ + return dax_dev->private; +} +EXPORT_SYMBOL_GPL(dax_get_private); + +static void init_once(void *_dax_dev) +{ + struct dax_device *dax_dev = _dax_dev; + struct inode *inode = &dax_dev->inode; + + inode_init_once(inode); +} + +static int __dax_fs_init(void) +{ + int rc; + + dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0, + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_ACCOUNT), + init_once); + if (!dax_cache) + return -ENOMEM; + + rc = register_filesystem(&dax_fs_type); + if (rc) + goto err_register_fs; + + dax_mnt = kern_mount(&dax_fs_type); + if (IS_ERR(dax_mnt)) { + rc = PTR_ERR(dax_mnt); + goto err_mount; + } + dax_superblock = dax_mnt->mnt_sb; + + return 0; + + err_mount: + unregister_filesystem(&dax_fs_type); + err_register_fs: + kmem_cache_destroy(dax_cache); + + return rc; +} + +static void __dax_fs_exit(void) +{ + kern_unmount(dax_mnt); + unregister_filesystem(&dax_fs_type); + kmem_cache_destroy(dax_cache); +} + +static int __init dax_fs_init(void) +{ + int rc; + + rc = __dax_fs_init(); + if (rc) + return rc; + + nr_dax = max(nr_dax, 256); + rc = alloc_chrdev_region(&dax_devt, 0, nr_dax, "dax"); + if (rc) + __dax_fs_exit(); + return rc; +} + +static void __exit dax_fs_exit(void) +{ + unregister_chrdev_region(dax_devt, nr_dax); + ida_destroy(&dax_minor_ida); + __dax_fs_exit(); +} + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL v2"); +subsys_initcall(dax_fs_init); +module_exit(dax_fs_exit); diff --git a/include/linux/dax.h b/include/linux/dax.h index d8a3dc042e1c..5b62f5d19aea 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -8,6 +8,9 @@ struct iomap_ops; +int dax_read_lock(void); +void dax_read_unlock(int id); + /* * We use lowest available bit in exceptional entry for locking, one bit for * the entry size (PMD) and two more to tell us if the entry is a huge zero diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild index 405212be044a..2033ad03b8cd 100644 --- a/tools/testing/nvdimm/Kbuild +++ b/tools/testing/nvdimm/Kbuild @@ -28,7 +28,10 @@ obj-$(CONFIG_ND_BTT) += nd_btt.o obj-$(CONFIG_ND_BLK) += nd_blk.o obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o obj-$(CONFIG_ACPI_NFIT) += nfit.o -obj-$(CONFIG_DEV_DAX) += dax.o +ifeq ($(CONFIG_DAX),m) +obj-$(CONFIG_DAX) += dax.o +endif +obj-$(CONFIG_DEV_DAX) += device_dax.o obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o nfit-y := $(ACPI_SRC)/core.o @@ -48,9 +51,12 @@ nd_blk-y += config_check.o nd_e820-y := $(NVDIMM_SRC)/e820.o nd_e820-y += config_check.o -dax-y := $(DAX_SRC)/dax.o +dax-y := $(DAX_SRC)/super.o dax-y += config_check.o +device_dax-y := $(DAX_SRC)/device.o +device_dax-y += config_check.o + dax_pmem-y := $(DAX_SRC)/pmem.o dax_pmem-y += config_check.o -- cgit v1.2.3 From 6f79309acc32b025064a496dbfcd4c70c557294e Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 3 Apr 2017 18:05:21 +0200 Subject: gpio: Use unsigned int for interrupt numbers Interrupt numbers are never negative, zero serves as the special invalid value. 
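As a minimal illustration (hypothetical caller; only gpiochip_set_chained_irqchip() and its unsigned parent_irq parameter come from this patch), the convention means the only invalid value a caller has to test for is zero:

	/* hypothetical driver code; np, priv and my_irq_chip are invented here */
	unsigned int parent_irq = irq_of_parse_and_map(np, 0);	/* 0 on failure */

	if (!parent_irq)
		return -EINVAL;

	gpiochip_set_chained_irqchip(&priv->gc, &my_irq_chip,
				     parent_irq, handle_level_irq);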
Signed-off-by: Thierry Reding Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 6 +++--- include/linux/gpio/driver.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index d11eb2abfced..1d1fa7248d63 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1522,7 +1522,7 @@ static bool gpiochip_irqchip_irq_valid(const struct gpio_chip *gpiochip, */ static void gpiochip_set_cascaded_irqchip(struct gpio_chip *gpiochip, struct irq_chip *irqchip, - int parent_irq, + unsigned int parent_irq, irq_flow_handler_t parent_handler) { unsigned int offset; @@ -1571,7 +1571,7 @@ static void gpiochip_set_cascaded_irqchip(struct gpio_chip *gpiochip, */ void gpiochip_set_chained_irqchip(struct gpio_chip *gpiochip, struct irq_chip *irqchip, - int parent_irq, + unsigned int parent_irq, irq_flow_handler_t parent_handler) { gpiochip_set_cascaded_irqchip(gpiochip, irqchip, parent_irq, @@ -1588,7 +1588,7 @@ EXPORT_SYMBOL_GPL(gpiochip_set_chained_irqchip); */ void gpiochip_set_nested_irqchip(struct gpio_chip *gpiochip, struct irq_chip *irqchip, - int parent_irq) + unsigned int parent_irq) { if (!gpiochip->irq_nested) { chip_err(gpiochip, "tried to nest a chained gpiochip\n"); diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 846f3b989480..393582867afd 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -168,7 +168,7 @@ struct gpio_chip { unsigned int irq_base; irq_flow_handler_t irq_handler; unsigned int irq_default_type; - int irq_chained_parent; + unsigned int irq_chained_parent; bool irq_nested; bool irq_need_valid_mask; unsigned long *irq_valid_mask; @@ -244,12 +244,12 @@ int bgpio_init(struct gpio_chip *gc, struct device *dev, void gpiochip_set_chained_irqchip(struct gpio_chip *gpiochip, struct irq_chip *irqchip, - int parent_irq, + unsigned int parent_irq, irq_flow_handler_t parent_handler); void gpiochip_set_nested_irqchip(struct gpio_chip *gpiochip, struct irq_chip *irqchip, - int parent_irq); + unsigned int parent_irq); int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, struct irq_chip *irqchip, -- cgit v1.2.3 From 2d4bc93368f5a0ddb57c8c885cdad9c9b7a10ed5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 12 Apr 2017 14:34:04 +0200 Subject: netlink: extended ACK reporting Add the base infrastructure and UAPI for netlink extended ACK reporting. All "manual" calls to netlink_ack() pass NULL for now and thus don't get extended ACK reporting. Big thanks goes to Pablo Neira Ayuso for not only bringing up the whole topic at netconf (again) but also coming up with the nlattr passing trick and various other ideas. Signed-off-by: Johannes Berg Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- crypto/crypto_user.c | 3 +- drivers/infiniband/core/netlink.c | 5 +-- drivers/scsi/scsi_netlink.c | 2 +- include/linux/netlink.h | 26 +++++++++++++- include/net/netlink.h | 3 +- include/uapi/linux/netlink.h | 32 ++++++++++++++++++ kernel/audit.c | 2 +- net/core/rtnetlink.c | 3 +- net/core/sock_diag.c | 3 +- net/decnet/netfilter/dn_rtmsg.c | 2 +- net/hsr/hsr_netlink.c | 4 +-- net/netfilter/ipset/ip_set_core.c | 2 +- net/netfilter/nfnetlink.c | 22 ++++++------ net/netlink/af_netlink.c | 71 ++++++++++++++++++++++++++++++++++----- net/netlink/af_netlink.h | 1 + net/netlink/genetlink.c | 3 +- net/xfrm/xfrm_user.c | 3 +- 17 files changed, 153 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/crypto/crypto_user.c b/crypto/crypto_user.c index a90404a0c5ff..4a44830741c1 100644 --- a/crypto/crypto_user.c +++ b/crypto/crypto_user.c @@ -483,7 +483,8 @@ static const struct crypto_link { [CRYPTO_MSG_DELRNG - CRYPTO_MSG_BASE] = { .doit = crypto_del_rng }, }; -static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct nlattr *attrs[CRYPTOCFGA_MAX+1]; const struct crypto_link *link; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 10469b0088b5..b784055423c8 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -146,7 +146,8 @@ nla_put_failure: } EXPORT_SYMBOL(ibnl_put_attr); -static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct ibnl_client *client; int type = nlh->nlmsg_type; @@ -209,7 +210,7 @@ static void ibnl_rcv_reply_skb(struct sk_buff *skb) if (nlh->nlmsg_flags & NLM_F_REQUEST) return; - ibnl_rcv_msg(skb, nlh); + ibnl_rcv_msg(skb, nlh, NULL); msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) diff --git a/drivers/scsi/scsi_netlink.c b/drivers/scsi/scsi_netlink.c index 109802f776ed..50e624fb8307 100644 --- a/drivers/scsi/scsi_netlink.c +++ b/drivers/scsi/scsi_netlink.c @@ -111,7 +111,7 @@ scsi_nl_rcv_msg(struct sk_buff *skb) next_msg: if ((err) || (nlh->nlmsg_flags & NLM_F_ACK)) - netlink_ack(skb, nlh, err); + netlink_ack(skb, nlh, err, NULL); skb_pull(skb, rlen); } diff --git a/include/linux/netlink.h b/include/linux/netlink.h index da14ab61f363..60e7137f840d 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -62,11 +62,35 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) return __netlink_kernel_create(net, unit, THIS_MODULE, cfg); } +/** + * struct netlink_ext_ack - netlink extended ACK report struct + * @_msg: message string to report - don't access directly, use + * %NL_SET_ERR_MSG + * @bad_attr: attribute with error + */ +struct netlink_ext_ack { + const char *_msg; + const struct nlattr *bad_attr; +}; + +/* Always use this macro, this allows later putting the + * message into a separate section or such for things + * like translation or listing all possible messages. + * Currently string formatting is not supported (due + * to the lack of an output buffer.) 
+ */ +#define NL_SET_ERR_MSG(extack, msg) do { \ + static const char _msg[] = (msg); \ + \ + (extack)->_msg = _msg; \ +} while (0) + extern void netlink_kernel_release(struct sock *sk); extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups); extern int netlink_change_ngroups(struct sock *sk, unsigned int groups); extern void __netlink_clear_multicast_users(struct sock *sk, unsigned int group); -extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err); +extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, + const struct netlink_ext_ack *extack); extern int netlink_has_listeners(struct sock *sk, unsigned int group); extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); diff --git a/include/net/netlink.h b/include/net/netlink.h index b239fcd33d80..a064ec3e2ee1 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -233,7 +233,8 @@ struct nl_info { }; int netlink_rcv_skb(struct sk_buff *skb, - int (*cb)(struct sk_buff *, struct nlmsghdr *)); + int (*cb)(struct sk_buff *, struct nlmsghdr *, + struct netlink_ext_ack *)); int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, int report, gfp_t flags); diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index b2c9c26ea30f..7df88770e029 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -69,6 +69,10 @@ struct nlmsghdr { #define NLM_F_CREATE 0x400 /* Create, if it does not exist */ #define NLM_F_APPEND 0x800 /* Add to end of list */ +/* Flags for ACK message */ +#define NLM_F_CAPPED 0x100 /* request was capped */ +#define NLM_F_ACK_TLVS 0x200 /* extended ACK TVLs were included */ + /* 4.4BSD ADD NLM_F_CREATE|NLM_F_EXCL 4.4BSD CHANGE NLM_F_REPLACE @@ -101,6 +105,33 @@ struct nlmsghdr { struct nlmsgerr { int error; struct nlmsghdr msg; + /* + * followed by the message contents unless NETLINK_CAP_ACK was set + * or the ACK indicates success (error == 0) + * message length is aligned with NLMSG_ALIGN() + */ + /* + * followed by TLVs defined in enum nlmsgerr_attrs + * if NETLINK_EXT_ACK was set + */ +}; + +/** + * enum nlmsgerr_attrs - nlmsgerr attributes + * @NLMSGERR_ATTR_UNUSED: unused + * @NLMSGERR_ATTR_MSG: error message string (string) + * @NLMSGERR_ATTR_OFFS: offset of the invalid attribute in the original + * message, counting from the beginning of the header (u32) + * @__NLMSGERR_ATTR_MAX: number of attributes + * @NLMSGERR_ATTR_MAX: highest attribute number + */ +enum nlmsgerr_attrs { + NLMSGERR_ATTR_UNUSED, + NLMSGERR_ATTR_MSG, + NLMSGERR_ATTR_OFFS, + + __NLMSGERR_ATTR_MAX, + NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1 }; #define NETLINK_ADD_MEMBERSHIP 1 @@ -115,6 +146,7 @@ struct nlmsgerr { #define NETLINK_LISTEN_ALL_NSID 8 #define NETLINK_LIST_MEMBERSHIPS 9 #define NETLINK_CAP_ACK 10 +#define NETLINK_EXT_ACK 11 struct nl_pktinfo { __u32 group; diff --git a/kernel/audit.c b/kernel/audit.c index 2f4964cfde0b..d54bf5932374 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1402,7 +1402,7 @@ static void audit_receive_skb(struct sk_buff *skb) err = audit_receive_msg(skb, nlh); /* if err or if this message says it wants a response */ if (err || (nlh->nlmsg_flags & NLM_F_ACK)) - netlink_ack(skb, nlh, err); + netlink_ack(skb, nlh, err, NULL); nlh = nlmsg_next(nlh, &len); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c138b6b75e59..3cc4a627a537 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4046,7 +4046,8 @@ out: /* 
Process one rtnetlink message. */ -static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); rtnl_doit_func doit; diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index fb9d0e2fd148..217f4e3b82f6 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -238,7 +238,8 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } -static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { int ret; diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index 85f2fdc360c2..c8bf5136a72b 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -96,7 +96,7 @@ static unsigned int dnrmg_hook(void *priv, } -#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err), NULL); return; } while (0) static inline void dnrmg_receive_user_skb(struct sk_buff *skb) { diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 1ab30e7d3f99..81dac16933fc 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -350,7 +350,7 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) return 0; invalid: - netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL); + netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL, NULL); return 0; nla_put_failure: @@ -432,7 +432,7 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) return 0; invalid: - netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL); + netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL, NULL); return 0; nla_put_failure: diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index c296f9b606d4..26356bf8cebf 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1305,7 +1305,7 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) * manually :-( */ if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(cb->skb, nlh, ret); + netlink_ack(cb->skb, nlh, ret, NULL); return ret; } } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 68eda920160e..181d3bb800e6 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -148,7 +148,8 @@ int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, EXPORT_SYMBOL_GPL(nfnetlink_unicast); /* Process one complete nfnetlink message. 
*/ -static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); const struct nfnl_callback *nc; @@ -261,7 +262,7 @@ static void nfnl_err_deliver(struct list_head *err_list, struct sk_buff *skb) struct nfnl_err *nfnl_err, *next; list_for_each_entry_safe(nfnl_err, next, err_list, head) { - netlink_ack(skb, nfnl_err->nlh, nfnl_err->err); + netlink_ack(skb, nfnl_err->nlh, nfnl_err->err, NULL); nfnl_err_del(nfnl_err); } } @@ -284,13 +285,13 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, int err; if (subsys_id >= NFNL_SUBSYS_COUNT) - return netlink_ack(skb, nlh, -EINVAL); + return netlink_ack(skb, nlh, -EINVAL, NULL); replay: status = 0; skb = netlink_skb_clone(oskb, GFP_KERNEL); if (!skb) - return netlink_ack(oskb, nlh, -ENOMEM); + return netlink_ack(oskb, nlh, -ENOMEM, NULL); nfnl_lock(subsys_id); ss = nfnl_dereference_protected(subsys_id); @@ -304,20 +305,20 @@ replay: #endif { nfnl_unlock(subsys_id); - netlink_ack(oskb, nlh, -EOPNOTSUPP); + netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); return kfree_skb(skb); } } if (!ss->commit || !ss->abort) { nfnl_unlock(subsys_id); - netlink_ack(oskb, nlh, -EOPNOTSUPP); + netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); return kfree_skb(skb); } if (genid && ss->valid_genid && !ss->valid_genid(net, genid)) { nfnl_unlock(subsys_id); - netlink_ack(oskb, nlh, -ERESTART); + netlink_ack(oskb, nlh, -ERESTART, NULL); return kfree_skb(skb); } @@ -407,7 +408,8 @@ ack: * pointing to the batch header. */ nfnl_err_reset(&err_list); - netlink_ack(oskb, nlmsg_hdr(oskb), -ENOMEM); + netlink_ack(oskb, nlmsg_hdr(oskb), -ENOMEM, + NULL); status |= NFNL_BATCH_FAILURE; goto done; } @@ -467,7 +469,7 @@ static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh) err = nla_parse(cda, NFNL_BATCH_MAX, attr, attrlen, nfnl_batch_policy); if (err < 0) { - netlink_ack(skb, nlh, err); + netlink_ack(skb, nlh, err, NULL); return; } if (cda[NFNL_BATCH_GENID]) @@ -493,7 +495,7 @@ static void nfnetlink_rcv(struct sk_buff *skb) return; if (!netlink_net_capable(skb, CAP_NET_ADMIN)) { - netlink_ack(skb, nlh, -EPERM); + netlink_ack(skb, nlh, -EPERM, NULL); return; } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index fc232441cf23..c1564768000e 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1652,6 +1652,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, nlk->flags &= ~NETLINK_F_CAP_ACK; err = 0; break; + case NETLINK_EXT_ACK: + if (val) + nlk->flags |= NETLINK_F_EXT_ACK; + else + nlk->flags &= ~NETLINK_F_EXT_ACK; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -1736,6 +1743,15 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, return -EFAULT; err = 0; break; + case NETLINK_EXT_ACK: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_F_EXT_ACK ? 
1 : 0; + if (put_user(len, optlen) || put_user(val, optval)) + return -EFAULT; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -2267,21 +2283,40 @@ error_free: } EXPORT_SYMBOL(__netlink_dump_start); -void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) +void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, + const struct netlink_ext_ack *extack) { struct sk_buff *skb; struct nlmsghdr *rep; struct nlmsgerr *errmsg; size_t payload = sizeof(*errmsg); + size_t tlvlen = 0; struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk); + unsigned int flags = 0; /* Error messages get the original request appened, unless the user - * requests to cap the error message. + * requests to cap the error message, and get extra error data if + * requested. */ - if (!(nlk->flags & NETLINK_F_CAP_ACK) && err) - payload += nlmsg_len(nlh); + if (err) { + if (!(nlk->flags & NETLINK_F_CAP_ACK)) + payload += nlmsg_len(nlh); + else + flags |= NLM_F_CAPPED; + if (nlk->flags & NETLINK_F_EXT_ACK && extack) { + if (extack->_msg) + tlvlen += nla_total_size(strlen(extack->_msg) + 1); + if (extack->bad_attr) + tlvlen += nla_total_size(sizeof(u32)); + } + } else { + flags |= NLM_F_CAPPED; + } - skb = nlmsg_new(payload, GFP_KERNEL); + if (tlvlen) + flags |= NLM_F_ACK_TLVS; + + skb = nlmsg_new(payload + tlvlen, GFP_KERNEL); if (!skb) { struct sock *sk; @@ -2297,17 +2332,35 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) } rep = __nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - NLMSG_ERROR, payload, 0); + NLMSG_ERROR, payload, flags); errmsg = nlmsg_data(rep); errmsg->error = err; memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) ? nlh->nlmsg_len : sizeof(*nlh)); + + if (err && nlk->flags & NETLINK_F_EXT_ACK && extack) { + if (extack->_msg) + WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, + extack->_msg)); + if (extack->bad_attr && + !WARN_ON((u8 *)extack->bad_attr < in_skb->data || + (u8 *)extack->bad_attr >= in_skb->data + + in_skb->len)) + WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, + (u8 *)extack->bad_attr - + in_skb->data)); + } + + nlmsg_end(skb, rep); + netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT); } EXPORT_SYMBOL(netlink_ack); int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, - struct nlmsghdr *)) + struct nlmsghdr *, + struct netlink_ext_ack *)) { + struct netlink_ext_ack extack = {}; struct nlmsghdr *nlh; int err; @@ -2328,13 +2381,13 @@ int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, if (nlh->nlmsg_type < NLMSG_MIN_TYPE) goto ack; - err = cb(skb, nlh); + err = cb(skb, nlh, &extack); if (err == -EINTR) goto skip; ack: if (nlh->nlmsg_flags & NLM_F_ACK || err) - netlink_ack(skb, nlh, err); + netlink_ack(skb, nlh, err, &extack); skip: msglen = NLMSG_ALIGN(nlh->nlmsg_len); diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index f792f8d7f982..3490f2430532 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -13,6 +13,7 @@ #define NETLINK_F_RECV_NO_ENOBUFS 0x8 #define NETLINK_F_LISTEN_ALL_NSID 0x10 #define NETLINK_F_CAP_ACK 0x20 +#define NETLINK_F_EXT_ACK 0x40 #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) #define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 92e0981f7404..57b2e3648bc0 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -605,7 +605,8 @@ out: return err; } -static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) 
+static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { const struct genl_family *family; int err; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 4f7e62ddc17e..e93d5c0471b2 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2448,7 +2448,8 @@ static const struct xfrm_link { [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo }, }; -static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *attrs[XFRMA_MAX+1]; -- cgit v1.2.3 From ba0dc5f6e0ba5a5d2f575bcdb35e5d1960cf7c04 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 12 Apr 2017 14:34:06 +0200 Subject: netlink: allow sending extended ACK with cookie on success Now that we have extended error reporting and a new message format for netlink ACK messages, also extend this to be able to return arbitrary cookie data on success. This will allow, for example, nl80211 to not send an extra message for cookies identifying newly created objects, but return those directly in the ACK message. The cookie data size is currently limited to 20 bytes (since Jamal talked about using SHA1 for identifiers.) Thanks to Jamal Hadi Salim for bringing up this idea during the discussions. Signed-off-by: Johannes Berg Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netlink.h | 7 +++++++ include/uapi/linux/netlink.h | 4 ++++ net/netlink/af_netlink.c | 33 ++++++++++++++++++++++----------- 3 files changed, 33 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 60e7137f840d..8d2a8924705c 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -62,15 +62,22 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) return __netlink_kernel_create(net, unit, THIS_MODULE, cfg); } +/* this can be increased when necessary - don't expose to userland */ +#define NETLINK_MAX_COOKIE_LEN 20 + /** * struct netlink_ext_ack - netlink extended ACK report struct * @_msg: message string to report - don't access directly, use * %NL_SET_ERR_MSG * @bad_attr: attribute with error + * @cookie: cookie data to return to userspace (for success) + * @cookie_len: actual cookie data length */ struct netlink_ext_ack { const char *_msg; const struct nlattr *bad_attr; + u8 cookie[NETLINK_MAX_COOKIE_LEN]; + u8 cookie_len; }; /* Always use this macro, this allows later putting the diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 7df88770e029..f86127a46cfc 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -122,6 +122,9 @@ struct nlmsgerr { * @NLMSGERR_ATTR_MSG: error message string (string) * @NLMSGERR_ATTR_OFFS: offset of the invalid attribute in the original * message, counting from the beginning of the header (u32) + * @NLMSGERR_ATTR_COOKIE: arbitrary subsystem specific cookie to + * be used - in the success case - to identify a created + * object or operation or similar (binary) * @__NLMSGERR_ATTR_MAX: number of attributes * @NLMSGERR_ATTR_MAX: highest attribute number */ @@ -129,6 +132,7 @@ enum nlmsgerr_attrs { NLMSGERR_ATTR_UNUSED, NLMSGERR_ATTR_MSG, NLMSGERR_ATTR_OFFS, + NLMSGERR_ATTR_COOKIE, __NLMSGERR_ATTR_MAX, NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1 diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 
c1564768000e..ee841f00a6ec 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2311,6 +2311,10 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, } } else { flags |= NLM_F_CAPPED; + + if (nlk->flags & NETLINK_F_EXT_ACK && + extack && extack->cookie_len) + tlvlen += nla_total_size(extack->cookie_len); } if (tlvlen) @@ -2337,17 +2341,24 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, errmsg->error = err; memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) ? nlh->nlmsg_len : sizeof(*nlh)); - if (err && nlk->flags & NETLINK_F_EXT_ACK && extack) { - if (extack->_msg) - WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, - extack->_msg)); - if (extack->bad_attr && - !WARN_ON((u8 *)extack->bad_attr < in_skb->data || - (u8 *)extack->bad_attr >= in_skb->data + - in_skb->len)) - WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, - (u8 *)extack->bad_attr - - in_skb->data)); + if (nlk->flags & NETLINK_F_EXT_ACK && extack) { + if (err) { + if (extack->_msg) + WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, + extack->_msg)); + if (extack->bad_attr && + !WARN_ON((u8 *)extack->bad_attr < in_skb->data || + (u8 *)extack->bad_attr >= in_skb->data + + in_skb->len)) + WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, + (u8 *)extack->bad_attr - + in_skb->data)); + } else { + if (extack->cookie_len) + WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, + extack->cookie_len, + extack->cookie)); + } } nlmsg_end(skb, rep); -- cgit v1.2.3 From b3b454f694db663773bc22002e10909afe9c1739 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 13 Apr 2017 14:25:17 -0700 Subject: libnvdimm: fix clear poison locking with spinlock and GFP_NOWAIT allocation The following warning results from holding a lane spinlock, preempt_disable(), or the btt map spinlock and then trying to take the reconfig_mutex to walk the poison list and potentially add new entries. BUG: sleeping function called from invalid context at kernel/locking/mutex.c:747 in_atomic(): 1, irqs_disabled(): 0, pid: 17159, name: dd [..] Call Trace: dump_stack+0x85/0xc8 ___might_sleep+0x184/0x250 __might_sleep+0x4a/0x90 __mutex_lock+0x58/0x9b0 ? nvdimm_bus_lock+0x21/0x30 [libnvdimm] ? __nvdimm_bus_badblocks_clear+0x2f/0x60 [libnvdimm] ? acpi_nfit_forget_poison+0x79/0x80 [nfit] ? _raw_spin_unlock+0x27/0x40 mutex_lock_nested+0x1b/0x20 nvdimm_bus_lock+0x21/0x30 [libnvdimm] nvdimm_forget_poison+0x25/0x50 [libnvdimm] nvdimm_clear_poison+0x106/0x140 [libnvdimm] nsio_rw_bytes+0x164/0x270 [libnvdimm] btt_write_pg+0x1de/0x3e0 [nd_btt] ? blk_queue_enter+0x30/0x290 btt_make_request+0x11a/0x310 [nd_btt] ? blk_queue_enter+0xb7/0x290 ? blk_queue_enter+0x30/0x290 generic_make_request+0x118/0x3b0 A spinlock is introduced to protect the poison list, so we no longer have to acquire the reconfig_mutex to touch it. The add_poison() function has been broken out into two helper functions: one to allocate the poison entry and the other to append it. This lets us drop the poison_lock in the non-I/O path and still allocate the poison entry with GFP_KERNEL. We will use GFP_NOWAIT in the I/O path, since it runs in atomic context.
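In sketch form (illustrative names only, simplified from the change described above), the locking pattern is:

	/* non-I/O path: safe to drop the lock and sleep in the allocator */
	spin_unlock(&bus->poison_lock);
	new = kzalloc(sizeof(*new), GFP_KERNEL);
	spin_lock(&bus->poison_lock);

	/* I/O path: already atomic, so the allocation must not sleep */
	new = kzalloc(sizeof(*new), GFP_NOWAIT);

Either way, the list manipulation itself stays under the spinlock.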
Reviewed-by: Vishal Verma Signed-off-by: Dave Jiang Signed-off-by: Dan Williams --- drivers/nvdimm/bus.c | 7 +++--- drivers/nvdimm/core.c | 56 ++++++++++++++++++++++++++++------------------- drivers/nvdimm/nd-core.h | 1 + include/linux/libnvdimm.h | 2 -- 4 files changed, 38 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 5ad2e5909e1a..d214ac44d111 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -296,6 +296,7 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent, init_waitqueue_head(&nvdimm_bus->probe_wait); nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); mutex_init(&nvdimm_bus->reconfig_mutex); + spin_lock_init(&nvdimm_bus->poison_lock); if (nvdimm_bus->id < 0) { kfree(nvdimm_bus); return NULL; @@ -364,9 +365,9 @@ static int nd_bus_remove(struct device *dev) nd_synchronize(); device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister); - nvdimm_bus_lock(&nvdimm_bus->dev); + spin_lock(&nvdimm_bus->poison_lock); free_poison_list(&nvdimm_bus->poison_list); - nvdimm_bus_unlock(&nvdimm_bus->dev); + spin_unlock(&nvdimm_bus->poison_lock); nvdimm_bus_destroy_ndctl(nvdimm_bus); @@ -990,7 +991,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, if (clear_err->cleared) { /* clearing the poison list we keep track of */ - __nvdimm_forget_poison(nvdimm_bus, clear_err->address, + nvdimm_forget_poison(nvdimm_bus, clear_err->address, clear_err->cleared); /* now sync the badblocks lists */ diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 40a3da088fd2..2dee908e4bae 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -518,6 +518,15 @@ void nvdimm_badblocks_populate(struct nd_region *nd_region, } EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate); +static void append_poison_entry(struct nvdimm_bus *nvdimm_bus, + struct nd_poison *pl, u64 addr, u64 length) +{ + lockdep_assert_held(&nvdimm_bus->poison_lock); + pl->start = addr; + pl->length = length; + list_add_tail(&pl->list, &nvdimm_bus->poison_list); +} + static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length, gfp_t flags) { @@ -527,19 +536,24 @@ static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length, if (!pl) return -ENOMEM; - pl->start = addr; - pl->length = length; - list_add_tail(&pl->list, &nvdimm_bus->poison_list); - + append_poison_entry(nvdimm_bus, pl, addr, length); return 0; } static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) { - struct nd_poison *pl; + struct nd_poison *pl, *pl_new; - if (list_empty(&nvdimm_bus->poison_list)) - return add_poison(nvdimm_bus, addr, length, GFP_KERNEL); + spin_unlock(&nvdimm_bus->poison_lock); + pl_new = kzalloc(sizeof(*pl_new), GFP_KERNEL); + spin_lock(&nvdimm_bus->poison_lock); + + if (list_empty(&nvdimm_bus->poison_list)) { + if (!pl_new) + return -ENOMEM; + append_poison_entry(nvdimm_bus, pl_new, addr, length); + return 0; + } /* * There is a chance this is a duplicate, check for those first. 
@@ -551,6 +565,7 @@ static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) /* If length has changed, update this list entry */ if (pl->length != length) pl->length = length; + kfree(pl_new); return 0; } @@ -559,30 +574,33 @@ static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) * as any overlapping ranges will get resolved when the list is consumed * and converted to badblocks */ - return add_poison(nvdimm_bus, addr, length, GFP_KERNEL); + if (!pl_new) + return -ENOMEM; + append_poison_entry(nvdimm_bus, pl_new, addr, length); + + return 0; } int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length) { int rc; - nvdimm_bus_lock(&nvdimm_bus->dev); + spin_lock(&nvdimm_bus->poison_lock); rc = bus_add_poison(nvdimm_bus, addr, length); - nvdimm_bus_unlock(&nvdimm_bus->dev); + spin_unlock(&nvdimm_bus->poison_lock); return rc; } EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison); -void __nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start, +void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start, unsigned int len) { struct list_head *poison_list = &nvdimm_bus->poison_list; u64 clr_end = start + len - 1; struct nd_poison *pl, *next; - lockdep_assert_held(&nvdimm_bus->reconfig_mutex); - + spin_lock(&nvdimm_bus->poison_lock); WARN_ON_ONCE(list_empty(poison_list)); /* @@ -629,21 +647,13 @@ void __nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start, u64 new_len = pl_end - new_start + 1; /* Add new entry covering the right half */ - add_poison(nvdimm_bus, new_start, new_len, GFP_NOIO); + add_poison(nvdimm_bus, new_start, new_len, GFP_NOWAIT); /* Adjust this entry to cover the left half */ pl->length = start - pl->start; continue; } } -} -EXPORT_SYMBOL_GPL(__nvdimm_forget_poison); - -void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, - phys_addr_t start, unsigned int len) -{ - nvdimm_bus_lock(&nvdimm_bus->dev); - __nvdimm_forget_poison(nvdimm_bus, start, len); - nvdimm_bus_unlock(&nvdimm_bus->dev); + spin_unlock(&nvdimm_bus->poison_lock); } EXPORT_SYMBOL_GPL(nvdimm_forget_poison); diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h index 8623e57c2ce3..4c4bd209e725 100644 --- a/drivers/nvdimm/nd-core.h +++ b/drivers/nvdimm/nd-core.h @@ -32,6 +32,7 @@ struct nvdimm_bus { struct list_head poison_list; struct list_head mapping_list; struct mutex reconfig_mutex; + spinlock_t poison_lock; }; struct nvdimm { diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 1c609e89048a..98b207611b06 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -122,8 +122,6 @@ static inline struct nd_blk_region_desc *to_blk_region_desc( int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length); void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start, unsigned int len); -void __nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, - phys_addr_t start, unsigned int len); struct nvdimm_bus *nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc); void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); -- cgit v1.2.3 From 01c0e0a28da749e80cb7d549f75a5f52e2f40d0e Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 22 Mar 2017 15:55:30 +0100 Subject: power: supply: bq24190_charger: Use i2c-core irq-mapping code The i2c-core already maps OF (device tree) IRQs before calling the driver's probe function, and there are no in-tree users of bq24190_platform_data->gpio_int.
Remove the redundant custom irq-mapping code and just use client->irq. Cc: Liam Breck Cc: Tony Lindgren Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Acked-by: Tony Lindgren Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq24190_charger.c | 66 ++-------------------------------- include/linux/power/bq24190_charger.h | 16 --------- 2 files changed, 3 insertions(+), 79 deletions(-) delete mode 100644 include/linux/power/bq24190_charger.h (limited to 'include/linux') diff --git a/drivers/power/supply/bq24190_charger.c b/drivers/power/supply/bq24190_charger.c index 451f2bc05ea5..fa2d2da2de5f 100644 --- a/drivers/power/supply/bq24190_charger.c +++ b/drivers/power/supply/bq24190_charger.c @@ -18,9 +18,6 @@ #include #include -#include - - #define BQ24190_MANUFACTURER "Texas Instruments" #define BQ24190_REG_ISC 0x00 /* Input Source Control */ @@ -153,8 +150,6 @@ struct bq24190_dev_info { struct power_supply *battery; char model_name[I2C_NAME_SIZE]; kernel_ulong_t model; - unsigned int gpio_int; - unsigned int irq; bool initialized; bool irq_event; struct mutex f_reg_lock; @@ -1310,56 +1305,11 @@ static int bq24190_hw_init(struct bq24190_dev_info *bdi) return bq24190_read(bdi, BQ24190_REG_SS, &bdi->ss_reg); } -#ifdef CONFIG_OF -static int bq24190_setup_dt(struct bq24190_dev_info *bdi) -{ - bdi->irq = irq_of_parse_and_map(bdi->dev->of_node, 0); - if (bdi->irq <= 0) - return -1; - - return 0; -} -#else -static int bq24190_setup_dt(struct bq24190_dev_info *bdi) -{ - return -1; -} -#endif - -static int bq24190_setup_pdata(struct bq24190_dev_info *bdi, - struct bq24190_platform_data *pdata) -{ - int ret; - - if (!gpio_is_valid(pdata->gpio_int)) - return -1; - - ret = gpio_request(pdata->gpio_int, dev_name(bdi->dev)); - if (ret < 0) - return -1; - - ret = gpio_direction_input(pdata->gpio_int); - if (ret < 0) - goto out; - - bdi->irq = gpio_to_irq(pdata->gpio_int); - if (!bdi->irq) - goto out; - - bdi->gpio_int = pdata->gpio_int; - return 0; - -out: - gpio_free(pdata->gpio_int); - return -1; -} - static int bq24190_probe(struct i2c_client *client, const struct i2c_device_id *id) { struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent); struct device *dev = &client->dev; - struct bq24190_platform_data *pdata = client->dev.platform_data; struct power_supply_config charger_cfg = {}, battery_cfg = {}; struct bq24190_dev_info *bdi; int ret; @@ -1385,12 +1335,7 @@ static int bq24190_probe(struct i2c_client *client, i2c_set_clientdata(client, bdi); - if (dev->of_node) - ret = bq24190_setup_dt(bdi); - else - ret = bq24190_setup_pdata(bdi, pdata); - - if (ret) { + if (!client->irq) { dev_err(dev, "Can't get irq info\n"); return -EINVAL; } @@ -1436,7 +1381,7 @@ static int bq24190_probe(struct i2c_client *client, bdi->initialized = true; - ret = devm_request_threaded_irq(dev, bdi->irq, NULL, + ret = devm_request_threaded_irq(dev, client->irq, NULL, bq24190_irq_handler_thread, IRQF_TRIGGER_FALLING | IRQF_ONESHOT, "bq24190-charger", bdi); @@ -1445,7 +1390,7 @@ static int bq24190_probe(struct i2c_client *client, goto out5; } - enable_irq_wake(bdi->irq); + enable_irq_wake(client->irq); pm_runtime_mark_last_busy(dev); pm_runtime_put_autosuspend(dev); @@ -1467,8 +1412,6 @@ out2: out1: pm_runtime_dont_use_autosuspend(dev); pm_runtime_disable(dev); - if (bdi->gpio_int) - gpio_free(bdi->gpio_int); return ret; } @@ -1492,9 +1435,6 @@ static int bq24190_remove(struct i2c_client *client) pm_runtime_dont_use_autosuspend(bdi->dev); pm_runtime_disable(bdi->dev); - if (bdi->gpio_int) - 
gpio_free(bdi->gpio_int); - return 0; } diff --git a/include/linux/power/bq24190_charger.h b/include/linux/power/bq24190_charger.h deleted file mode 100644 index 9f0283721cbc..000000000000 --- a/include/linux/power/bq24190_charger.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Platform data for the TI bq24190 battery charger driver. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef _BQ24190_CHARGER_H_ -#define _BQ24190_CHARGER_H_ - -struct bq24190_platform_data { - unsigned int gpio_int; /* GPIO pin that's connected to INT# */ -}; - -#endif -- cgit v1.2.3 From ffff885832101543c002cef7abcab0fd27a9aee1 Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Thu, 13 Apr 2017 20:30:44 +0000 Subject: PCI: Add device flag PCI_DEV_FLAGS_BRIDGE_XLATE_ROOT Add a new quirk flag PCI_DEV_FLAGS_BRIDGE_XLATE_ROOT to limit the DMA alias search so that it goes no further than the bridge where the IOMMU unit is attached. The flag will be used to indicate a bridge device which forwards the address translation requests to the IOMMU, i.e., where the interrupt and DMA requests leave the PCIe hierarchy and go into the system blocks. Usually this happens at the PCI RC, so this flag is not needed. But on systems where there are bridges that introduce aliases above the IOMMU, this flag prevents pci_for_each_dma_alias() from generating aliases that the IOMMU will never see. The function pci_for_each_dma_alias() is updated to stop when it sees a bridge with this flag set. Link: https://bugzilla.kernel.org/show_bug.cgi?id=195447 Signed-off-by: Jayachandran C Signed-off-by: Bjorn Helgaas Reviewed-by: Robin Murphy Acked-by: David Daney --- drivers/pci/search.c | 4 ++++ include/linux/pci.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/search.c b/drivers/pci/search.c index 33e0f033a48e..4c6044ad7368 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -60,6 +60,10 @@ int pci_for_each_dma_alias(struct pci_dev *pdev, tmp = bus->self; + /* stop at bridge where translation unit is associated */ + if (tmp->dev_flags & PCI_DEV_FLAGS_BRIDGE_XLATE_ROOT) + return ret; + /* * PCIe-to-PCI/X bridges alias transactions from downstream * devices using the subordinate bus number (PCI Express to diff --git a/include/linux/pci.h b/include/linux/pci.h index eb3da1a04e6c..3f596acc05be 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -178,6 +178,8 @@ enum pci_dev_flags { PCI_DEV_FLAGS_NO_PM_RESET = (__force pci_dev_flags_t) (1 << 7), /* Get VPD from function 0 VPD */ PCI_DEV_FLAGS_VPD_REF_F0 = (__force pci_dev_flags_t) (1 << 8), + /* a non-root bridge where translation occurs, stop alias search here */ + PCI_DEV_FLAGS_BRIDGE_XLATE_ROOT = (__force pci_dev_flags_t) (1 << 9), }; enum pci_irq_reroute_variant { -- cgit v1.2.3 From c0c379e2931b05facef538e53bf3b21f283d9a0b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 13 Apr 2017 14:56:23 -0700 Subject: mm: drop unused pmdp_huge_get_and_clear_notify() Dave noticed that after fixing the MADV_DONTNEED vs numa balancing race the last pmdp_huge_get_and_clear_notify() user is gone. Let's drop the helper. Link: http://lkml.kernel.org/r/20170306112047.24809-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A.
Shutemov Cc: Dave Hansen Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 51891fb0d3ce..c91b3bcd158f 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -394,18 +394,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) ___pud; \ }) -#define pmdp_huge_get_and_clear_notify(__mm, __haddr, __pmd) \ -({ \ - unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ - pmd_t ___pmd; \ - \ - ___pmd = pmdp_huge_get_and_clear(__mm, __haddr, __pmd); \ - mmu_notifier_invalidate_range(__mm, ___haddr, \ - ___haddr + HPAGE_PMD_SIZE); \ - \ - ___pmd; \ -}) - /* * set_pte_at_notify() sets the pte _after_ running the notifier. * This is safe to start by updating the secondary MMUs, because the primary MMU @@ -489,7 +477,6 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) #define ptep_clear_flush_notify ptep_clear_flush #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush #define pudp_huge_clear_flush_notify pudp_huge_clear_flush -#define pmdp_huge_get_and_clear_notify pmdp_huge_get_and_clear #define set_pte_at_notify set_pte_at #endif /* CONFIG_MMU_NOTIFIER */ -- cgit v1.2.3 From c7ef8f0c020ac43c8a692bf989017c06ab1fdf0f Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 14 Apr 2017 10:05:36 +0200 Subject: net: Add ESP offload features This patch adds netdev features to configure IPsec offloads. Signed-off-by: Steffen Klassert --- include/linux/netdev_features.h | 8 +++++++- include/linux/netdevice.h | 1 + include/linux/skbuff.h | 2 ++ net/core/ethtool.c | 3 +++ 4 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 9a0419594e84..1d4737cffc71 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -54,8 +54,9 @@ enum { */ NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */ NETIF_F_GSO_SCTP_BIT, /* ... SCTP fragmentation */ + NETIF_F_GSO_ESP_BIT, /* ... 
ESP with TSO */ /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ - NETIF_F_GSO_SCTP_BIT, + NETIF_F_GSO_ESP_BIT, NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */ NETIF_F_SCTP_CRC_BIT, /* SCTP checksum offload */ @@ -73,6 +74,8 @@ enum { NETIF_F_HW_L2FW_DOFFLOAD_BIT, /* Allow L2 Forwarding in Hardware */ NETIF_F_HW_TC_BIT, /* Offload TC infrastructure */ + NETIF_F_HW_ESP_BIT, /* Hardware ESP transformation offload */ + NETIF_F_HW_ESP_TX_CSUM_BIT, /* ESP with TX checksum offload */ /* * Add your fresh new feature above and remember to update @@ -129,11 +132,14 @@ enum { #define NETIF_F_GSO_PARTIAL __NETIF_F(GSO_PARTIAL) #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM) #define NETIF_F_GSO_SCTP __NETIF_F(GSO_SCTP) +#define NETIF_F_GSO_ESP __NETIF_F(GSO_ESP) #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER) #define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX) #define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX) #define NETIF_F_HW_L2FW_DOFFLOAD __NETIF_F(HW_L2FW_DOFFLOAD) #define NETIF_F_HW_TC __NETIF_F(HW_TC) +#define NETIF_F_HW_ESP __NETIF_F(HW_ESP) +#define NETIF_F_HW_ESP_TX_CSUM __NETIF_F(HW_ESP_TX_CSUM) #define for_each_netdev_feature(mask_addr, bit) \ for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cc07c3be2705..5bb03d181848 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4070,6 +4070,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type) BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_SCTP != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT)); return (features & feature) == feature; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 741d75cfc686..81ef53f06534 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -492,6 +492,8 @@ enum { SKB_GSO_TUNNEL_REMCSUM = 1 << 14, SKB_GSO_SCTP = 1 << 15, + + SKB_GSO_ESP = 1 << 16, }; #if BITS_PER_LONG > 32 diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 905a88ad28e0..03111a2d6653 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -90,6 +90,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", + [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", @@ -103,6 +104,8 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RXALL_BIT] = "rx-all", [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", [NETIF_F_HW_TC_BIT] = "hw-tc-offload", + [NETIF_F_HW_ESP_BIT] = "esp-hw-offload", + [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", }; static const char -- cgit v1.2.3 From d77e38e612a017480157fe6d2c1422f42cb5b7e3 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 14 Apr 2017 10:06:10 +0200 Subject: xfrm: Add an IPsec hardware offloading API This patch adds all the bits that are needed to do IPsec hardware offload for IPsec states and ESP packets. We add xfrmdev_ops to the net_device. 
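To sketch how a driver is expected to hook in (the driver names here are invented; the xfrmdev_ops callbacks come from this patch and NETIF_F_HW_ESP from the previous one):

	static int mydrv_xdo_dev_state_add(struct xfrm_state *x)
	{
		/* program the SA (keys, SPI, direction) into the NIC */
		return 0;
	}

	static void mydrv_xdo_dev_state_delete(struct xfrm_state *x)
	{
		/* invalidate the SA in hardware */
	}

	static void mydrv_xdo_dev_state_free(struct xfrm_state *x)
	{
		/* release any driver-private SA resources */
	}

	static bool mydrv_xdo_dev_offload_ok(struct sk_buff *skb,
					     struct xfrm_state *x)
	{
		/* per-packet decision: decline whatever hw cannot handle */
		return true;
	}

	static const struct xfrmdev_ops mydrv_xfrmdev_ops = {
		.xdo_dev_state_add	= mydrv_xdo_dev_state_add,
		.xdo_dev_state_delete	= mydrv_xdo_dev_state_delete,
		.xdo_dev_state_free	= mydrv_xdo_dev_state_free,
		.xdo_dev_offload_ok	= mydrv_xdo_dev_offload_ok,
	};

	/* at netdev setup time */
	netdev->xfrmdev_ops = &mydrv_xfrmdev_ops;
	netdev->features |= NETIF_F_HW_ESP;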
xfrmdev_ops has function pointers that are needed to manage the xfrm states in the hardware and to make a per-packet offloading decision. Joint work with: Ilan Tayari, Guy Shapiro, Yossi Kuperman Signed-off-by: Guy Shapiro Signed-off-by: Ilan Tayari Signed-off-by: Yossi Kuperman Signed-off-by: Steffen Klassert --- include/linux/netdevice.h | 14 +++++ include/net/xfrm.h | 65 +++++++++++++++++++++- include/uapi/linux/xfrm.h | 8 +++ net/ipv4/esp4.c | 7 +-- net/ipv4/xfrm4_output.c | 3 +- net/ipv6/esp6.c | 4 +- net/ipv6/xfrm6_output.c | 9 ++- net/xfrm/Makefile | 3 +- net/xfrm/xfrm_device.c | 138 +++++++++++++++++++++++++++++++++++++++++++++- net/xfrm/xfrm_input.c | 41 +++++++++++++- net/xfrm/xfrm_output.c | 44 +++++++++++++-- net/xfrm/xfrm_policy.c | 10 ++-- net/xfrm/xfrm_state.c | 74 +++++++++++++++++++++++++ net/xfrm/xfrm_user.c | 28 ++++++++++ 14 files changed, 424 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5bb03d181848..b3eb83db0223 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -824,6 +824,16 @@ struct netdev_xdp { }; }; +#ifdef CONFIG_XFRM_OFFLOAD +struct xfrmdev_ops { + int (*xdo_dev_state_add) (struct xfrm_state *x); + void (*xdo_dev_state_delete) (struct xfrm_state *x); + void (*xdo_dev_state_free) (struct xfrm_state *x); + bool (*xdo_dev_offload_ok) (struct sk_buff *skb, + struct xfrm_state *x); +}; +#endif + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -1697,6 +1707,10 @@ struct net_device { const struct ndisc_ops *ndisc_ops; #endif +#ifdef CONFIG_XFRM + const struct xfrmdev_ops *xfrmdev_ops; +#endif + const struct header_ops *header_ops; unsigned int flags; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 54515d989365..17603bf190c1 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -120,6 +120,13 @@ struct xfrm_state_walk { struct xfrm_address_filter *filter; }; +struct xfrm_state_offload { + struct net_device *dev; + unsigned long offload_handle; + unsigned int num_exthdrs; + u8 flags; +}; + /* Full description of state of transformer.
*/ struct xfrm_state { possible_net_t xs_net; @@ -207,6 +214,8 @@ struct xfrm_state { struct xfrm_lifetime_cur curlft; struct tasklet_hrtimer mtimer; + struct xfrm_state_offload xso; + /* used to fix curlft->add_time when changing date */ long saved_tmo; @@ -1453,7 +1462,6 @@ struct xfrm6_tunnel { void xfrm_init(void); void xfrm4_init(void); int xfrm_state_init(struct net *net); -void xfrm_dev_init(void); void xfrm_state_fini(struct net *net); void xfrm4_state_init(void); void xfrm4_protocol_init(void); @@ -1559,6 +1567,7 @@ struct xfrmk_spdinfo { struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); int xfrm_state_delete(struct xfrm_state *x); int xfrm_state_flush(struct net *net, u8 proto, bool task_valid); +int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); @@ -1641,6 +1650,11 @@ static inline int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) } #endif +struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr, + int family); + struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp); void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type); @@ -1846,6 +1860,55 @@ static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb) } #endif +#ifdef CONFIG_XFRM_OFFLOAD +void __net_init xfrm_dev_init(void); +int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, + struct xfrm_user_offload *xuo); +bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); + +static inline void xfrm_dev_state_delete(struct xfrm_state *x) +{ + struct xfrm_state_offload *xso = &x->xso; + + if (xso->dev) + xso->dev->xfrmdev_ops->xdo_dev_state_delete(x); +} + +static inline void xfrm_dev_state_free(struct xfrm_state *x) +{ + struct xfrm_state_offload *xso = &x->xso; + struct net_device *dev = xso->dev; + + if (dev && dev->xfrmdev_ops) { + dev->xfrmdev_ops->xdo_dev_state_free(x); + xso->dev = NULL; + dev_put(dev); + } +} +#else +static inline void __net_init xfrm_dev_init(void) +{ +} + +static inline int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo) +{ + return 0; +} + +static inline void xfrm_dev_state_delete(struct xfrm_state *x) +{ +} + +static inline void xfrm_dev_state_free(struct xfrm_state *x) +{ +} + +static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) +{ + return false; +} +#endif + static inline int xfrm_mark_get(struct nlattr **attrs, struct xfrm_mark *m) { if (attrs[XFRMA_MARK]) diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 1fc62b239f1b..2b384ff09fa0 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -303,6 +303,7 @@ enum xfrm_attr_type_t { XFRMA_PROTO, /* __u8 */ XFRMA_ADDRESS_FILTER, /* struct xfrm_address_filter */ XFRMA_PAD, + XFRMA_OFFLOAD_DEV, /* struct xfrm_state_offload */ __XFRMA_MAX #define XFRMA_MAX (__XFRMA_MAX - 1) @@ -494,6 +495,13 @@ struct xfrm_address_filter { __u8 dplen; }; +struct xfrm_user_offload { + int ifindex; + __u8 flags; +}; +#define XFRM_OFFLOAD_IPV6 1 +#define XFRM_OFFLOAD_INBOUND 2 + #ifndef __KERNEL__ /* backwards compatibility for userspace */ #define XFRMGRP_ACQUIRE 1 diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index b1e24446e297..c6aba234b6e9 100644 --- a/net/ipv4/esp4.c +++ 
b/net/ipv4/esp4.c @@ -435,9 +435,6 @@ skip_cow2: aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv); aead_request_set_ad(req, assoclen); - seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low + - ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32)); - memset(iv, 0, ivlen); memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&seqno + 8 - min(ivlen, 8), min(ivlen, 8)); @@ -470,6 +467,7 @@ static int esp_input_done2(struct sk_buff *skb, int err) { const struct iphdr *iph; struct xfrm_state *x = xfrm_input_state(skb); + struct xfrm_offload *xo = xfrm_offload(skb); struct crypto_aead *aead = x->data; int alen = crypto_aead_authsize(aead); int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); @@ -478,7 +476,8 @@ static int esp_input_done2(struct sk_buff *skb, int err) u8 nexthdr[2]; int padlen; - kfree(ESP_SKB_CB(skb)->tmp); + if (!xo || (xo && !(xo->flags & CRYPTO_DONE))) + kfree(ESP_SKB_CB(skb)->tmp); if (unlikely(err)) goto out; diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 7ee6518afa86..94b8702603bc 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -29,7 +29,8 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) goto out; mtu = dst_mtu(skb_dst(skb)); - if (skb->len > mtu) { + if ((!skb_is_gso(skb) && skb->len > mtu) || + (skb_is_gso(skb) && skb_gso_network_seglen(skb) > ip_skb_dst_mtu(skb->sk, skb))) { skb->protocol = htons(ETH_P_IP); if (skb->sk) diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index ff54faa75631..3d3757d20d0a 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -450,6 +450,7 @@ error: static int esp_input_done2(struct sk_buff *skb, int err) { struct xfrm_state *x = xfrm_input_state(skb); + struct xfrm_offload *xo = xfrm_offload(skb); struct crypto_aead *aead = x->data; int alen = crypto_aead_authsize(aead); int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); @@ -458,7 +459,8 @@ static int esp_input_done2(struct sk_buff *skb, int err) int padlen; u8 nexthdr[2]; - kfree(ESP_SKB_CB(skb)->tmp); + if (!xo || (xo && !(xo->flags & CRYPTO_DONE))) + kfree(ESP_SKB_CB(skb)->tmp); if (unlikely(err)) goto out; diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 4d09ce6fa90e..8ae87d4ec5ff 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -73,11 +73,16 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) int mtu, ret = 0; struct dst_entry *dst = skb_dst(skb); + if (skb->ignore_df) + goto out; + mtu = dst_mtu(dst); if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if (!skb->ignore_df && skb->len > mtu) { + if ((!skb_is_gso(skb) && skb->len > mtu) || + (skb_is_gso(skb) && + skb_gso_network_seglen(skb) > ip6_skb_dst_mtu(skb))) { skb->dev = dst->dev; skb->protocol = htons(ETH_P_IPV6); @@ -89,7 +94,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ret = -EMSGSIZE; } - +out: return ret; } diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 55b2ac300995..abf81b329dc1 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -4,7 +4,8 @@ obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \ xfrm_input.o xfrm_output.o \ - xfrm_sysctl.o xfrm_replay.o xfrm_device.o + xfrm_sysctl.o xfrm_replay.o +obj-$(CONFIG_XFRM_OFFLOAD) += xfrm_device.o obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o obj-$(CONFIG_XFRM_USER) += xfrm_user.o diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 34a260a61be9..9bac2ba9052c 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -22,13 +22,149 
@@ #include #include +int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, + struct xfrm_user_offload *xuo) +{ + int err; + struct dst_entry *dst; + struct net_device *dev; + struct xfrm_state_offload *xso = &x->xso; + xfrm_address_t *saddr; + xfrm_address_t *daddr; + + if (!x->type_offload) + return 0; + + /* We don't yet support UDP encapsulation, TFC padding and ESN. */ + if (x->encap || x->tfcpad || (x->props.flags & XFRM_STATE_ESN)) + return 0; + + dev = dev_get_by_index(net, xuo->ifindex); + if (!dev) { + if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { + saddr = &x->props.saddr; + daddr = &x->id.daddr; + } else { + saddr = &x->id.daddr; + daddr = &x->props.saddr; + } + + dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr, x->props.family); + if (IS_ERR(dst)) + return 0; + + dev = dst->dev; + + dev_hold(dev); + dst_release(dst); + } + + if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) { + dev_put(dev); + return 0; + } + + xso->dev = dev; + xso->num_exthdrs = 1; + xso->flags = xuo->flags; + + err = dev->xfrmdev_ops->xdo_dev_state_add(x); + if (err) { + dev_put(dev); + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(xfrm_dev_state_add); + +bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) +{ + int mtu; + struct dst_entry *dst = skb_dst(skb); + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + struct net_device *dev = x->xso.dev; + + if (!x->type_offload || x->encap) + return false; + + if ((x->xso.offload_handle && (dev == dst->path->dev)) && + !dst->child->xfrm && x->type->get_mtu) { + mtu = x->type->get_mtu(x, xdst->child_mtu_cached); + + if (skb->len <= mtu) + goto ok; + + if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + goto ok; + } + + return false; + +ok: + if (dev && dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_offload_ok) + return x->xso.dev->xfrmdev_ops->xdo_dev_offload_ok(skb, x); + + return true; +} +EXPORT_SYMBOL_GPL(xfrm_dev_offload_ok); + +int xfrm_dev_register(struct net_device *dev) +{ + if ((dev->features & NETIF_F_HW_ESP) && !dev->xfrmdev_ops) + return NOTIFY_BAD; + if ((dev->features & NETIF_F_HW_ESP_TX_CSUM) && + !(dev->features & NETIF_F_HW_ESP)) + return NOTIFY_BAD; + + return NOTIFY_DONE; +} + +static int xfrm_dev_unregister(struct net_device *dev) +{ + return NOTIFY_DONE; +} + +static int xfrm_dev_feat_change(struct net_device *dev) +{ + if ((dev->features & NETIF_F_HW_ESP) && !dev->xfrmdev_ops) + return NOTIFY_BAD; + else if (!(dev->features & NETIF_F_HW_ESP)) + dev->xfrmdev_ops = NULL; + + if ((dev->features & NETIF_F_HW_ESP_TX_CSUM) && + !(dev->features & NETIF_F_HW_ESP)) + return NOTIFY_BAD; + + return NOTIFY_DONE; +} + +static int xfrm_dev_down(struct net_device *dev) +{ + if (dev->hw_features & NETIF_F_HW_ESP) + xfrm_dev_state_flush(dev_net(dev), dev, true); + + xfrm_garbage_collect(dev_net(dev)); + + return NOTIFY_DONE; +} + static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { + case NETDEV_REGISTER: + return xfrm_dev_register(dev); + + case NETDEV_UNREGISTER: + return xfrm_dev_unregister(dev); + + case NETDEV_FEAT_CHANGE: + return xfrm_dev_feat_change(dev); + case NETDEV_DOWN: - xfrm_garbage_collect(dev_net(dev)); + return xfrm_dev_down(dev); } return NOTIFY_DONE; } diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 46bdb4fbed0b..362d655eac27 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -107,6 +107,8 @@ struct sec_path *secpath_dup(struct sec_path *src) sp->len = 
0; sp->olen = 0; + memset(sp->ovec, 0, sizeof(sp->ovec[XFRM_MAX_OFFLOAD_DEPTH])); + if (src) { int i; @@ -207,8 +209,9 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) unsigned int family; int decaps = 0; int async = 0; - struct xfrm_offload *xo; bool xfrm_gro = false; + bool crypto_done = false; + struct xfrm_offload *xo = xfrm_offload(skb); if (encap_type < 0) { x = xfrm_input_state(skb); @@ -226,6 +229,37 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto lock; } + if (xo && (xo->flags & CRYPTO_DONE)) { + crypto_done = true; + x = xfrm_input_state(skb); + family = XFRM_SPI_SKB_CB(skb)->family; + + if (!(xo->status & CRYPTO_SUCCESS)) { + if (xo->status & + (CRYPTO_TRANSPORT_AH_AUTH_FAILED | + CRYPTO_TRANSPORT_ESP_AUTH_FAILED | + CRYPTO_TUNNEL_AH_AUTH_FAILED | + CRYPTO_TUNNEL_ESP_AUTH_FAILED)) { + + xfrm_audit_state_icvfail(x, skb, + x->type->proto); + x->stats.integrity_failed++; + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR); + goto drop; + } + + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); + goto drop; + } + + if ((err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); + goto drop; + } + + goto lock; + } + daddr = (xfrm_address_t *)(skb_network_header(skb) + XFRM_SPI_SKB_CB(skb)->daddroff); family = XFRM_SPI_SKB_CB(skb)->family; @@ -311,7 +345,10 @@ lock: skb_dst_force(skb); dev_hold(skb->dev); - nexthdr = x->type->input(x, skb); + if (crypto_done) + nexthdr = x->type_offload->input_tail(x, skb); + else + nexthdr = x->type->input(x, skb); if (nexthdr == -EINPROGRESS) return 0; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 8ba29fe58352..a15088613a6c 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -99,12 +99,13 @@ static int xfrm_output_one(struct sk_buff *skb, int err) skb_dst_force(skb); - /* Inner headers are invalid now. 
*/ - skb->encapsulation = 0; - - err = x->type->output(x, skb); - if (err == -EINPROGRESS) - goto out; + if (xfrm_offload(skb)) { + x->type_offload->encap(x, skb); + } else { + err = x->type->output(x, skb); + if (err == -EINPROGRESS) + goto out; + } resume: if (err) { @@ -200,8 +201,38 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb int xfrm_output(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb_dst(skb)->dev); + struct xfrm_state *x = skb_dst(skb)->xfrm; int err; + secpath_reset(skb); + + if (xfrm_dev_offload_ok(skb, x)) { + struct sec_path *sp; + + sp = secpath_dup(skb->sp); + if (!sp) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); + kfree_skb(skb); + return -ENOMEM; + } + if (skb->sp) + secpath_put(skb->sp); + skb->sp = sp; + + sp->olen++; + sp->xvec[skb->sp->len++] = x; + xfrm_state_hold(x); + + if (skb_is_gso(skb)) { + skb_shinfo(skb)->gso_type |= SKB_GSO_ESP; + + return xfrm_output2(net, sk, skb); + } + + if (x->xso.dev && x->xso.dev->features & NETIF_F_HW_ESP_TX_CSUM) + goto out; + } + if (skb_is_gso(skb)) return xfrm_output_gso(net, sk, skb); @@ -214,6 +245,7 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) } } +out: return xfrm_output2(net, sk, skb); } EXPORT_SYMBOL_GPL(xfrm_output); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7befca2a0773..dd44ddc1aea5 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -116,11 +116,10 @@ static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short fa return afinfo; } -static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, - int tos, int oif, - const xfrm_address_t *saddr, - const xfrm_address_t *daddr, - int family) +struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr, + int family) { const struct xfrm_policy_afinfo *afinfo; struct dst_entry *dst; @@ -135,6 +134,7 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, return dst; } +EXPORT_SYMBOL(__xfrm_dst_lookup); static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos, int oif, diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 47fefe97d1e3..fc3c5aa38754 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -440,6 +440,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) x->type->destructor(x); xfrm_put_type(x->type); } + xfrm_dev_state_free(x); security_xfrm_state_free(x); kfree(x); } @@ -609,6 +610,8 @@ int __xfrm_state_delete(struct xfrm_state *x) net->xfrm.state_num--; spin_unlock(&net->xfrm.xfrm_state_lock); + xfrm_dev_state_delete(x); + /* All xfrm_state objects are created by xfrm_state_alloc. * The xfrm_state_alloc call gives a reference, and that * is what we are dropping here. 
@@ -653,12 +656,41 @@ xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid) return err; } + +static inline int +xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid) +{ + int i, err = 0; + + for (i = 0; i <= net->xfrm.state_hmask; i++) { + struct xfrm_state *x; + struct xfrm_state_offload *xso; + + hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { + xso = &x->xso; + + if (xso->dev == dev && + (err = security_xfrm_state_delete(x)) != 0) { + xfrm_audit_state_delete(x, 0, task_valid); + return err; + } + } + } + + return err; +} #else static inline int xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid) { return 0; } + +static inline int +xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid) +{ + return 0; +} #endif int xfrm_state_flush(struct net *net, u8 proto, bool task_valid) @@ -701,6 +733,48 @@ out: } EXPORT_SYMBOL(xfrm_state_flush); +int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid) +{ + int i, err = 0, cnt = 0; + + spin_lock_bh(&net->xfrm.xfrm_state_lock); + err = xfrm_dev_state_flush_secctx_check(net, dev, task_valid); + if (err) + goto out; + + err = -ESRCH; + for (i = 0; i <= net->xfrm.state_hmask; i++) { + struct xfrm_state *x; + struct xfrm_state_offload *xso; +restart: + hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { + xso = &x->xso; + + if (!xfrm_state_kern(x) && xso->dev == dev) { + xfrm_state_hold(x); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + + err = xfrm_state_delete(x); + xfrm_audit_state_delete(x, err ? 0 : 1, + task_valid); + xfrm_state_put(x); + if (!err) + cnt++; + + spin_lock_bh(&net->xfrm.xfrm_state_lock); + goto restart; + } + } + } + if (cnt) + err = 0; + +out: + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + return err; +} +EXPORT_SYMBOL(xfrm_dev_state_flush); + void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si) { spin_lock_bh(&net->xfrm.xfrm_state_lock); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 4f7e62ddc17e..de3332e3f9e2 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -595,6 +595,10 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, goto error; } + if (attrs[XFRMA_OFFLOAD_DEV] && + xfrm_dev_state_add(net, x, nla_data(attrs[XFRMA_OFFLOAD_DEV]))) + goto error; + if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn, attrs[XFRMA_REPLAY_ESN_VAL]))) goto error; @@ -779,6 +783,23 @@ static int copy_sec_ctx(struct xfrm_sec_ctx *s, struct sk_buff *skb) return 0; } +static int copy_user_offload(struct xfrm_state_offload *xso, struct sk_buff *skb) +{ + struct xfrm_user_offload *xuo; + struct nlattr *attr; + + attr = nla_reserve(skb, XFRMA_OFFLOAD_DEV, sizeof(*xuo)); + if (attr == NULL) + return -EMSGSIZE; + + xuo = nla_data(attr); + + xuo->ifindex = xso->dev->ifindex; + xuo->flags = xso->flags; + + return 0; +} + static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) { struct xfrm_algo *algo; @@ -869,6 +890,10 @@ static int copy_to_user_state_extra(struct xfrm_state *x, &x->replay); if (ret) goto out; + if(x->xso.dev) + ret = copy_user_offload(&x->xso, skb); + if (ret) + goto out; if (x->security) ret = copy_sec_ctx(x->security, skb); out: @@ -2406,6 +2431,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_SA_EXTRA_FLAGS] = { .type = NLA_U32 }, [XFRMA_PROTO] = { .type = NLA_U8 }, [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, + [XFRMA_OFFLOAD_DEV] = 
{ .len = sizeof(struct xfrm_user_offload) }, }; static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { @@ -2622,6 +2648,8 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x) l += nla_total_size(sizeof(*x->coaddr)); if (x->props.extra_flags) l += nla_total_size(sizeof(x->props.extra_flags)); + if (x->xso.dev) + l += nla_total_size(sizeof(x->xso)); /* Must count x->lastused as it may become non-zero behind our back. */ l += nla_total_size_64bit(sizeof(u64)); -- cgit v1.2.3 From 4adec7da0536a345d901d7ba55b6c93a14eeeaff Mon Sep 17 00:00:00 2001 From: Benjamin Gaignard Date: Tue, 4 Apr 2017 09:47:51 +0200 Subject: iio: stm32 trigger: Add quadrature encoder device One of the features of the STM32 trigger hardware block is a quadrature encoder that can count up/down depending on the levels and edges of the selected external pins. This patch allows reading/writing the counter, getting its direction, setting/getting the quadrature modes and getting the scale factor. When counting up, the preset value is the limit of the counter. When counting down, the counter starts from the preset value and counts down to 0. This preset value can be set/get by using the /sys/bus/iio/devices/iio:deviceX/in_count0_preset attribute. Signed-off-by: Benjamin Gaignard Reviewed-by: William Breathitt Gray Signed-off-by: Jonathan Cameron --- .../ABI/testing/sysfs-bus-iio-timer-stm32 | 46 +++- drivers/iio/trigger/stm32-timer-trigger.c | 244 ++++++++++++++++++++- include/linux/mfd/stm32-timers.h | 2 + 3 files changed, 282 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32 b/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32 index 6534a60037ff..a55ad44446c5 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32 +++ b/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32 @@ -3,11 +3,15 @@ KernelVersion: 4.11 Contact: benjamin.gaignard@st.com Description: Reading returns the list possible master modes which are: - - "reset" : The UG bit from the TIMx_EGR register is used as trigger output (TRGO). - - "enable" : The Counter Enable signal CNT_EN is used as trigger output. + - "reset" : The UG bit from the TIMx_EGR register is + used as trigger output (TRGO). + - "enable" : The Counter Enable signal CNT_EN is used + as trigger output. - "update" : The update event is selected as trigger output. - For instance a master timer can then be used as a prescaler for a slave timer. - - "compare_pulse" : The trigger output send a positive pulse when the CC1IF flag is to be set. + For instance a master timer can then be used + as a prescaler for a slave timer. + - "compare_pulse" : The trigger output send a positive pulse + when the CC1IF flag is to be set. - "OC1REF" : OC1REF signal is used as trigger output. - "OC2REF" : OC2REF signal is used as trigger output. - "OC3REF" : OC3REF signal is used as trigger output. @@ -27,3 +31,37 @@ Description: Reading returns the current sampling frequency. Writing an value different of 0 set and start sampling. Writing 0 stop sampling. + +What: /sys/bus/iio/devices/iio:deviceX/in_count0_preset +KernelVersion: 4.12 +Contact: benjamin.gaignard@st.com +Description: + Reading returns the current preset value. + Writing sets the preset value. + When counting up the counter starts from 0 and fires an + event when it reaches the preset value. + When counting down the counter starts from the preset value + and fires an event when it reaches 0.
+ +What: /sys/bus/iio/devices/iio:deviceX/in_count_quadrature_mode_available +KernelVersion: 4.12 +Contact: benjamin.gaignard@st.com +Description: + Reading returns the list of possible quadrature modes. + +What: /sys/bus/iio/devices/iio:deviceX/in_count0_quadrature_mode +KernelVersion: 4.12 +Contact: benjamin.gaignard@st.com +Description: + Configure the device counter quadrature modes: + channel_A: + Encoder A input serves as the count input and B as + the UP/DOWN direction control input. + + channel_B: + Encoder B input serves as the count input and A as + the UP/DOWN direction control input. + + quadrature: + Encoder A and B inputs are mixed to get direction + and count with a scale of 0.25. diff --git a/drivers/iio/trigger/stm32-timer-trigger.c b/drivers/iio/trigger/stm32-timer-trigger.c index 994b96d19750..7db904cae926 100644 --- a/drivers/iio/trigger/stm32-timer-trigger.c +++ b/drivers/iio/trigger/stm32-timer-trigger.c @@ -15,6 +15,7 @@ #include #define MAX_TRIGGERS 6 +#define MAX_VALIDS 5 /* List the triggers created by each timer */ static const void *triggers_table[][MAX_TRIGGERS] = { @@ -32,12 +33,29 @@ static const void *triggers_table[][MAX_TRIGGERS] = { { TIM12_TRGO, TIM12_CH1, TIM12_CH2,}, }; +/* List the triggers accepted by each timer */ +static const void *valids_table[][MAX_VALIDS] = { + { TIM5_TRGO, TIM2_TRGO, TIM3_TRGO, TIM4_TRGO,}, + { TIM1_TRGO, TIM8_TRGO, TIM3_TRGO, TIM4_TRGO,}, + { TIM1_TRGO, TIM2_TRGO, TIM5_TRGO, TIM4_TRGO,}, + { TIM1_TRGO, TIM2_TRGO, TIM3_TRGO, TIM8_TRGO,}, + { TIM2_TRGO, TIM3_TRGO, TIM4_TRGO, TIM8_TRGO,}, + { }, /* timer 6 */ + { }, /* timer 7 */ + { TIM1_TRGO, TIM2_TRGO, TIM4_TRGO, TIM5_TRGO,}, + { TIM2_TRGO, TIM3_TRGO,}, + { }, /* timer 10 */ + { }, /* timer 11 */ + { TIM4_TRGO, TIM5_TRGO,}, +}; + struct stm32_timer_trigger { struct device *dev; struct regmap *regmap; struct clk *clk; u32 max_arr; const void *triggers; + const void *valids; }; static int stm32_timer_start(struct stm32_timer_trigger *priv, @@ -180,8 +198,7 @@ static ssize_t stm32_tt_show_master_mode(struct device *dev, struct device_attribute *attr, char *buf) { - struct iio_dev *indio_dev = dev_to_iio_dev(dev); - struct stm32_timer_trigger *priv = iio_priv(indio_dev); + struct stm32_timer_trigger *priv = dev_get_drvdata(dev); u32 cr2; regmap_read(priv->regmap, TIM_CR2, &cr2); @@ -194,8 +211,7 @@ static ssize_t stm32_tt_store_master_mode(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { - struct iio_dev *indio_dev = dev_to_iio_dev(dev); - struct stm32_timer_trigger *priv = iio_priv(indio_dev); + struct stm32_timer_trigger *priv = dev_get_drvdata(dev); int i; for (i = 0; i < ARRAY_SIZE(master_mode_table); i++) { @@ -275,6 +291,216 @@ static int stm32_setup_iio_triggers(struct stm32_timer_trigger *priv) return 0; } +static int stm32_counter_read_raw(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, + int *val, int *val2, long mask) +{ + struct stm32_timer_trigger *priv = iio_priv(indio_dev); + + switch (mask) { + case IIO_CHAN_INFO_RAW: + { + u32 cnt; + + regmap_read(priv->regmap, TIM_CNT, &cnt); + *val = cnt; + + return IIO_VAL_INT; + } + case IIO_CHAN_INFO_SCALE: + { + u32 smcr; + + regmap_read(priv->regmap, TIM_SMCR, &smcr); + smcr &= TIM_SMCR_SMS; + + *val = 1; + *val2 = 0; + + /* in quadrature case scale = 0.25 */ + if (smcr == 3) + *val2 = 2; + + return IIO_VAL_FRACTIONAL_LOG2; + } + } + + return -EINVAL; +} + +static int stm32_counter_write_raw(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, + int val, int val2,
long mask) +{ + struct stm32_timer_trigger *priv = iio_priv(indio_dev); + + switch (mask) { + case IIO_CHAN_INFO_RAW: + regmap_write(priv->regmap, TIM_CNT, val); + + return IIO_VAL_INT; + case IIO_CHAN_INFO_SCALE: + /* fixed scale */ + return -EINVAL; + } + + return -EINVAL; +} + +static const struct iio_info stm32_trigger_info = { + .driver_module = THIS_MODULE, + .read_raw = stm32_counter_read_raw, + .write_raw = stm32_counter_write_raw +}; + +static const char *const stm32_quadrature_modes[] = { + "channel_A", + "channel_B", + "quadrature", +}; + +static int stm32_set_quadrature_mode(struct iio_dev *indio_dev, + const struct iio_chan_spec *chan, + unsigned int mode) +{ + struct stm32_timer_trigger *priv = iio_priv(indio_dev); + + regmap_update_bits(priv->regmap, TIM_SMCR, TIM_SMCR_SMS, mode + 1); + + return 0; +} + +static int stm32_get_quadrature_mode(struct iio_dev *indio_dev, + const struct iio_chan_spec *chan) +{ + struct stm32_timer_trigger *priv = iio_priv(indio_dev); + u32 smcr; + + regmap_read(priv->regmap, TIM_SMCR, &smcr); + smcr &= TIM_SMCR_SMS; + + return smcr - 1; +} + +static const struct iio_enum stm32_quadrature_mode_enum = { + .items = stm32_quadrature_modes, + .num_items = ARRAY_SIZE(stm32_quadrature_modes), + .set = stm32_set_quadrature_mode, + .get = stm32_get_quadrature_mode +}; + +static const char *const stm32_count_direction_states[] = { + "up", + "down" +}; + +static int stm32_set_count_direction(struct iio_dev *indio_dev, + const struct iio_chan_spec *chan, + unsigned int mode) +{ + struct stm32_timer_trigger *priv = iio_priv(indio_dev); + + regmap_update_bits(priv->regmap, TIM_CR1, TIM_CR1_DIR, mode); + + return 0; +} + +static int stm32_get_count_direction(struct iio_dev *indio_dev, + const struct iio_chan_spec *chan) +{ + struct stm32_timer_trigger *priv = iio_priv(indio_dev); + u32 cr1; + + regmap_read(priv->regmap, TIM_CR1, &cr1); + + return (cr1 & TIM_CR1_DIR); +} + +static const struct iio_enum stm32_count_direction_enum = { + .items = stm32_count_direction_states, + .num_items = ARRAY_SIZE(stm32_count_direction_states), + .set = stm32_set_count_direction, + .get = stm32_get_count_direction +}; + +static ssize_t stm32_count_get_preset(struct iio_dev *indio_dev, + uintptr_t private, + const struct iio_chan_spec *chan, + char *buf) +{ + struct stm32_timer_trigger *priv = iio_priv(indio_dev); + u32 arr; + + regmap_read(priv->regmap, TIM_ARR, &arr); + + return snprintf(buf, PAGE_SIZE, "%u\n", arr); +} + +static ssize_t stm32_count_set_preset(struct iio_dev *indio_dev, + uintptr_t private, + const struct iio_chan_spec *chan, + const char *buf, size_t len) +{ + struct stm32_timer_trigger *priv = iio_priv(indio_dev); + unsigned int preset; + int ret; + + ret = kstrtouint(buf, 0, &preset); + if (ret) + return ret; + + regmap_write(priv->regmap, TIM_ARR, preset); + regmap_update_bits(priv->regmap, TIM_CR1, TIM_CR1_ARPE, TIM_CR1_ARPE); + + return len; +} + +static const struct iio_chan_spec_ext_info stm32_trigger_count_info[] = { + { + .name = "preset", + .shared = IIO_SEPARATE, + .read = stm32_count_get_preset, + .write = stm32_count_set_preset + }, + IIO_ENUM("count_direction", IIO_SEPARATE, &stm32_count_direction_enum), + IIO_ENUM_AVAILABLE("count_direction", &stm32_count_direction_enum), + IIO_ENUM("quadrature_mode", IIO_SEPARATE, &stm32_quadrature_mode_enum), + IIO_ENUM_AVAILABLE("quadrature_mode", &stm32_quadrature_mode_enum), + {} +}; + +static const struct iio_chan_spec stm32_trigger_channel = { + .type = IIO_COUNT, + .channel = 0, + .info_mask_separate = 
BIT(IIO_CHAN_INFO_RAW) | BIT(IIO_CHAN_INFO_SCALE), .ext_info = stm32_trigger_count_info, .indexed = 1 +}; + +static struct stm32_timer_trigger *stm32_setup_counter_device(struct device *dev) +{ + struct iio_dev *indio_dev; + int ret; + + indio_dev = devm_iio_device_alloc(dev, + sizeof(struct stm32_timer_trigger)); + if (!indio_dev) + return NULL; + + indio_dev->name = dev_name(dev); + indio_dev->dev.parent = dev; + indio_dev->info = &stm32_trigger_info; + indio_dev->num_channels = 1; + indio_dev->channels = &stm32_trigger_channel; + indio_dev->dev.of_node = dev->of_node; + + ret = devm_iio_device_register(dev, indio_dev); + if (ret) + return NULL; + + return iio_priv(indio_dev); +} + /** * is_stm32_timer_trigger * @trig: trigger to be checked @@ -299,10 +525,15 @@ static int stm32_timer_trigger_probe(struct platform_device *pdev) if (of_property_read_u32(dev->of_node, "reg", &index)) return -EINVAL; - if (index >= ARRAY_SIZE(triggers_table)) + if (index >= ARRAY_SIZE(triggers_table) || + index >= ARRAY_SIZE(valids_table)) return -EINVAL; - priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + /* Create an IIO device only if we have triggers to be validated */ + if (*valids_table[index]) + priv = stm32_setup_counter_device(dev); + else + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; @@ -312,6 +543,7 @@ static int stm32_timer_trigger_probe(struct platform_device *pdev) priv->clk = ddata->clk; priv->max_arr = ddata->max_arr; priv->triggers = triggers_table[index]; + priv->valids = valids_table[index]; ret = stm32_setup_iio_triggers(priv); if (ret) diff --git a/include/linux/mfd/stm32-timers.h b/include/linux/mfd/stm32-timers.h index d0300045f04a..4a0abbc10ef6 100644 --- a/include/linux/mfd/stm32-timers.h +++ b/include/linux/mfd/stm32-timers.h @@ -21,6 +21,7 @@ #define TIM_CCMR1 0x18 /* Capt/Comp 1 Mode Reg */ #define TIM_CCMR2 0x1C /* Capt/Comp 2 Mode Reg */ #define TIM_CCER 0x20 /* Capt/Comp Enable Reg */ +#define TIM_CNT 0x24 /* Counter */ #define TIM_PSC 0x28 /* Prescaler */ #define TIM_ARR 0x2c /* Auto-Reload Register */ #define TIM_CCR1 0x34 /* Capt/Comp Register 1 */ @@ -30,6 +31,7 @@ #define TIM_BDTR 0x44 /* Break and Dead-Time Reg */ #define TIM_CR1_CEN BIT(0) /* Counter Enable */ +#define TIM_CR1_DIR BIT(4) /* Counter Direction */ #define TIM_CR1_ARPE BIT(7) /* Auto-reload Preload Ena */ #define TIM_CR2_MMS (BIT(4) | BIT(5) | BIT(6)) /* Master mode selection */ #define TIM_SMCR_SMS (BIT(0) | BIT(1) | BIT(2)) /* Slave mode selection */ -- cgit v1.2.3 From f7e30f01a9e221067bb4b579e3cfc25cd2617467 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 12 Apr 2017 11:20:29 -0700 Subject: cpumask: Add helper cpumask_available() With CONFIG_CPUMASK_OFFSTACK=y cpumask_var_t is a struct cpumask pointer, otherwise a struct cpumask array with a single element. Some code dealing with cpumasks needs to validate that a cpumask_var_t is not a NULL pointer when CONFIG_CPUMASK_OFFSTACK=y. This is typically done by performing the check always, regardless of the underlying type of cpumask_var_t. This works in both cases; however, clang raises a warning like this when CONFIG_CPUMASK_OFFSTACK=n: kernel/irq/manage.c:839:28: error: address of array 'desc->irq_common_data.affinity' will always evaluate to 'true' [-Werror,-Wpointer-bool-conversion] Add the inline helper cpumask_available() which only performs the pointer check if CONFIG_CPUMASK_OFFSTACK=y.
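For illustration, a call site would typically be converted like this (hypothetical driver code, not part of this patch; only the cpumask_available() helper itself is added here):

	#include <linux/cpumask.h>

	static cpumask_var_t my_mask;	/* hypothetical driver state */

	static void my_mark_cpu(unsigned int cpu)
	{
		/*
		 * Was "if (!my_mask)", which is always false with
		 * CONFIG_CPUMASK_OFFSTACK=n and makes clang warn.
		 */
		if (!cpumask_available(my_mask))
			return;
		cpumask_set_cpu(cpu, my_mask);
	}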
Signed-off-by: Matthias Kaehlcke Cc: Grant Grundler Cc: Rusty Russell Cc: Greg Hackmann Cc: Michael Davidson Cc: Andrew Morton Link: http://lkml.kernel.org/r/20170412182030.83657-1-mka@chromium.org Signed-off-by: Thomas Gleixner --- include/linux/cpumask.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 96f1e88b767c..1a675604b17d 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -667,6 +667,11 @@ void alloc_bootmem_cpumask_var(cpumask_var_t *mask); void free_cpumask_var(cpumask_var_t mask); void free_bootmem_cpumask_var(cpumask_var_t mask); +static inline bool cpumask_available(cpumask_var_t mask) +{ + return mask != NULL; +} + #else typedef struct cpumask cpumask_var_t[1]; @@ -708,6 +713,11 @@ static inline void free_cpumask_var(cpumask_var_t mask) static inline void free_bootmem_cpumask_var(cpumask_var_t mask) { } + +static inline bool cpumask_available(cpumask_var_t mask) +{ + return true; +} #endif /* CONFIG_CPUMASK_OFFSTACK */ /* It's common to want to use cpu_all_mask in struct member initializers, -- cgit v1.2.3 From 2ac00f17b2e110c67ed2af3713bc04aec62e4608 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sun, 26 Mar 2017 12:04:12 -0700 Subject: time: Delete do_sys_settimeofday() struct timespec is not y2038 safe on 32 bit machines and needs to be replaced with struct timespec64. do_sys_settimeofday() is just a wrapper function. Replace all calls to this function with direct calls to do_sys_settimeofday64() instead and delete do_sys_settimeofday(). Signed-off-by: Deepa Dinamani Cc: y2038@lists.linaro.org Cc: john.stultz@linaro.org Cc: arnd@arndb.de Cc: linux-alpha@vger.kernel.org Link: http://lkml.kernel.org/r/1490555058-4603-2-git-send-email-deepa.kernel@gmail.com Signed-off-by: Thomas Gleixner --- arch/alpha/kernel/osf_sys.c | 4 +++- include/linux/timekeeping.h | 15 --------------- kernel/compat.c | 4 ++-- kernel/time/posix-stubs.c | 5 ++++- kernel/time/posix-timers.c | 5 ++++- kernel/time/time.c | 4 ++-- 6 files changed, 15 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 0b961093ca5c..9de47a9c6df2 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1016,6 +1016,7 @@ SYSCALL_DEFINE2(osf_gettimeofday, struct timeval32 __user *, tv, SYSCALL_DEFINE2(osf_settimeofday, struct timeval32 __user *, tv, struct timezone __user *, tz) { + struct timespec64 kts64; struct timespec kts; struct timezone ktz; @@ -1023,13 +1024,14 @@ SYSCALL_DEFINE2(osf_settimeofday, struct timeval32 __user *, tv, if (get_tv32((struct timeval *)&kts, tv)) return -EFAULT; kts.tv_nsec *= 1000; + kts64 = timespec_to_timespec64(kts); } if (tz) { if (copy_from_user(&ktz, tz, sizeof(*tz))) return -EFAULT; } - return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); + return do_sys_settimeofday64(tv ? &kts64 : NULL, tz ?
&ktz : NULL); } asmlinkage long sys_ni_posix_timers(void); diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index b598cbc7b576..3617a78897bb 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -19,21 +19,6 @@ extern void do_gettimeofday(struct timeval *tv); extern int do_settimeofday64(const struct timespec64 *ts); extern int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz); -static inline int do_sys_settimeofday(const struct timespec *tv, - const struct timezone *tz) -{ - struct timespec64 ts64; - - if (!tv) - return do_sys_settimeofday64(NULL, tz); - - if (!timespec_valid(tv)) - return -EINVAL; - - ts64 = timespec_to_timespec64(*tv); - return do_sys_settimeofday64(&ts64, tz); -} - /* * Kernel time accessors */ diff --git a/kernel/compat.c b/kernel/compat.c index 19aec5d98108..1eb9e8aad869 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -108,8 +108,8 @@ COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, struct timezone __user *, tz) { + struct timespec64 new_ts; struct timeval user_tv; - struct timespec new_ts; struct timezone new_tz; if (tv) { @@ -123,7 +123,7 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, return -EFAULT; } - return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); + return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index cd6716e115e8..95a1b1fc3968 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -49,13 +49,16 @@ SYS_NI(alarm); SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { + struct timespec64 new_tp64; struct timespec new_tp; if (which_clock != CLOCK_REALTIME) return -EINVAL; if (copy_from_user(&new_tp, tp, sizeof (*tp))) return -EFAULT; - return do_sys_settimeofday(&new_tp, NULL); + + new_tp64 = timespec_to_timespec64(new_tp); + return do_sys_settimeofday64(&new_tp64, NULL); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 50a6a47020de..f215ef792772 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -214,7 +214,10 @@ static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) static int posix_clock_realtime_set(const clockid_t which_clock, const struct timespec *tp) { - return do_sys_settimeofday(tp, NULL); + struct timespec64 tp64; + + tp64 = timespec_to_timespec64(*tp); + return do_sys_settimeofday64(&tp64, NULL); } static int posix_clock_realtime_adj(const clockid_t which_clock, diff --git a/kernel/time/time.c b/kernel/time/time.c index 25bdd2504571..6574bba44b55 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -193,8 +193,8 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, struct timezone __user *, tz) { + struct timespec64 new_ts; struct timeval user_tv; - struct timespec new_ts; struct timezone new_tz; if (tv) { @@ -212,7 +212,7 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, return -EFAULT; } - return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); + return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? 
&new_tz : NULL); } SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) -- cgit v1.2.3 From d340266e19ddb70dbd608f9deedcfb35fdb9d419 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sun, 26 Mar 2017 12:04:13 -0700 Subject: time: Change posix clocks ops interfaces to use timespec64 struct timespec is not y2038 safe on 32 bit machines. The posix clock APIs use struct timespec directly and through struct itimerspec. Convert the posix clock interfaces to use struct timespec64 and struct itimerspec64 instead. Also fix up their implementations accordingly. Note that the clock_getres() interface has also been changed to use timespec64 even though this particular interface is not affected by the y2038 problem. This helps verification for internal kernel code for y2038 readiness by getting rid of time_t/ timeval/ timespec. Signed-off-by: Deepa Dinamani Cc: arnd@arndb.de Cc: y2038@lists.linaro.org Cc: netdev@vger.kernel.org Cc: Richard Cochran Cc: john.stultz@linaro.org Link: http://lkml.kernel.org/r/1490555058-4603-3-git-send-email-deepa.kernel@gmail.com Signed-off-by: Thomas Gleixner --- drivers/ptp/ptp_clock.c | 18 +++++++----------- include/linux/posix-clock.h | 10 +++++----- kernel/time/posix-clock.c | 34 ++++++++++++++++++++++++---------- 3 files changed, 36 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index e8142803a1a7..b77435783ef3 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -97,30 +97,26 @@ static s32 scaled_ppm_to_ppb(long ppm) /* posix clock implementation */ -static int ptp_clock_getres(struct posix_clock *pc, struct timespec *tp) +static int ptp_clock_getres(struct posix_clock *pc, struct timespec64 *tp) { tp->tv_sec = 0; tp->tv_nsec = 1; return 0; } -static int ptp_clock_settime(struct posix_clock *pc, const struct timespec *tp) +static int ptp_clock_settime(struct posix_clock *pc, const struct timespec64 *tp) { struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock); - struct timespec64 ts = timespec_to_timespec64(*tp); - return ptp->info->settime64(ptp->info, &ts); + return ptp->info->settime64(ptp->info, tp); } -static int ptp_clock_gettime(struct posix_clock *pc, struct timespec *tp) +static int ptp_clock_gettime(struct posix_clock *pc, struct timespec64 *tp) { struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock); - struct timespec64 ts; int err; - err = ptp->info->gettime64(ptp->info, &ts); - if (!err) - *tp = timespec64_to_timespec(ts); + err = ptp->info->gettime64(ptp->info, tp); return err; } @@ -133,7 +129,7 @@ static int ptp_clock_adjtime(struct posix_clock *pc, struct timex *tx) ops = ptp->info; if (tx->modes & ADJ_SETOFFSET) { - struct timespec ts; + struct timespec64 ts; ktime_t kt; s64 delta; @@ -146,7 +142,7 @@ static int ptp_clock_adjtime(struct posix_clock *pc, struct timex *tx) if ((unsigned long) ts.tv_nsec >= NSEC_PER_SEC) return -EINVAL; - kt = timespec_to_ktime(ts); + kt = timespec64_to_ktime(ts); delta = ktime_to_ns(kt); err = ops->adjtime(ops, delta); } else if (tx->modes & ADJ_FREQUENCY) { diff --git a/include/linux/posix-clock.h b/include/linux/posix-clock.h index 34c4498b800f..83b22ae9ae12 100644 --- a/include/linux/posix-clock.h +++ b/include/linux/posix-clock.h @@ -59,23 +59,23 @@ struct posix_clock_operations { int (*clock_adjtime)(struct posix_clock *pc, struct timex *tx); - int (*clock_gettime)(struct posix_clock *pc, struct timespec *ts); + int (*clock_gettime)(struct posix_clock *pc, struct timespec64 *ts); - int
(*clock_getres) (struct posix_clock *pc, struct timespec *ts); + int (*clock_getres) (struct posix_clock *pc, struct timespec64 *ts); int (*clock_settime)(struct posix_clock *pc, - const struct timespec *ts); + const struct timespec64 *ts); int (*timer_create) (struct posix_clock *pc, struct k_itimer *kit); int (*timer_delete) (struct posix_clock *pc, struct k_itimer *kit); void (*timer_gettime)(struct posix_clock *pc, - struct k_itimer *kit, struct itimerspec *tsp); + struct k_itimer *kit, struct itimerspec64 *tsp); int (*timer_settime)(struct posix_clock *pc, struct k_itimer *kit, int flags, - struct itimerspec *tsp, struct itimerspec *old); + struct itimerspec64 *tsp, struct itimerspec64 *old); /* * Optional character device methods: */ diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 9cff0ab82b63..e24008c098c6 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -300,14 +300,17 @@ out: static int pc_clock_gettime(clockid_t id, struct timespec *ts) { struct posix_clock_desc cd; + struct timespec64 ts64; int err; err = get_clock_desc(id, &cd); if (err) return err; - if (cd.clk->ops.clock_gettime) - err = cd.clk->ops.clock_gettime(cd.clk, ts); + if (cd.clk->ops.clock_gettime) { + err = cd.clk->ops.clock_gettime(cd.clk, &ts64); + *ts = timespec64_to_timespec(ts64); + } else err = -EOPNOTSUPP; @@ -319,14 +322,17 @@ static int pc_clock_gettime(clockid_t id, struct timespec *ts) static int pc_clock_getres(clockid_t id, struct timespec *ts) { struct posix_clock_desc cd; + struct timespec64 ts64; int err; err = get_clock_desc(id, &cd); if (err) return err; - if (cd.clk->ops.clock_getres) - err = cd.clk->ops.clock_getres(cd.clk, ts); + if (cd.clk->ops.clock_getres) { + err = cd.clk->ops.clock_getres(cd.clk, &ts64); + *ts = timespec64_to_timespec(ts64); + } else err = -EOPNOTSUPP; @@ -337,6 +343,7 @@ static int pc_clock_getres(clockid_t id, struct timespec *ts) static int pc_clock_settime(clockid_t id, const struct timespec *ts) { + struct timespec64 ts64 = timespec_to_timespec64(*ts); struct posix_clock_desc cd; int err; @@ -350,7 +357,7 @@ static int pc_clock_settime(clockid_t id, const struct timespec *ts) } if (cd.clk->ops.clock_settime) - err = cd.clk->ops.clock_settime(cd.clk, ts); + err = cd.clk->ops.clock_settime(cd.clk, &ts64); else err = -EOPNOTSUPP; out: @@ -403,29 +410,36 @@ static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) { clockid_t id = kit->it_clock; struct posix_clock_desc cd; + struct itimerspec64 ts64; if (get_clock_desc(id, &cd)) return; - if (cd.clk->ops.timer_gettime) - cd.clk->ops.timer_gettime(cd.clk, kit, ts); - + if (cd.clk->ops.timer_gettime) { + cd.clk->ops.timer_gettime(cd.clk, kit, &ts64); + *ts = itimerspec64_to_itimerspec(&ts64); + } put_clock_desc(&cd); } static int pc_timer_settime(struct k_itimer *kit, int flags, struct itimerspec *ts, struct itimerspec *old) { + struct itimerspec64 ts64 = itimerspec_to_itimerspec64(ts); clockid_t id = kit->it_clock; struct posix_clock_desc cd; + struct itimerspec64 old64; int err; err = get_clock_desc(id, &cd); if (err) return err; - if (cd.clk->ops.timer_settime) - err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old); + if (cd.clk->ops.timer_settime) { + err = cd.clk->ops.timer_settime(cd.clk, kit, flags, &ts64, &old64); + if (old) + *old = itimerspec64_to_itimerspec(&old64); + } else err = -EOPNOTSUPP; -- cgit v1.2.3 From 3c9c12f4b4610dba864038c7822b427816f5893c Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sun, 26 Mar 2017 12:04:14 -0700 
Subject: time: Change k_clock clock_get() to use timespec64 struct timespec is not y2038 safe on 32 bit machines. Replace uses of struct timespec with struct timespec64 in the kernel. The syscall interfaces themselves will be changed in a separate series. Signed-off-by: Deepa Dinamani Cc: y2038@lists.linaro.org Cc: john.stultz@linaro.org Cc: arnd@arndb.de Link: http://lkml.kernel.org/r/1490555058-4603-4-git-send-email-deepa.kernel@gmail.com Signed-off-by: Thomas Gleixner --- drivers/char/mmtimer.c | 4 ++-- include/linux/posix-timers.h | 2 +- include/linux/timekeeping.h | 5 +++++ kernel/time/alarmtimer.c | 4 ++-- kernel/time/posix-clock.c | 9 +++------ kernel/time/posix-cpu-timers.c | 10 +++++----- kernel/time/posix-stubs.c | 9 ++++++--- kernel/time/posix-timers.c | 32 +++++++++++++++++--------------- 8 files changed, 41 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c index b708c85dc9c1..40d880b8c02f 100644 --- a/drivers/char/mmtimer.c +++ b/drivers/char/mmtimer.c @@ -478,13 +478,13 @@ static int sgi_clock_period; static struct timespec sgi_clock_offset; static int sgi_clock_period; -static int sgi_clock_get(clockid_t clockid, struct timespec *tp) +static int sgi_clock_get(clockid_t clockid, struct timespec64 *tp) { u64 nsec; nsec = rtc_time() * sgi_clock_period + sgi_clock_offset.tv_nsec; - *tp = ns_to_timespec(nsec); + *tp = ns_to_timespec64(nsec); tp->tv_sec += sgi_clock_offset.tv_sec; return 0; }; diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 64aa189efe21..0688f3975da7 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -90,7 +90,7 @@ struct k_clock { int (*clock_getres) (const clockid_t which_clock, struct timespec *tp); int (*clock_set) (const clockid_t which_clock, const struct timespec *tp); - int (*clock_get) (const clockid_t which_clock, struct timespec * tp); + int (*clock_get) (const clockid_t which_clock, struct timespec64 *tp); int (*clock_adj) (const clockid_t which_clock, struct timex *tx); int (*timer_create) (struct k_itimer *timer); int (*nsleep) (const clockid_t which_clock, int flags, diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 3617a78897bb..ddc229ff6d1e 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -258,6 +258,11 @@ static inline void timekeeping_clocktai(struct timespec *ts) *ts = ktime_to_timespec(ktime_get_clocktai()); } +static inline void timekeeping_clocktai64(struct timespec64 *ts) +{ + *ts = ktime_to_timespec64(ktime_get_clocktai()); +} + /* * RTC specific */ diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index ce3a31e8eb36..944ca6e6f1c2 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -558,14 +558,14 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) * * Provides the underlying alarm base time. 
*/ -static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) +static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp) { struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; if (!alarmtimer_get_rtcdev()) return -EINVAL; - *tp = ktime_to_timespec(base->gettime()); + *tp = ktime_to_timespec64(base->gettime()); return 0; } diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index e24008c098c6..fab6bd33155e 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -297,20 +297,17 @@ out: return err; } -static int pc_clock_gettime(clockid_t id, struct timespec *ts) +static int pc_clock_gettime(clockid_t id, struct timespec64 *ts) { struct posix_clock_desc cd; - struct timespec64 ts64; int err; err = get_clock_desc(id, &cd); if (err) return err; - if (cd.clk->ops.clock_gettime) { - err = cd.clk->ops.clock_gettime(cd.clk, &ts64); - *ts = timespec64_to_timespec(ts64); - } + if (cd.clk->ops.clock_gettime) + err = cd.clk->ops.clock_gettime(cd.clk, ts); else err = -EOPNOTSUPP; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 76bea3a47d4b..082231cc7953 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -261,7 +261,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, static int posix_cpu_clock_get_task(struct task_struct *tsk, const clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { int err = -EINVAL; u64 rtn; @@ -275,13 +275,13 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, } if (!err) - *tp = ns_to_timespec(rtn); + *tp = ns_to_timespec64(rtn); return err; } -static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) +static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); int err = -EINVAL; @@ -1374,7 +1374,7 @@ static int process_cpu_clock_getres(const clockid_t which_clock, return posix_cpu_clock_getres(PROCESS_CLOCK, tp); } static int process_cpu_clock_get(const clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { return posix_cpu_clock_get(PROCESS_CLOCK, tp); } @@ -1399,7 +1399,7 @@ static int thread_cpu_clock_getres(const clockid_t which_clock, return posix_cpu_clock_getres(THREAD_CLOCK, tp); } static int thread_cpu_clock_get(const clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { return posix_cpu_clock_get(THREAD_CLOCK, tp); } diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 95a1b1fc3968..0fbd0c5b95ca 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -64,14 +64,17 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { + struct timespec64 kernel_tp64; struct timespec kernel_tp; switch (which_clock) { - case CLOCK_REALTIME: ktime_get_real_ts(&kernel_tp); break; - case CLOCK_MONOTONIC: ktime_get_ts(&kernel_tp); break; - case CLOCK_BOOTTIME: get_monotonic_boottime(&kernel_tp); break; + case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break; + case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break; + case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break; default: return -EINVAL; } + + kernel_tp = timespec64_to_timespec(kernel_tp64); if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) return -EFAULT; return 0; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 
f215ef792772..68170642c77c 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -204,9 +204,9 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) } /* Get clock_realtime */ -static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) +static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp) { - ktime_get_real_ts(tp); + ktime_get_real_ts64(tp); return 0; } @@ -229,32 +229,32 @@ static int posix_clock_realtime_adj(const clockid_t which_clock, /* * Get monotonic time for posix timers */ -static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) +static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp) { - ktime_get_ts(tp); + ktime_get_ts64(tp); return 0; } /* * Get monotonic-raw time for posix timers */ -static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) +static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) { - getrawmonotonic(tp); + getrawmonotonic64(tp); return 0; } -static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp) +static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp) { - *tp = current_kernel_time(); + *tp = current_kernel_time64(); return 0; } static int posix_get_monotonic_coarse(clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { - *tp = get_monotonic_coarse(); + *tp = get_monotonic_coarse64(); return 0; } @@ -264,15 +264,15 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp return 0; } -static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) +static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp) { - get_monotonic_boottime(tp); + get_monotonic_boottime64(tp); return 0; } -static int posix_get_tai(clockid_t which_clock, struct timespec *tp) +static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) { - timekeeping_clocktai(tp); + timekeeping_clocktai64(tp); return 0; } @@ -1032,13 +1032,15 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 kernel_tp64; struct timespec kernel_tp; int error; if (!kc) return -EINVAL; - error = kc->clock_get(which_clock, &kernel_tp); + error = kc->clock_get(which_clock, &kernel_tp64); + kernel_tp = timespec64_to_timespec(kernel_tp64); if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) error = -EFAULT; -- cgit v1.2.3 From d2e3e0ca5df7f4ffe90a04790b3be20485df056a Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sun, 26 Mar 2017 12:04:15 -0700 Subject: time: Change k_clock clock_getres() to use timespec64 struct timespec is not y2038 safe on 32 bit machines. Replace uses of struct timespec with struct timespec64 in the kernel. The syscall interfaces themselves will be changed in a separate series. The clock_getres() interface has also been changed to use timespec64 even though this particular interface is not affected by the y2038 problem. This helps verification for internal kernel code for y2038 readiness by getting rid of time_t/ timeval/ timespec completely. 
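As an illustration of the new callback signature, a minimal clock_getres() implementation now fills in a timespec64 directly (hypothetical sketch, following the same pattern as the sgi_clock_getres() conversion below):

	static int my_clock_getres(const clockid_t which_clock,
				   struct timespec64 *tp)
	{
		/* constant resolution, only the type changes */
		tp->tv_sec = 0;
		tp->tv_nsec = 1;
		return 0;
	}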
Signed-off-by: Deepa Dinamani Cc: y2038@lists.linaro.org Cc: john.stultz@linaro.org Cc: arnd@arndb.de Link: http://lkml.kernel.org/r/1490555058-4603-5-git-send-email-deepa.kernel@gmail.com Signed-off-by: Thomas Gleixner --- drivers/char/mmtimer.c | 2 +- include/linux/posix-timers.h | 2 +- kernel/time/alarmtimer.c | 2 +- kernel/time/posix-clock.c | 9 +++------ kernel/time/posix-cpu-timers.c | 6 +++--- kernel/time/posix-timers.c | 10 ++++++---- 6 files changed, 15 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c index 40d880b8c02f..79d8ada08a80 100644 --- a/drivers/char/mmtimer.c +++ b/drivers/char/mmtimer.c @@ -765,7 +765,7 @@ static int sgi_timer_set(struct k_itimer *timr, int flags, return err; } -static int sgi_clock_getres(const clockid_t which_clock, struct timespec *tp) +static int sgi_clock_getres(const clockid_t which_clock, struct timespec64 *tp) { tp->tv_sec = 0; tp->tv_nsec = sgi_clock_period; diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 0688f3975da7..dd05b49074f3 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -87,7 +87,7 @@ struct k_itimer { }; struct k_clock { - int (*clock_getres) (const clockid_t which_clock, struct timespec *tp); + int (*clock_getres) (const clockid_t which_clock, struct timespec64 *tp); int (*clock_set) (const clockid_t which_clock, const struct timespec *tp); int (*clock_get) (const clockid_t which_clock, struct timespec64 *tp); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 944ca6e6f1c2..e8a45e2c3d95 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -541,7 +541,7 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, * * Returns the granularity of underlying alarm base clock */ -static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) +static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp) { if (!alarmtimer_get_rtcdev()) return -EINVAL; diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index fab6bd33155e..af91031f64de 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -316,20 +316,17 @@ static int pc_clock_gettime(clockid_t id, struct timespec64 *ts) return err; } -static int pc_clock_getres(clockid_t id, struct timespec *ts) +static int pc_clock_getres(clockid_t id, struct timespec64 *ts) { struct posix_clock_desc cd; - struct timespec64 ts64; int err; err = get_clock_desc(id, &cd); if (err) return err; - if (cd.clk->ops.clock_getres) { - err = cd.clk->ops.clock_getres(cd.clk, &ts64); - *ts = timespec64_to_timespec(ts64); - } + if (cd.clk->ops.clock_getres) + err = cd.clk->ops.clock_getres(cd.clk, ts); else err = -EOPNOTSUPP; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 082231cc7953..37ce9edfb968 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -116,7 +116,7 @@ static inline u64 virt_ticks(struct task_struct *p) } static int -posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) +posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) { int error = check_clock(which_clock); if (!error) { @@ -1369,7 +1369,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) static int process_cpu_clock_getres(const clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { 
return posix_cpu_clock_getres(PROCESS_CLOCK, tp); } @@ -1394,7 +1394,7 @@ static long process_cpu_nsleep_restart(struct restart_block *restart_block) return -EINVAL; } static int thread_cpu_clock_getres(const clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { return posix_cpu_clock_getres(THREAD_CLOCK, tp); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 68170642c77c..f67dae9f3bdf 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -258,9 +258,9 @@ static int posix_get_monotonic_coarse(clockid_t which_clock, return 0; } -static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) +static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *tp) { - *tp = ktime_to_timespec(KTIME_LOW_RES); + *tp = ktime_to_timespec64(KTIME_LOW_RES); return 0; } @@ -276,7 +276,7 @@ static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) return 0; } -static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp) +static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) { tp->tv_sec = 0; tp->tv_nsec = hrtimer_resolution; @@ -1075,13 +1075,15 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 rtn_tp64; struct timespec rtn_tp; int error; if (!kc) return -EINVAL; - error = kc->clock_getres(which_clock, &rtn_tp); + error = kc->clock_getres(which_clock, &rtn_tp64); + rtn_tp = timespec64_to_timespec(rtn_tp64); if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) error = -EFAULT; -- cgit v1.2.3 From 0fe6afe3834ba13d75fa1168f0f66f08b427e1c0 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sun, 26 Mar 2017 12:04:16 -0700 Subject: time: Change k_clock clock_set() to use timespec64 struct timespec is not y2038 safe on 32 bit machines. Replace uses of struct timespec with struct timespec64 in the kernel. The syscall interfaces themselves will be changed in a separate series. 
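For illustration, a clock_set() callback now receives the 64-bit type directly, so no intermediate conversion is needed (hypothetical sketch, mirroring the posix_clock_realtime_set() conversion below):

	static int my_clock_set(const clockid_t which_clock,
				const struct timespec64 *tp)
	{
		/* tp is already a timespec64; pass it straight through */
		return do_sys_settimeofday64(tp, NULL);
	}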
Signed-off-by: Deepa Dinamani Cc: y2038@lists.linaro.org Cc: john.stultz@linaro.org Cc: arnd@arndb.de Link: http://lkml.kernel.org/r/1490555058-4603-6-git-send-email-deepa.kernel@gmail.com Signed-off-by: Thomas Gleixner --- drivers/char/mmtimer.c | 2 +- include/linux/posix-timers.h | 2 +- kernel/time/posix-clock.c | 5 ++--- kernel/time/posix-cpu-timers.c | 2 +- kernel/time/posix-timers.c | 11 +++++------ 5 files changed, 10 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c index 79d8ada08a80..ba1b8925c4c8 100644 --- a/drivers/char/mmtimer.c +++ b/drivers/char/mmtimer.c @@ -489,7 +489,7 @@ static int sgi_clock_get(clockid_t clockid, struct timespec64 *tp) return 0; }; -static int sgi_clock_set(const clockid_t clockid, const struct timespec *tp) +static int sgi_clock_set(const clockid_t clockid, const struct timespec64 *tp) { u64 nsec; diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index dd05b49074f3..7825e242b128 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -89,7 +89,7 @@ struct k_itimer { struct k_clock { int (*clock_getres) (const clockid_t which_clock, struct timespec64 *tp); int (*clock_set) (const clockid_t which_clock, - const struct timespec *tp); + const struct timespec64 *tp); int (*clock_get) (const clockid_t which_clock, struct timespec64 *tp); int (*clock_adj) (const clockid_t which_clock, struct timex *tx); int (*timer_create) (struct k_itimer *timer); diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index af91031f64de..3807a34519c4 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -335,9 +335,8 @@ static int pc_clock_getres(clockid_t id, struct timespec64 *ts) return err; } -static int pc_clock_settime(clockid_t id, const struct timespec *ts) +static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) { - struct timespec64 ts64 = timespec_to_timespec64(*ts); struct posix_clock_desc cd; int err; @@ -351,7 +350,7 @@ static int pc_clock_settime(clockid_t id, const struct timespec *ts) } if (cd.clk->ops.clock_settime) - err = cd.clk->ops.clock_settime(cd.clk, &ts64); + err = cd.clk->ops.clock_settime(cd.clk, ts); else err = -EOPNOTSUPP; out: diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 37ce9edfb968..2cd4428c81f0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -135,7 +135,7 @@ posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) } static int -posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) +posix_cpu_clock_set(const clockid_t which_clock, const struct timespec64 *tp) { /* * You can never reset a CPU clock, but we check for other errors diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index f67dae9f3bdf..7742da826f02 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -212,12 +212,9 @@ static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp /* Set clock_realtime */ static int posix_clock_realtime_set(const clockid_t which_clock, - const struct timespec *tp) + const struct timespec64 *tp) { - struct timespec64 tp64; - - tp64 = timespec_to_timespec64(*tp); - return do_sys_settimeofday64(&tp64, NULL); + return do_sys_settimeofday64(tp, NULL); } static int posix_clock_realtime_adj(const clockid_t which_clock, @@ -1017,6 +1014,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec 
__user *, tp) { struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 new_tp64; struct timespec new_tp; if (!kc || !kc->clock_set) @@ -1024,8 +1022,9 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, if (copy_from_user(&new_tp, tp, sizeof (*tp))) return -EFAULT; + new_tp64 = timespec_to_timespec64(new_tp); - return kc->clock_set(which_clock, &new_tp); + return kc->clock_set(which_clock, &new_tp64); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, -- cgit v1.2.3 From 5f252b325625c13db1dbc76ac6cdb49ee3bd062e Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sun, 26 Mar 2017 12:04:17 -0700 Subject: time: Change k_clock timer_set() and timer_get() to use timespec64 struct timespec is not y2038 safe on 32 bit machines. Replace uses of struct timespec with struct timespec64 in the kernel. struct itimerspec internally uses struct timespec. Use struct itimerspec64 which uses struct timespec64. The syscall interfaces themselves will be changed in a separate series. Signed-off-by: Deepa Dinamani Cc: y2038@lists.linaro.org Cc: john.stultz@linaro.org Cc: arnd@arndb.de Link: http://lkml.kernel.org/r/1490555058-4603-7-git-send-email-deepa.kernel@gmail.com Signed-off-by: Thomas Gleixner --- drivers/char/mmtimer.c | 20 ++++++++++---------- include/linux/posix-timers.h | 12 ++++++------ kernel/time/alarmtimer.c | 14 +++++++------- kernel/time/posix-clock.c | 21 +++++++-------------- kernel/time/posix-cpu-timers.c | 28 ++++++++++++++++------------ kernel/time/posix-timers.c | 37 +++++++++++++++++++++---------------- 6 files changed, 67 insertions(+), 65 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c index ba1b8925c4c8..0e7fcb04f01e 100644 --- a/drivers/char/mmtimer.c +++ b/drivers/char/mmtimer.c @@ -657,7 +657,7 @@ static int sgi_timer_del(struct k_itimer *timr) } /* Assumption: it_lock is already held with irq's disabled */ -static void sgi_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) +static void sgi_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { if (timr->it.mmtimer.clock == TIMER_OFF) { @@ -668,14 +668,14 @@ static void sgi_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) return; } - cur_setting->it_interval = ns_to_timespec(timr->it.mmtimer.incr * sgi_clock_period); - cur_setting->it_value = ns_to_timespec((timr->it.mmtimer.expires - rtc_time()) * sgi_clock_period); + cur_setting->it_interval = ns_to_timespec64(timr->it.mmtimer.incr * sgi_clock_period); + cur_setting->it_value = ns_to_timespec64((timr->it.mmtimer.expires - rtc_time()) * sgi_clock_period); } static int sgi_timer_set(struct k_itimer *timr, int flags, - struct itimerspec * new_setting, - struct itimerspec * old_setting) + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting) { unsigned long when, period, irqflags; int err = 0; @@ -687,8 +687,8 @@ static int sgi_timer_set(struct k_itimer *timr, int flags, sgi_timer_get(timr, old_setting); sgi_timer_del(timr); - when = timespec_to_ns(&new_setting->it_value); - period = timespec_to_ns(&new_setting->it_interval); + when = timespec64_to_ns(&new_setting->it_value); + period = timespec64_to_ns(&new_setting->it_interval); if (when == 0) /* Clear timer */ @@ -699,11 +699,11 @@ static int sgi_timer_set(struct k_itimer *timr, int flags, return -ENOMEM; if (flags & TIMER_ABSTIME) { - struct timespec n; + struct timespec64 n; unsigned long now; - getnstimeofday(&n); - now = timespec_to_ns(&n); + getnstimeofday64(&n); + now = 
timespec64_to_ns(&n); if (when > now) when -= now; else diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 7825e242b128..ebc4c4945339 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -96,13 +96,13 @@ struct k_clock { int (*nsleep) (const clockid_t which_clock, int flags, struct timespec *, struct timespec __user *); long (*nsleep_restart) (struct restart_block *restart_block); - int (*timer_set) (struct k_itimer * timr, int flags, - struct itimerspec * new_setting, - struct itimerspec * old_setting); - int (*timer_del) (struct k_itimer * timr); + int (*timer_set) (struct k_itimer *timr, int flags, + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting); + int (*timer_del) (struct k_itimer *timr); #define TIMER_RETRY 1 - void (*timer_get) (struct k_itimer * timr, - struct itimerspec * cur_setting); + void (*timer_get) (struct k_itimer *timr, + struct itimerspec64 *cur_setting); }; extern struct k_clock clock_posix_cpu; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index e8a45e2c3d95..e069f94999a8 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -598,19 +598,19 @@ static int alarm_timer_create(struct k_itimer *new_timer) * Copies out the current itimerspec data */ static void alarm_timer_get(struct k_itimer *timr, - struct itimerspec *cur_setting) + struct itimerspec64 *cur_setting) { ktime_t relative_expiry_time = alarm_expires_remaining(&(timr->it.alarm.alarmtimer)); if (ktime_to_ns(relative_expiry_time) > 0) { - cur_setting->it_value = ktime_to_timespec(relative_expiry_time); + cur_setting->it_value = ktime_to_timespec64(relative_expiry_time); } else { cur_setting->it_value.tv_sec = 0; cur_setting->it_value.tv_nsec = 0; } - cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval); + cur_setting->it_interval = ktime_to_timespec64(timr->it.alarm.interval); } /** @@ -640,8 +640,8 @@ static int alarm_timer_del(struct k_itimer *timr) * Sets the timer to new_setting, and starts the timer. 
*/ static int alarm_timer_set(struct k_itimer *timr, int flags, - struct itimerspec *new_setting, - struct itimerspec *old_setting) + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting) { ktime_t exp; @@ -659,8 +659,8 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, return TIMER_RETRY; /* start the timer */ - timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); - exp = timespec_to_ktime(new_setting->it_value); + timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval); + exp = timespec64_to_ktime(new_setting->it_value); /* Convert (if necessary) to absolute time */ if (flags != TIMER_ABSTIME) { ktime_t now; diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 3807a34519c4..31d588d37a17 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -399,40 +399,33 @@ static int pc_timer_delete(struct k_itimer *kit) return err; } -static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) +static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec64 *ts) { clockid_t id = kit->it_clock; struct posix_clock_desc cd; - struct itimerspec64 ts64; if (get_clock_desc(id, &cd)) return; - if (cd.clk->ops.timer_gettime) { - cd.clk->ops.timer_gettime(cd.clk, kit, &ts64); - *ts = itimerspec64_to_itimerspec(&ts64); - } + if (cd.clk->ops.timer_gettime) + cd.clk->ops.timer_gettime(cd.clk, kit, ts); + put_clock_desc(&cd); } static int pc_timer_settime(struct k_itimer *kit, int flags, - struct itimerspec *ts, struct itimerspec *old) + struct itimerspec64 *ts, struct itimerspec64 *old) { - struct itimerspec64 ts64 = itimerspec_to_itimerspec64(ts); clockid_t id = kit->it_clock; struct posix_clock_desc cd; - struct itimerspec64 old64; int err; err = get_clock_desc(id, &cd); if (err) return err; - if (cd.clk->ops.timer_settime) { - err = cd.clk->ops.timer_settime(cd.clk, kit, flags, &ts64, &old64); - if (old) - *old = itimerspec64_to_itimerspec(&old64); - } + if (cd.clk->ops.timer_settime) + err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old); else err = -EOPNOTSUPP; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 2cd4428c81f0..441064d1216f 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -562,7 +562,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock, * and try again. (This happens when the timer is in the middle of firing.) */ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, - struct itimerspec *new, struct itimerspec *old) + struct itimerspec64 *new, struct itimerspec64 *old) { unsigned long flags; struct sighand_struct *sighand; @@ -572,7 +572,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, WARN_ON_ONCE(p == NULL); - new_expires = timespec_to_ns(&new->it_value); + new_expires = timespec64_to_ns(&new->it_value); /* * Protect against sighand release/switch in exit/exec and p->cpu_timers @@ -633,7 +633,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, bump_cpu_timer(timer, val); if (val < timer->it.cpu.expires) { old_expires = timer->it.cpu.expires - val; - old->it_value = ns_to_timespec(old_expires); + old->it_value = ns_to_timespec64(old_expires); } else { old->it_value.tv_nsec = 1; old->it_value.tv_sec = 0; @@ -671,7 +671,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * Install the new reload setting, and * set up the signal and overrun bookkeeping. 
*/ - timer->it.cpu.incr = timespec_to_ns(&new->it_interval); + timer->it.cpu.incr = timespec64_to_ns(&new->it_interval); /* * This acts as a modification timestamp for the timer, @@ -695,12 +695,12 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, ret = 0; out: if (old) - old->it_interval = ns_to_timespec(old_incr); + old->it_interval = ns_to_timespec64(old_incr); return ret; } -static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) +static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp) { u64 now; struct task_struct *p = timer->it.cpu.task; @@ -710,7 +710,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) /* * Easy part: convert the reload time. */ - itp->it_interval = ns_to_timespec(timer->it.cpu.incr); + itp->it_interval = ns_to_timespec64(timer->it.cpu.incr); if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; @@ -739,7 +739,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) * Call the timer disarmed, nothing else to do. */ timer->it.cpu.expires = 0; - itp->it_value = ns_to_timespec(timer->it.cpu.expires); + itp->it_value = ns_to_timespec64(timer->it.cpu.expires); return; } else { cpu_timer_sample_group(timer->it_clock, p, &now); @@ -748,7 +748,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) } if (now < timer->it.cpu.expires) { - itp->it_value = ns_to_timespec(timer->it.cpu.expires - now); + itp->it_value = ns_to_timespec64(timer->it.cpu.expires - now); } else { /* * The timer should have expired already, but the firing @@ -1221,6 +1221,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, static int do_cpu_nanosleep(const clockid_t which_clock, int flags, struct timespec *rqtp, struct itimerspec *it) { + struct itimerspec64 it64; struct k_itimer timer; int error; @@ -1234,13 +1235,14 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, error = posix_cpu_timer_create(&timer); timer.it_process = current; if (!error) { - static struct itimerspec zero_it; + static struct itimerspec64 zero_it; memset(it, 0, sizeof *it); it->it_value = *rqtp; spin_lock_irq(&timer.it_lock); - error = posix_cpu_timer_set(&timer, flags, it, NULL); + it64 = itimerspec_to_itimerspec64(it); + error = posix_cpu_timer_set(&timer, flags, &it64, NULL); if (error) { spin_unlock_irq(&timer.it_lock); return error; @@ -1270,7 +1272,9 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, * We were interrupted by a signal. */ *rqtp = ns_to_timespec(timer.it.cpu.expires); - error = posix_cpu_timer_set(&timer, 0, &zero_it, it); + it64 = itimerspec_to_itimerspec64(it); + error = posix_cpu_timer_set(&timer, 0, &zero_it, &it64); + *it = itimerspec64_to_itimerspec(&it64); if (!error) { /* * Timer is now unarmed, deletion can not fail. 
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 7742da826f02..9da4797d0c63 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -133,9 +133,9 @@ static struct k_clock posix_clocks[MAX_CLOCKS]; static int common_nsleep(const clockid_t, int flags, struct timespec *t, struct timespec __user *rmtp); static int common_timer_create(struct k_itimer *new_timer); -static void common_timer_get(struct k_itimer *, struct itimerspec *); +static void common_timer_get(struct k_itimer *, struct itimerspec64 *); static int common_timer_set(struct k_itimer *, int, - struct itimerspec *, struct itimerspec *); + struct itimerspec64 *, struct itimerspec64 *); static int common_timer_del(struct k_itimer *timer); static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); @@ -734,18 +734,18 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * report. */ static void -common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) +common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { ktime_t now, remaining, iv; struct hrtimer *timer = &timr->it.real.timer; - memset(cur_setting, 0, sizeof(struct itimerspec)); + memset(cur_setting, 0, sizeof(*cur_setting)); iv = timr->it.real.interval; /* interval timer ? */ if (iv) - cur_setting->it_interval = ktime_to_timespec(iv); + cur_setting->it_interval = ktime_to_timespec64(iv); else if (!hrtimer_active(timer) && (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) return; @@ -771,13 +771,14 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) cur_setting->it_value.tv_nsec = 1; } else - cur_setting->it_value = ktime_to_timespec(remaining); + cur_setting->it_value = ktime_to_timespec64(remaining); } /* Get the time remaining on a POSIX.1b interval timer. */ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct itimerspec __user *, setting) { + struct itimerspec64 cur_setting64; struct itimerspec cur_setting; struct k_itimer *timr; struct k_clock *kc; @@ -792,10 +793,11 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, if (WARN_ON_ONCE(!kc || !kc->timer_get)) ret = -EINVAL; else - kc->timer_get(timr, &cur_setting); + kc->timer_get(timr, &cur_setting64); unlock_timer(timr, flags); + cur_setting = itimerspec64_to_itimerspec(&cur_setting64); if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) return -EFAULT; @@ -831,7 +833,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) /* timr->it_lock is taken. */ static int common_timer_set(struct k_itimer *timr, int flags, - struct itimerspec *new_setting, struct itimerspec *old_setting) + struct itimerspec64 *new_setting, struct itimerspec64 *old_setting) { struct hrtimer *timer = &timr->it.real.timer; enum hrtimer_mode mode; @@ -860,10 +862,10 @@ common_timer_set(struct k_itimer *timr, int flags, hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); timr->it.real.timer.function = posix_timer_fn; - hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value)); + hrtimer_set_expires(timer, timespec64_to_ktime(new_setting->it_value)); /* Convert interval */ - timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); + timr->it.real.interval = timespec64_to_ktime(new_setting->it_interval); /* SIGEV_NONE timers are not queued ! 
See common_timer_get */ if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { @@ -883,21 +885,23 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, const struct itimerspec __user *, new_setting, struct itimerspec __user *, old_setting) { - struct k_itimer *timr; + struct itimerspec64 new_spec64, old_spec64; + struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL; struct itimerspec new_spec, old_spec; - int error = 0; + struct k_itimer *timr; unsigned long flag; - struct itimerspec *rtn = old_setting ? &old_spec : NULL; struct k_clock *kc; + int error = 0; if (!new_setting) return -EINVAL; if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) return -EFAULT; + new_spec64 = itimerspec_to_itimerspec64(&new_spec); - if (!timespec_valid(&new_spec.it_interval) || - !timespec_valid(&new_spec.it_value)) + if (!timespec64_valid(&new_spec64.it_interval) || + !timespec64_valid(&new_spec64.it_value)) return -EINVAL; retry: timr = lock_timer(timer_id, &flag); @@ -908,7 +912,7 @@ retry: if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; else - error = kc->timer_set(timr, flags, &new_spec, rtn); + error = kc->timer_set(timr, flags, &new_spec64, rtn); unlock_timer(timr, flag); if (error == TIMER_RETRY) { @@ -916,6 +920,7 @@ retry: goto retry; } + old_spec = itimerspec64_to_itimerspec(&old_spec64); if (old_setting && !error && copy_to_user(old_setting, &old_spec, sizeof (old_spec))) error = -EFAULT; -- cgit v1.2.3 From ad19638463a4b5c909fcebf0f19358c4bf4fed48 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sun, 26 Mar 2017 12:04:18 -0700 Subject: time: Change k_clock nsleep() to use timespec64 struct timespec is not y2038 safe on 32 bit machines. Replace uses of struct timespec with struct timespec64 in the kernel. The syscall interfaces themselves will be changed in a separate series. Note that the restart_block parameter for nanosleep has also been left unchanged and will be part of the syscall series noted above.
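Each conversion in this series follows the same boundary pattern: copy the legacy struct timespec in from user space, widen it to timespec64 immediately, validate the widened value, and pass only timespec64 deeper into the kernel. A condensed sketch of that shape, based on the clock_nanosleep() hunk below (do_nsleep64() is a hypothetical stand-in for the timespec64-based core; the other helpers are the kernel interfaces used in these diffs):

static int example_nanosleep(const struct timespec __user *rqtp)
{
	struct timespec ts;
	struct timespec64 ts64;

	if (copy_from_user(&ts, rqtp, sizeof(ts)))	/* legacy ABI copy-in */
		return -EFAULT;
	ts64 = timespec_to_timespec64(ts);	/* widen tv_sec to 64 bits */
	if (!timespec64_valid(&ts64))	/* validate after widening */
		return -EINVAL;
	return do_nsleep64(&ts64);	/* hypothetical 64-bit core */
}

On 64-bit builds timespec64 is defined as struct timespec, so the widening helpers compile down to plain copies and the pattern costs nothing there.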
Signed-off-by: Deepa Dinamani Cc: y2038@lists.linaro.org Cc: john.stultz@linaro.org Cc: arnd@arndb.de Link: http://lkml.kernel.org/r/1490555058-4603-8-git-send-email-deepa.kernel@gmail.com Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 2 +- include/linux/posix-timers.h | 2 +- kernel/compat.c | 6 ++++-- kernel/time/alarmtimer.c | 7 ++++--- kernel/time/hrtimer.c | 10 ++++++---- kernel/time/posix-cpu-timers.c | 36 ++++++++++++++++++------------------ kernel/time/posix-stubs.c | 6 ++++-- kernel/time/posix-timers.c | 10 ++++++---- 8 files changed, 44 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 23d58fcd4d9a..8c5b10eb7265 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -452,7 +452,7 @@ static inline u64 hrtimer_forward_now(struct hrtimer *timer, } /* Precise sleep: */ -extern long hrtimer_nanosleep(struct timespec *rqtp, +extern long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp, const enum hrtimer_mode mode, const clockid_t clockid); diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index ebc4c4945339..8c1e43ab14a9 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -94,7 +94,7 @@ struct k_clock { int (*clock_adj) (const clockid_t which_clock, struct timex *tx); int (*timer_create) (struct k_itimer *timer); int (*nsleep) (const clockid_t which_clock, int flags, - struct timespec *, struct timespec __user *); + struct timespec64 *, struct timespec __user *); long (*nsleep_restart) (struct restart_block *restart_block); int (*timer_set) (struct k_itimer *timr, int flags, struct itimerspec64 *new_setting, diff --git a/kernel/compat.c b/kernel/compat.c index 1eb9e8aad869..933bcb31ae10 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -240,18 +240,20 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, struct compat_timespec __user *, rmtp) { struct timespec tu, rmt; + struct timespec64 tu64; mm_segment_t oldfs; long ret; if (compat_get_timespec(&tu, rqtp)) return -EFAULT; - if (!timespec_valid(&tu)) + tu64 = timespec_to_timespec64(tu); + if (!timespec64_valid(&tu64)) return -EINVAL; oldfs = get_fs(); set_fs(KERNEL_DS); - ret = hrtimer_nanosleep(&tu, + ret = hrtimer_nanosleep(&tu64, rmtp ? 
(struct timespec __user *)&rmt : NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC); set_fs(oldfs); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index e069f94999a8..5cb5b0008d97 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -790,13 +790,14 @@ out: * Handles clock_nanosleep calls against _ALARM clockids */ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, - struct timespec *tsreq, struct timespec __user *rmtp) + struct timespec64 *tsreq, + struct timespec __user *rmtp) { enum alarmtimer_type type = clock2alarm(which_clock); + struct restart_block *restart; struct alarm alarm; ktime_t exp; int ret = 0; - struct restart_block *restart; if (!alarmtimer_get_rtcdev()) return -ENOTSUPP; @@ -809,7 +810,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); - exp = timespec_to_ktime(*tsreq); + exp = timespec64_to_ktime(*tsreq); /* Convert (if necessary) to absolute time */ if (flags != TIMER_ABSTIME) { ktime_t now = alarm_bases[type].gettime(); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 1ef82cdb61ff..a7560123617c 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1503,7 +1503,7 @@ out: return ret; } -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, +long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; @@ -1516,7 +1516,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, slack = 0; hrtimer_init_on_stack(&t.timer, clockid, mode); - hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack); + hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); if (do_nanosleep(&t, mode)) goto out; @@ -1547,15 +1547,17 @@ out: SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, struct timespec __user *, rmtp) { + struct timespec64 tu64; struct timespec tu; if (copy_from_user(&tu, rqtp, sizeof(tu))) return -EFAULT; - if (!timespec_valid(&tu)) + tu64 = timespec_to_timespec64(tu); + if (!timespec64_valid(&tu64)) return -EINVAL; - return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(&tu64, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } /* diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 441064d1216f..949e434d3536 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1219,9 +1219,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } static int do_cpu_nanosleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct itimerspec *it) + struct timespec64 *rqtp, struct itimerspec64 *it) { - struct itimerspec64 it64; struct k_itimer timer; int error; @@ -1241,8 +1240,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, it->it_value = *rqtp; spin_lock_irq(&timer.it_lock); - it64 = itimerspec_to_itimerspec64(it); - error = posix_cpu_timer_set(&timer, flags, &it64, NULL); + error = posix_cpu_timer_set(&timer, flags, it, NULL); if (error) { spin_unlock_irq(&timer.it_lock); return error; @@ -1271,10 +1269,8 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, /* * We were interrupted by a signal. 
*/ - *rqtp = ns_to_timespec(timer.it.cpu.expires); - it64 = itimerspec_to_itimerspec64(it); - error = posix_cpu_timer_set(&timer, 0, &zero_it, &it64); - *it = itimerspec64_to_itimerspec(&it64); + *rqtp = ns_to_timespec64(timer.it.cpu.expires); + error = posix_cpu_timer_set(&timer, 0, &zero_it, it); if (!error) { /* * Timer is now unarmed, deletion can not fail. @@ -1310,10 +1306,11 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, static long posix_cpu_nsleep_restart(struct restart_block *restart_block); static int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) + struct timespec64 *rqtp, struct timespec __user *rmtp) { struct restart_block *restart_block = ¤t->restart_block; - struct itimerspec it; + struct itimerspec64 it; + struct timespec ts; int error; /* @@ -1333,13 +1330,14 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, /* * Report back to the user the time still remaining. */ - if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + ts = timespec64_to_timespec(it.it_value); + if (rmtp && copy_to_user(rmtp, &ts, sizeof(*rmtp))) return -EFAULT; restart_block->fn = posix_cpu_nsleep_restart; restart_block->nanosleep.clockid = which_clock; restart_block->nanosleep.rmtp = rmtp; - restart_block->nanosleep.expires = timespec_to_ns(rqtp); + restart_block->nanosleep.expires = timespec64_to_ns(rqtp); } return error; } @@ -1347,11 +1345,12 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, static long posix_cpu_nsleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->nanosleep.clockid; - struct timespec t; - struct itimerspec it; + struct itimerspec64 it; + struct timespec64 t; + struct timespec tmp; int error; - t = ns_to_timespec(restart_block->nanosleep.expires); + t = ns_to_timespec64(restart_block->nanosleep.expires); error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); @@ -1360,10 +1359,11 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) /* * Report back to the user the time still remaining. */ - if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + tmp = timespec64_to_timespec(it.it_value); + if (rmtp && copy_to_user(rmtp, &tmp, sizeof(*rmtp))) return -EFAULT; - restart_block->nanosleep.expires = timespec_to_ns(&t); + restart_block->nanosleep.expires = timespec64_to_ns(&t); } return error; @@ -1388,7 +1388,7 @@ static int process_cpu_timer_create(struct k_itimer *timer) return posix_cpu_timer_create(timer); } static int process_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, + struct timespec64 *rqtp, struct timespec __user *rmtp) { return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 0fbd0c5b95ca..c0cd53eb018a 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -103,6 +103,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct timespec __user *, rqtp, struct timespec __user *, rmtp) { + struct timespec64 t64; struct timespec t; switch (which_clock) { @@ -111,9 +112,10 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, case CLOCK_BOOTTIME: if (copy_from_user(&t, rqtp, sizeof (struct timespec))) return -EFAULT; - if (!timespec_valid(&t)) + t64 = timespec_to_timespec64(t); + if (!timespec64_valid(&t64)) return -EINVAL; - return hrtimer_nanosleep(&t, rmtp, flags & TIMER_ABSTIME ? 
+ return hrtimer_nanosleep(&t64, rmtp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); default: diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 9da4797d0c63..4d7b2ce09c27 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -130,7 +130,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS]; /* * These ones are defined below. */ -static int common_nsleep(const clockid_t, int flags, struct timespec *t, +static int common_nsleep(const clockid_t, int flags, struct timespec64 *t, struct timespec __user *rmtp); static int common_timer_create(struct k_itimer *new_timer); static void common_timer_get(struct k_itimer *, struct itimerspec64 *); @@ -1099,7 +1099,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, * nanosleep for monotonic and realtime clocks */ static int common_nsleep(const clockid_t which_clock, int flags, - struct timespec *tsave, struct timespec __user *rmtp) + struct timespec64 *tsave, struct timespec __user *rmtp) { return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, @@ -1111,6 +1111,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, struct timespec __user *, rmtp) { struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 t64; struct timespec t; if (!kc) @@ -1121,10 +1122,11 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, if (copy_from_user(&t, rqtp, sizeof (struct timespec))) return -EFAULT; - if (!timespec_valid(&t)) + t64 = timespec_to_timespec64(t); + if (!timespec64_valid(&t64)) return -EINVAL; - return kc->nsleep(which_clock, flags, &t, rmtp); + return kc->nsleep(which_clock, flags, &t64, rmtp); } /* -- cgit v1.2.3 From 5a8d75a1b8c99bdc926ba69b7b7dbe4fae81a5af Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 14 Apr 2017 13:58:29 -0600 Subject: block: fix bio_will_gap() for first bvec with offset Commit 729204ef49ec ("block: relax check on sg gap") allows us to merge bios if both are physically contiguous. This change can merge a huge number of small bios; with mkfs, for example, mkfs.ntfs running time can be decreased to ~1/10. But if one rq starts with a non-aligned buffer (the 1st bvec's bv_offset is non-zero) and we allow the merge, it is quite difficult to respect the sg gap limit, especially the max segment size, or we risk ending up with an unaligned virtual boundary. This patch avoids the issue by disallowing a merge if the req starts with an unaligned buffer. Also add comments to explain why the merged segment can't end at an unaligned virt boundary. Fixes: 729204ef49ec ("block: relax check on sg gap") Tested-by: Johannes Thumshirn Reviewed-by: Johannes Thumshirn Signed-off-by: Ming Lei Rewrote parts of the commit message and comments.
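The rule behind this fix can be sketched in isolation. With a virt boundary mask (0xfff for a 4 KiB boundary), appending a bvec is only safe if the previous data ends on the boundary and the next bvec starts at offset zero; a request whose very first bvec already has a non-zero bv_offset can therefore never be extended without violating the limit. A self-contained sketch of the arithmetic (gap_to_prev() is an illustrative stand-in patterned on the kernel's __bvec_gap_to_prev(), and the offsets are made up):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* true if appending 'next' to 'prev' would straddle the virt boundary */
static bool gap_to_prev(uint32_t prev_off, uint32_t prev_len,
			uint32_t next_off, uint32_t mask)
{
	return next_off || ((prev_off + prev_len) & mask);
}

int main(void)
{
	const uint32_t mask = 0xfff;	/* 4 KiB virt boundary */

	/* prev ends at 0x1000, exactly on the boundary: no gap, mergeable */
	printf("%d\n", gap_to_prev(0x200, 0xe00, 0, mask));	/* 0 */
	/* prev ends at 0xe00, mid-boundary: gap, merge refused */
	printf("%d\n", gap_to_prev(0x200, 0xc00, 0, mask));	/* 1 */
	return 0;
}

This is why the new code in the hunk below reports a gap as soon as the request's first bvec has a non-zero bv_offset, before looking at the tail bvec at all.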
Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7548f332121a..01a696b0a4d3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1672,12 +1672,36 @@ static inline bool bios_segs_mergeable(struct request_queue *q, return true; } -static inline bool bio_will_gap(struct request_queue *q, struct bio *prev, - struct bio *next) +static inline bool bio_will_gap(struct request_queue *q, + struct request *prev_rq, + struct bio *prev, + struct bio *next) { if (bio_has_data(prev) && queue_virt_boundary(q)) { struct bio_vec pb, nb; + /* + * don't merge if the 1st bio starts with non-zero + * offset, otherwise it is quite difficult to respect + * sg gap limit. We work hard to merge a huge number of small + * single bios in case of mkfs. + */ + if (prev_rq) + bio_get_first_bvec(prev_rq->bio, &pb); + else + bio_get_first_bvec(prev, &pb); + if (pb.bv_offset) + return true; + + /* + * We don't need to worry about the situation that the + * merged segment ends in unaligned virt boundary: + * + * - if 'pb' ends aligned, the merged segment ends aligned + * - if 'pb' ends unaligned, the next bio must include + * one single bvec of 'nb', otherwise the 'nb' can't + * merge with 'pb' + */ bio_get_last_bvec(prev, &pb); bio_get_first_bvec(next, &nb); @@ -1690,12 +1714,12 @@ static inline bool bio_will_gap(struct request_queue *q, struct bio *prev, static inline bool req_gap_back_merge(struct request *req, struct bio *bio) { - return bio_will_gap(req->q, req->biotail, bio); + return bio_will_gap(req->q, req, req->biotail, bio); } static inline bool req_gap_front_merge(struct request *req, struct bio *bio) { - return bio_will_gap(req->q, bio, req->bio); + return bio_will_gap(req->q, NULL, bio, req->bio); } int kblockd_schedule_work(struct work_struct *work); -- cgit v1.2.3 From 84253394927c4352652d0b118ad9583f5646959b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Apr 2017 13:28:46 +0200 Subject: remove the mg_disk driver This driver was added in 2008, but as far as I can tell we never had a single platform that actually registered resources for the platform driver. It's also been unmaintained for a long time and apparently has an ATA mode that can be driven using the IDE/libata subsystem. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- Documentation/blockdev/mflash.txt | 84 --- drivers/block/Kconfig | 17 - drivers/block/Makefile | 1 - drivers/block/mg_disk.c | 1110 ------------------------------------- include/linux/mg_disk.h | 45 -- 5 files changed, 1257 deletions(-) delete mode 100644 Documentation/blockdev/mflash.txt delete mode 100644 drivers/block/mg_disk.c delete mode 100644 include/linux/mg_disk.h (limited to 'include/linux') diff --git a/Documentation/blockdev/mflash.txt b/Documentation/blockdev/mflash.txt deleted file mode 100644 index f7e050551487..000000000000 --- a/Documentation/blockdev/mflash.txt +++ /dev/null @@ -1,84 +0,0 @@ -This document describes m[g]flash support in linux. - -Contents - 1. Overview - 2. Reserved area configuration - 3. Example of mflash platform driver registration - -1. Overview - -Mflash and gflash are embedded flash drive. The only difference is mflash is -MCP(Multi Chip Package) device. These two device operate exactly same way. -So the rest mflash repersents mflash and gflash altogether.
- -Internally, mflash has nand flash and other hardware logics and supports -2 different operation (ATA, IO) modes. ATA mode doesn't need any new -driver and currently works well under standard IDE subsystem. Actually it's -one chip SSD. IO mode is ATA-like custom mode for the host that doesn't have -IDE interface. - -Following are brief descriptions about IO mode. -A. IO mode based on ATA protocol and uses some custom command. (read confirm, -write confirm) -B. IO mode uses SRAM bus interface. -C. IO mode supports 4kB boot area, so host can boot from mflash. - -2. Reserved area configuration -If host boot from mflash, usually needs raw area for boot loader image. All of -the mflash's block device operation will be taken this value as start offset. -Note that boot loader's size of reserved area and kernel configuration value -must be same. - -3. Example of mflash platform driver registration -Working mflash is very straight forward. Adding platform device stuff to board -configuration file is all. Here is some pseudo example. - -static struct mg_drv_data mflash_drv_data = { - /* If you want to polling driver set to 1 */ - .use_polling = 0, - /* device attribution */ - .dev_attr = MG_BOOT_DEV -}; - -static struct resource mg_mflash_rsc[] = { - /* Base address of mflash */ - [0] = { - .start = 0x08000000, - .end = 0x08000000 + SZ_64K - 1, - .flags = IORESOURCE_MEM - }, - /* mflash interrupt pin */ - [1] = { - .start = IRQ_GPIO(84), - .end = IRQ_GPIO(84), - .flags = IORESOURCE_IRQ - }, - /* mflash reset pin */ - [2] = { - .start = 43, - .end = 43, - .name = MG_RST_PIN, - .flags = IORESOURCE_IO - }, - /* mflash reset-out pin - * If you use mflash as storage device (i.e. other than MG_BOOT_DEV), - * should assign this */ - [3] = { - .start = 51, - .end = 51, - .name = MG_RSTOUT_PIN, - .flags = IORESOURCE_IO - } -}; - -static struct platform_device mflash_dev = { - .name = MG_DEV_NAME, - .id = -1, - .dev = { - .platform_data = &mflash_drv_data, - }, - .num_resources = ARRAY_SIZE(mg_mflash_rsc), - .resource = mg_mflash_rsc -}; - -platform_device_register(&mflash_dev); diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index a1c2e816128f..ebe8c1a6195e 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -434,23 +434,6 @@ config ATA_OVER_ETH This driver provides Support for ATA over Ethernet block devices like the Coraid EtherDrive (R) Storage Blade. -config MG_DISK - tristate "mGine mflash, gflash support" - depends on ARM && GPIOLIB - help - mGine mFlash(gFlash) block device driver - -config MG_DISK_RES - int "Size of reserved area before MBR" - depends on MG_DISK - default 0 - help - Define size of reserved area that usually used for boot. Unit is KB. 
- All of the block device operation will be taken this value as start - offset - Examples: - 1024 => 1 MB - config SUNVDC tristate "Sun Virtual Disk Client support" depends on SUN_LDOMS diff --git a/drivers/block/Makefile b/drivers/block/Makefile index b12c772bbeb3..5ceead8b52d7 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -19,7 +19,6 @@ obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o obj-$(CONFIG_BLK_DEV_DAC960) += DAC960.o obj-$(CONFIG_XILINX_SYSACE) += xsysace.o obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o -obj-$(CONFIG_MG_DISK) += mg_disk.o obj-$(CONFIG_SUNVDC) += sunvdc.o obj-$(CONFIG_BLK_DEV_SKD) += skd.o obj-$(CONFIG_BLK_DEV_OSD) += osdblk.o diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c deleted file mode 100644 index e88e7b06c616..000000000000 --- a/drivers/block/mg_disk.c +++ /dev/null @@ -1,1110 +0,0 @@ -/* - * drivers/block/mg_disk.c - * - * Support for the mGine m[g]flash IO mode. - * Based on legacy hd.c - * - * (c) 2008 mGine Co.,LTD - * (c) 2008 unsik Kim - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MG_RES_SEC (CONFIG_MG_DISK_RES << 1) - -/* name for block device */ -#define MG_DISK_NAME "mgd" - -#define MG_DISK_MAJ 0 -#define MG_DISK_MAX_PART 16 -#define MG_SECTOR_SIZE 512 -#define MG_MAX_SECTS 256 - -/* Register offsets */ -#define MG_BUFF_OFFSET 0x8000 -#define MG_REG_OFFSET 0xC000 -#define MG_REG_FEATURE (MG_REG_OFFSET + 2) /* write case */ -#define MG_REG_ERROR (MG_REG_OFFSET + 2) /* read case */ -#define MG_REG_SECT_CNT (MG_REG_OFFSET + 4) -#define MG_REG_SECT_NUM (MG_REG_OFFSET + 6) -#define MG_REG_CYL_LOW (MG_REG_OFFSET + 8) -#define MG_REG_CYL_HIGH (MG_REG_OFFSET + 0xA) -#define MG_REG_DRV_HEAD (MG_REG_OFFSET + 0xC) -#define MG_REG_COMMAND (MG_REG_OFFSET + 0xE) /* write case */ -#define MG_REG_STATUS (MG_REG_OFFSET + 0xE) /* read case */ -#define MG_REG_DRV_CTRL (MG_REG_OFFSET + 0x10) -#define MG_REG_BURST_CTRL (MG_REG_OFFSET + 0x12) - -/* handy status */ -#define MG_STAT_READY (ATA_DRDY | ATA_DSC) -#define MG_READY_OK(s) (((s) & (MG_STAT_READY | (ATA_BUSY | ATA_DF | \ - ATA_ERR))) == MG_STAT_READY) - -/* error code for others */ -#define MG_ERR_NONE 0 -#define MG_ERR_TIMEOUT 0x100 -#define MG_ERR_INIT_STAT 0x101 -#define MG_ERR_TRANSLATION 0x102 -#define MG_ERR_CTRL_RST 0x103 -#define MG_ERR_INV_STAT 0x104 -#define MG_ERR_RSTOUT 0x105 - -#define MG_MAX_ERRORS 6 /* Max read/write errors */ - -/* command */ -#define MG_CMD_RD 0x20 -#define MG_CMD_WR 0x30 -#define MG_CMD_SLEEP 0x99 -#define MG_CMD_WAKEUP 0xC3 -#define MG_CMD_ID 0xEC -#define MG_CMD_WR_CONF 0x3C -#define MG_CMD_RD_CONF 0x40 - -/* operation mode */ -#define MG_OP_CASCADE (1 << 0) -#define MG_OP_CASCADE_SYNC_RD (1 << 1) -#define MG_OP_CASCADE_SYNC_WR (1 << 2) -#define MG_OP_INTERLEAVE (1 << 3) - -/* synchronous */ -#define MG_BURST_LAT_4 (3 << 4) -#define MG_BURST_LAT_5 (4 << 4) -#define MG_BURST_LAT_6 (5 << 4) -#define MG_BURST_LAT_7 (6 << 4) -#define MG_BURST_LAT_8 (7 << 4) -#define MG_BURST_LEN_4 (1 << 1) -#define MG_BURST_LEN_8 (2 << 1) -#define MG_BURST_LEN_16 (3 << 1) -#define MG_BURST_LEN_32 (4 << 1) -#define MG_BURST_LEN_CONT (0 << 1) - -/* timeout value (unit: ms) */ -#define MG_TMAX_CONF_TO_CMD 1 -#define MG_TMAX_WAIT_RD_DRQ 10 -#define MG_TMAX_WAIT_WR_DRQ 500 -#define 
MG_TMAX_RST_TO_BUSY 10 -#define MG_TMAX_HDRST_TO_RDY 500 -#define MG_TMAX_SWRST_TO_RDY 500 -#define MG_TMAX_RSTOUT 3000 - -#define MG_DEV_MASK (MG_BOOT_DEV | MG_STORAGE_DEV | MG_STORAGE_DEV_SKIP_RST) - -/* main structure for mflash driver */ -struct mg_host { - struct device *dev; - - struct request_queue *breq; - struct request *req; - spinlock_t lock; - struct gendisk *gd; - - struct timer_list timer; - void (*mg_do_intr) (struct mg_host *); - - u16 id[ATA_ID_WORDS]; - - u16 cyls; - u16 heads; - u16 sectors; - u32 n_sectors; - u32 nres_sectors; - - void __iomem *dev_base; - unsigned int irq; - unsigned int rst; - unsigned int rstout; - - u32 major; - u32 error; -}; - -/* - * Debugging macro and defines - */ -#undef DO_MG_DEBUG -#ifdef DO_MG_DEBUG -# define MG_DBG(fmt, args...) \ - printk(KERN_DEBUG "%s:%d "fmt, __func__, __LINE__, ##args) -#else /* CONFIG_MG_DEBUG */ -# define MG_DBG(fmt, args...) do { } while (0) -#endif /* CONFIG_MG_DEBUG */ - -static void mg_request(struct request_queue *); - -static bool mg_end_request(struct mg_host *host, int err, unsigned int nr_bytes) -{ - if (__blk_end_request(host->req, err, nr_bytes)) - return true; - - host->req = NULL; - return false; -} - -static bool mg_end_request_cur(struct mg_host *host, int err) -{ - return mg_end_request(host, err, blk_rq_cur_bytes(host->req)); -} - -static void mg_dump_status(const char *msg, unsigned int stat, - struct mg_host *host) -{ - char *name = MG_DISK_NAME; - - if (host->req) - name = host->req->rq_disk->disk_name; - - printk(KERN_ERR "%s: %s: status=0x%02x { ", name, msg, stat & 0xff); - if (stat & ATA_BUSY) - printk("Busy "); - if (stat & ATA_DRDY) - printk("DriveReady "); - if (stat & ATA_DF) - printk("WriteFault "); - if (stat & ATA_DSC) - printk("SeekComplete "); - if (stat & ATA_DRQ) - printk("DataRequest "); - if (stat & ATA_CORR) - printk("CorrectedError "); - if (stat & ATA_ERR) - printk("Error "); - printk("}\n"); - if ((stat & ATA_ERR) == 0) { - host->error = 0; - } else { - host->error = inb((unsigned long)host->dev_base + MG_REG_ERROR); - printk(KERN_ERR "%s: %s: error=0x%02x { ", name, msg, - host->error & 0xff); - if (host->error & ATA_BBK) - printk("BadSector "); - if (host->error & ATA_UNC) - printk("UncorrectableError "); - if (host->error & ATA_IDNF) - printk("SectorIdNotFound "); - if (host->error & ATA_ABORTED) - printk("DriveStatusError "); - if (host->error & ATA_AMNF) - printk("AddrMarkNotFound "); - printk("}"); - if (host->error & (ATA_BBK | ATA_UNC | ATA_IDNF | ATA_AMNF)) { - if (host->req) - printk(", sector=%u", - (unsigned int)blk_rq_pos(host->req)); - } - printk("\n"); - } -} - -static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec) -{ - u8 status; - unsigned long expire, cur_jiffies; - struct mg_drv_data *prv_data = host->dev->platform_data; - - host->error = MG_ERR_NONE; - expire = jiffies + msecs_to_jiffies(msec); - - /* These 2 times dummy status read prevents reading invalid - * status. A very little time (3 times of mflash operating clk) - * is required for busy bit is set. Use dummy read instead of - * busy wait, because mflash's PLL is machine dependent. - */ - if (prv_data->use_polling) { - status = inb((unsigned long)host->dev_base + MG_REG_STATUS); - status = inb((unsigned long)host->dev_base + MG_REG_STATUS); - } - - status = inb((unsigned long)host->dev_base + MG_REG_STATUS); - - do { - cur_jiffies = jiffies; - if (status & ATA_BUSY) { - if (expect == ATA_BUSY) - break; - } else { - /* Check the error condition! 
*/ - if (status & ATA_ERR) { - mg_dump_status("mg_wait", status, host); - break; - } - - if (expect == MG_STAT_READY) - if (MG_READY_OK(status)) - break; - - if (expect == ATA_DRQ) - if (status & ATA_DRQ) - break; - } - if (!msec) { - mg_dump_status("not ready", status, host); - return MG_ERR_INV_STAT; - } - - status = inb((unsigned long)host->dev_base + MG_REG_STATUS); - } while (time_before(cur_jiffies, expire)); - - if (time_after_eq(cur_jiffies, expire) && msec) - host->error = MG_ERR_TIMEOUT; - - return host->error; -} - -static unsigned int mg_wait_rstout(u32 rstout, u32 msec) -{ - unsigned long expire; - - expire = jiffies + msecs_to_jiffies(msec); - while (time_before(jiffies, expire)) { - if (gpio_get_value(rstout) == 1) - return MG_ERR_NONE; - msleep(10); - } - - return MG_ERR_RSTOUT; -} - -static void mg_unexpected_intr(struct mg_host *host) -{ - u32 status = inb((unsigned long)host->dev_base + MG_REG_STATUS); - - mg_dump_status("mg_unexpected_intr", status, host); -} - -static irqreturn_t mg_irq(int irq, void *dev_id) -{ - struct mg_host *host = dev_id; - void (*handler)(struct mg_host *) = host->mg_do_intr; - - spin_lock(&host->lock); - - host->mg_do_intr = NULL; - del_timer(&host->timer); - if (!handler) - handler = mg_unexpected_intr; - handler(host); - - spin_unlock(&host->lock); - - return IRQ_HANDLED; -} - -/* local copy of ata_id_string() */ -static void mg_id_string(const u16 *id, unsigned char *s, - unsigned int ofs, unsigned int len) -{ - unsigned int c; - - BUG_ON(len & 1); - - while (len > 0) { - c = id[ofs] >> 8; - *s = c; - s++; - - c = id[ofs] & 0xff; - *s = c; - s++; - - ofs++; - len -= 2; - } -} - -/* local copy of ata_id_c_string() */ -static void mg_id_c_string(const u16 *id, unsigned char *s, - unsigned int ofs, unsigned int len) -{ - unsigned char *p; - - mg_id_string(id, s, ofs, len - 1); - - p = s + strnlen(s, len - 1); - while (p > s && p[-1] == ' ') - p--; - *p = '\0'; -} - -static int mg_get_disk_id(struct mg_host *host) -{ - u32 i; - s32 err; - const u16 *id = host->id; - struct mg_drv_data *prv_data = host->dev->platform_data; - char fwrev[ATA_ID_FW_REV_LEN + 1]; - char model[ATA_ID_PROD_LEN + 1]; - char serial[ATA_ID_SERNO_LEN + 1]; - - if (!prv_data->use_polling) - outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); - - outb(MG_CMD_ID, (unsigned long)host->dev_base + MG_REG_COMMAND); - err = mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_RD_DRQ); - if (err) - return err; - - for (i = 0; i < (MG_SECTOR_SIZE >> 1); i++) - host->id[i] = le16_to_cpu(inw((unsigned long)host->dev_base + - MG_BUFF_OFFSET + i * 2)); - - outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); - err = mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD); - if (err) - return err; - - if ((id[ATA_ID_FIELD_VALID] & 1) == 0) - return MG_ERR_TRANSLATION; - - host->n_sectors = ata_id_u32(id, ATA_ID_LBA_CAPACITY); - host->cyls = id[ATA_ID_CYLS]; - host->heads = id[ATA_ID_HEADS]; - host->sectors = id[ATA_ID_SECTORS]; - - if (MG_RES_SEC && host->heads && host->sectors) { - /* modify cyls, n_sectors */ - host->cyls = (host->n_sectors - MG_RES_SEC) / - host->heads / host->sectors; - host->nres_sectors = host->n_sectors - host->cyls * - host->heads * host->sectors; - host->n_sectors -= host->nres_sectors; - } - - mg_id_c_string(id, fwrev, ATA_ID_FW_REV, sizeof(fwrev)); - mg_id_c_string(id, model, ATA_ID_PROD, sizeof(model)); - mg_id_c_string(id, serial, ATA_ID_SERNO, sizeof(serial)); - printk(KERN_INFO "mg_disk: model: %s\n", model); - printk(KERN_INFO "mg_disk: firm: 
%.8s\n", fwrev); - printk(KERN_INFO "mg_disk: serial: %s\n", serial); - printk(KERN_INFO "mg_disk: %d + reserved %d sectors\n", - host->n_sectors, host->nres_sectors); - - if (!prv_data->use_polling) - outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); - - return err; -} - - -static int mg_disk_init(struct mg_host *host) -{ - struct mg_drv_data *prv_data = host->dev->platform_data; - s32 err; - u8 init_status; - - /* hdd rst low */ - gpio_set_value(host->rst, 0); - err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY); - if (err) - return err; - - /* hdd rst high */ - gpio_set_value(host->rst, 1); - err = mg_wait(host, MG_STAT_READY, MG_TMAX_HDRST_TO_RDY); - if (err) - return err; - - /* soft reset on */ - outb(ATA_SRST | (prv_data->use_polling ? ATA_NIEN : 0), - (unsigned long)host->dev_base + MG_REG_DRV_CTRL); - err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY); - if (err) - return err; - - /* soft reset off */ - outb(prv_data->use_polling ? ATA_NIEN : 0, - (unsigned long)host->dev_base + MG_REG_DRV_CTRL); - err = mg_wait(host, MG_STAT_READY, MG_TMAX_SWRST_TO_RDY); - if (err) - return err; - - init_status = inb((unsigned long)host->dev_base + MG_REG_STATUS) & 0xf; - - if (init_status == 0xf) - return MG_ERR_INIT_STAT; - - return err; -} - -static void mg_bad_rw_intr(struct mg_host *host) -{ - if (host->req) - if (++host->req->errors >= MG_MAX_ERRORS || - host->error == MG_ERR_TIMEOUT) - mg_end_request_cur(host, -EIO); -} - -static unsigned int mg_out(struct mg_host *host, - unsigned int sect_num, - unsigned int sect_cnt, - unsigned int cmd, - void (*intr_addr)(struct mg_host *)) -{ - struct mg_drv_data *prv_data = host->dev->platform_data; - - if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) - return host->error; - - if (!prv_data->use_polling) { - host->mg_do_intr = intr_addr; - mod_timer(&host->timer, jiffies + 3 * HZ); - } - if (MG_RES_SEC) - sect_num += MG_RES_SEC; - outb((u8)sect_cnt, (unsigned long)host->dev_base + MG_REG_SECT_CNT); - outb((u8)sect_num, (unsigned long)host->dev_base + MG_REG_SECT_NUM); - outb((u8)(sect_num >> 8), (unsigned long)host->dev_base + - MG_REG_CYL_LOW); - outb((u8)(sect_num >> 16), (unsigned long)host->dev_base + - MG_REG_CYL_HIGH); - outb((u8)((sect_num >> 24) | ATA_LBA | ATA_DEVICE_OBS), - (unsigned long)host->dev_base + MG_REG_DRV_HEAD); - outb(cmd, (unsigned long)host->dev_base + MG_REG_COMMAND); - return MG_ERR_NONE; -} - -static void mg_read_one(struct mg_host *host, struct request *req) -{ - u16 *buff = (u16 *)bio_data(req->bio); - u32 i; - - for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) - *buff++ = inw((unsigned long)host->dev_base + MG_BUFF_OFFSET + - (i << 1)); -} - -static void mg_read(struct request *req) -{ - struct mg_host *host = req->rq_disk->private_data; - - if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req), - MG_CMD_RD, NULL) != MG_ERR_NONE) - mg_bad_rw_intr(host); - - MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", - blk_rq_sectors(req), blk_rq_pos(req), bio_data(req->bio)); - - do { - if (mg_wait(host, ATA_DRQ, - MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) { - mg_bad_rw_intr(host); - return; - } - - mg_read_one(host, req); - - outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + - MG_REG_COMMAND); - } while (mg_end_request(host, 0, MG_SECTOR_SIZE)); -} - -static void mg_write_one(struct mg_host *host, struct request *req) -{ - u16 *buff = (u16 *)bio_data(req->bio); - u32 i; - - for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) - outw(*buff++, (unsigned long)host->dev_base + MG_BUFF_OFFSET + - (i << 1)); -} - -static void 
mg_write(struct request *req) -{ - struct mg_host *host = req->rq_disk->private_data; - unsigned int rem = blk_rq_sectors(req); - - if (mg_out(host, blk_rq_pos(req), rem, - MG_CMD_WR, NULL) != MG_ERR_NONE) { - mg_bad_rw_intr(host); - return; - } - - MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", - rem, blk_rq_pos(req), bio_data(req->bio)); - - if (mg_wait(host, ATA_DRQ, - MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { - mg_bad_rw_intr(host); - return; - } - - do { - mg_write_one(host, req); - - outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + - MG_REG_COMMAND); - - rem--; - if (rem > 1 && mg_wait(host, ATA_DRQ, - MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { - mg_bad_rw_intr(host); - return; - } else if (mg_wait(host, MG_STAT_READY, - MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { - mg_bad_rw_intr(host); - return; - } - } while (mg_end_request(host, 0, MG_SECTOR_SIZE)); -} - -static void mg_read_intr(struct mg_host *host) -{ - struct request *req = host->req; - u32 i; - - /* check status */ - do { - i = inb((unsigned long)host->dev_base + MG_REG_STATUS); - if (i & ATA_BUSY) - break; - if (!MG_READY_OK(i)) - break; - if (i & ATA_DRQ) - goto ok_to_read; - } while (0); - mg_dump_status("mg_read_intr", i, host); - mg_bad_rw_intr(host); - mg_request(host->breq); - return; - -ok_to_read: - mg_read_one(host, req); - - MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", - blk_rq_pos(req), blk_rq_sectors(req) - 1, bio_data(req->bio)); - - /* send read confirm */ - outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); - - if (mg_end_request(host, 0, MG_SECTOR_SIZE)) { - /* set handler if read remains */ - host->mg_do_intr = mg_read_intr; - mod_timer(&host->timer, jiffies + 3 * HZ); - } else /* goto next request */ - mg_request(host->breq); -} - -static void mg_write_intr(struct mg_host *host) -{ - struct request *req = host->req; - u32 i; - bool rem; - - /* check status */ - do { - i = inb((unsigned long)host->dev_base + MG_REG_STATUS); - if (i & ATA_BUSY) - break; - if (!MG_READY_OK(i)) - break; - if ((blk_rq_sectors(req) <= 1) || (i & ATA_DRQ)) - goto ok_to_write; - } while (0); - mg_dump_status("mg_write_intr", i, host); - mg_bad_rw_intr(host); - mg_request(host->breq); - return; - -ok_to_write: - if ((rem = mg_end_request(host, 0, MG_SECTOR_SIZE))) { - /* write 1 sector and set handler if remains */ - mg_write_one(host, req); - MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", - blk_rq_pos(req), blk_rq_sectors(req), bio_data(req->bio)); - host->mg_do_intr = mg_write_intr; - mod_timer(&host->timer, jiffies + 3 * HZ); - } - - /* send write confirm */ - outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); - - if (!rem) - mg_request(host->breq); -} - -static void mg_times_out(unsigned long data) -{ - struct mg_host *host = (struct mg_host *)data; - char *name; - - spin_lock_irq(&host->lock); - - if (!host->req) - goto out_unlock; - - host->mg_do_intr = NULL; - - name = host->req->rq_disk->disk_name; - printk(KERN_DEBUG "%s: timeout\n", name); - - host->error = MG_ERR_TIMEOUT; - mg_bad_rw_intr(host); - -out_unlock: - mg_request(host->breq); - spin_unlock_irq(&host->lock); -} - -static void mg_request_poll(struct request_queue *q) -{ - struct mg_host *host = q->queuedata; - - while (1) { - if (!host->req) { - host->req = blk_fetch_request(q); - if (!host->req) - break; - } - - switch (req_op(host->req)) { - case REQ_OP_READ: - mg_read(host->req); - break; - case REQ_OP_WRITE: - mg_write(host->req); - break; - default: - mg_end_request_cur(host, -EIO); - break; - } - } -} - 
-static unsigned int mg_issue_req(struct request *req, - struct mg_host *host, - unsigned int sect_num, - unsigned int sect_cnt) -{ - switch (req_op(host->req)) { - case REQ_OP_READ: - if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr) - != MG_ERR_NONE) { - mg_bad_rw_intr(host); - return host->error; - } - break; - case REQ_OP_WRITE: - /* TODO : handler */ - outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); - if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr) - != MG_ERR_NONE) { - mg_bad_rw_intr(host); - return host->error; - } - del_timer(&host->timer); - mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ); - outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); - if (host->error) { - mg_bad_rw_intr(host); - return host->error; - } - mg_write_one(host, req); - mod_timer(&host->timer, jiffies + 3 * HZ); - outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + - MG_REG_COMMAND); - break; - default: - mg_end_request_cur(host, -EIO); - break; - } - return MG_ERR_NONE; -} - -/* This function also called from IRQ context */ -static void mg_request(struct request_queue *q) -{ - struct mg_host *host = q->queuedata; - struct request *req; - u32 sect_num, sect_cnt; - - while (1) { - if (!host->req) { - host->req = blk_fetch_request(q); - if (!host->req) - break; - } - req = host->req; - - /* check unwanted request call */ - if (host->mg_do_intr) - return; - - del_timer(&host->timer); - - sect_num = blk_rq_pos(req); - /* deal whole segments */ - sect_cnt = blk_rq_sectors(req); - - /* sanity check */ - if (sect_num >= get_capacity(req->rq_disk) || - ((sect_num + sect_cnt) > - get_capacity(req->rq_disk))) { - printk(KERN_WARNING - "%s: bad access: sector=%d, count=%d\n", - req->rq_disk->disk_name, - sect_num, sect_cnt); - mg_end_request_cur(host, -EIO); - continue; - } - - if (!mg_issue_req(req, host, sect_num, sect_cnt)) - return; - } -} - -static int mg_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - struct mg_host *host = bdev->bd_disk->private_data; - - geo->cylinders = (unsigned short)host->cyls; - geo->heads = (unsigned char)host->heads; - geo->sectors = (unsigned char)host->sectors; - return 0; -} - -static const struct block_device_operations mg_disk_ops = { - .getgeo = mg_getgeo -}; - -#ifdef CONFIG_PM_SLEEP -static int mg_suspend(struct device *dev) -{ - struct mg_drv_data *prv_data = dev->platform_data; - struct mg_host *host = prv_data->host; - - if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) - return -EIO; - - if (!prv_data->use_polling) - outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); - - outb(MG_CMD_SLEEP, (unsigned long)host->dev_base + MG_REG_COMMAND); - /* wait until mflash deep sleep */ - msleep(1); - - if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) { - if (!prv_data->use_polling) - outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); - return -EIO; - } - - return 0; -} - -static int mg_resume(struct device *dev) -{ - struct mg_drv_data *prv_data = dev->platform_data; - struct mg_host *host = prv_data->host; - - if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) - return -EIO; - - outb(MG_CMD_WAKEUP, (unsigned long)host->dev_base + MG_REG_COMMAND); - /* wait until mflash wakeup */ - msleep(1); - - if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) - return -EIO; - - if (!prv_data->use_polling) - outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); - - return 0; -} -#endif - -static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume); - -static int mg_probe(struct platform_device 
*plat_dev) -{ - struct mg_host *host; - struct resource *rsc; - struct mg_drv_data *prv_data = plat_dev->dev.platform_data; - int err = 0; - - if (!prv_data) { - printk(KERN_ERR "%s:%d fail (no driver_data)\n", - __func__, __LINE__); - err = -EINVAL; - goto probe_err; - } - - /* alloc mg_host */ - host = kzalloc(sizeof(struct mg_host), GFP_KERNEL); - if (!host) { - printk(KERN_ERR "%s:%d fail (no memory for mg_host)\n", - __func__, __LINE__); - err = -ENOMEM; - goto probe_err; - } - host->major = MG_DISK_MAJ; - - /* link each other */ - prv_data->host = host; - host->dev = &plat_dev->dev; - - /* io remap */ - rsc = platform_get_resource(plat_dev, IORESOURCE_MEM, 0); - if (!rsc) { - printk(KERN_ERR "%s:%d platform_get_resource fail\n", - __func__, __LINE__); - err = -EINVAL; - goto probe_err_2; - } - host->dev_base = ioremap(rsc->start, resource_size(rsc)); - if (!host->dev_base) { - printk(KERN_ERR "%s:%d ioremap fail\n", - __func__, __LINE__); - err = -EIO; - goto probe_err_2; - } - MG_DBG("dev_base = 0x%x\n", (u32)host->dev_base); - - /* get reset pin */ - rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO, - MG_RST_PIN); - if (!rsc) { - printk(KERN_ERR "%s:%d get reset pin fail\n", - __func__, __LINE__); - err = -EIO; - goto probe_err_3; - } - host->rst = rsc->start; - - /* init rst pin */ - err = gpio_request(host->rst, MG_RST_PIN); - if (err) - goto probe_err_3; - gpio_direction_output(host->rst, 1); - - /* reset out pin */ - if (!(prv_data->dev_attr & MG_DEV_MASK)) { - err = -EINVAL; - goto probe_err_3a; - } - - if (prv_data->dev_attr != MG_BOOT_DEV) { - rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO, - MG_RSTOUT_PIN); - if (!rsc) { - printk(KERN_ERR "%s:%d get reset-out pin fail\n", - __func__, __LINE__); - err = -EIO; - goto probe_err_3a; - } - host->rstout = rsc->start; - err = gpio_request(host->rstout, MG_RSTOUT_PIN); - if (err) - goto probe_err_3a; - gpio_direction_input(host->rstout); - } - - /* disk reset */ - if (prv_data->dev_attr == MG_STORAGE_DEV) { - /* If POR seq. 
not yet finished, wait */ - err = mg_wait_rstout(host->rstout, MG_TMAX_RSTOUT); - if (err) - goto probe_err_3b; - err = mg_disk_init(host); - if (err) { - printk(KERN_ERR "%s:%d fail (err code : %d)\n", - __func__, __LINE__, err); - err = -EIO; - goto probe_err_3b; - } - } - - /* get irq resource */ - if (!prv_data->use_polling) { - host->irq = platform_get_irq(plat_dev, 0); - if (host->irq == -ENXIO) { - err = host->irq; - goto probe_err_3b; - } - err = request_irq(host->irq, mg_irq, - IRQF_TRIGGER_RISING, - MG_DEV_NAME, host); - if (err) { - printk(KERN_ERR "%s:%d fail (request_irq err=%d)\n", - __func__, __LINE__, err); - goto probe_err_3b; - } - - } - - /* get disk id */ - err = mg_get_disk_id(host); - if (err) { - printk(KERN_ERR "%s:%d fail (err code : %d)\n", - __func__, __LINE__, err); - err = -EIO; - goto probe_err_4; - } - - err = register_blkdev(host->major, MG_DISK_NAME); - if (err < 0) { - printk(KERN_ERR "%s:%d register_blkdev fail (err code : %d)\n", - __func__, __LINE__, err); - goto probe_err_4; - } - if (!host->major) - host->major = err; - - spin_lock_init(&host->lock); - - if (prv_data->use_polling) - host->breq = blk_init_queue(mg_request_poll, &host->lock); - else - host->breq = blk_init_queue(mg_request, &host->lock); - - if (!host->breq) { - err = -ENOMEM; - printk(KERN_ERR "%s:%d (blk_init_queue) fail\n", - __func__, __LINE__); - goto probe_err_5; - } - host->breq->queuedata = host; - - /* mflash is random device, thanx for the noop */ - err = elevator_change(host->breq, "noop"); - if (err) { - printk(KERN_ERR "%s:%d (elevator_init) fail\n", - __func__, __LINE__); - goto probe_err_6; - } - blk_queue_max_hw_sectors(host->breq, MG_MAX_SECTS); - blk_queue_logical_block_size(host->breq, MG_SECTOR_SIZE); - - setup_timer(&host->timer, mg_times_out, (unsigned long)host); - - host->gd = alloc_disk(MG_DISK_MAX_PART); - if (!host->gd) { - printk(KERN_ERR "%s:%d (alloc_disk) fail\n", - __func__, __LINE__); - err = -ENOMEM; - goto probe_err_7; - } - host->gd->major = host->major; - host->gd->first_minor = 0; - host->gd->fops = &mg_disk_ops; - host->gd->queue = host->breq; - host->gd->private_data = host; - sprintf(host->gd->disk_name, MG_DISK_NAME"a"); - - set_capacity(host->gd, host->n_sectors); - - add_disk(host->gd); - - return err; - -probe_err_7: - del_timer_sync(&host->timer); -probe_err_6: - blk_cleanup_queue(host->breq); -probe_err_5: - unregister_blkdev(host->major, MG_DISK_NAME); -probe_err_4: - if (!prv_data->use_polling) - free_irq(host->irq, host); -probe_err_3b: - gpio_free(host->rstout); -probe_err_3a: - gpio_free(host->rst); -probe_err_3: - iounmap(host->dev_base); -probe_err_2: - kfree(host); -probe_err: - return err; -} - -static int mg_remove(struct platform_device *plat_dev) -{ - struct mg_drv_data *prv_data = plat_dev->dev.platform_data; - struct mg_host *host = prv_data->host; - int err = 0; - - /* delete timer */ - del_timer_sync(&host->timer); - - /* remove disk */ - if (host->gd) { - del_gendisk(host->gd); - put_disk(host->gd); - } - /* remove queue */ - if (host->breq) - blk_cleanup_queue(host->breq); - - /* unregister blk device */ - unregister_blkdev(host->major, MG_DISK_NAME); - - /* free irq */ - if (!prv_data->use_polling) - free_irq(host->irq, host); - - /* free reset-out pin */ - if (prv_data->dev_attr != MG_BOOT_DEV) - gpio_free(host->rstout); - - /* free rst pin */ - if (host->rst) - gpio_free(host->rst); - - /* unmap io */ - if (host->dev_base) - iounmap(host->dev_base); - - /* free mg_host */ - kfree(host); - - return err; -} - -static 
struct platform_driver mg_disk_driver = { - .probe = mg_probe, - .remove = mg_remove, - .driver = { - .name = MG_DEV_NAME, - .pm = &mg_pm, - } -}; - -/**************************************************************************** - * - * Module stuff - * - ****************************************************************************/ - -static int __init mg_init(void) -{ - printk(KERN_INFO "mGine mflash driver, (c) 2008 mGine Co.\n"); - return platform_driver_register(&mg_disk_driver); -} - -static void __exit mg_exit(void) -{ - printk(KERN_INFO "mflash driver : bye bye\n"); - platform_driver_unregister(&mg_disk_driver); -} - -module_init(mg_init); -module_exit(mg_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("unsik Kim "); -MODULE_DESCRIPTION("mGine m[g]flash device driver"); diff --git a/include/linux/mg_disk.h b/include/linux/mg_disk.h deleted file mode 100644 index e11f4d9f1c2e..000000000000 --- a/include/linux/mg_disk.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * include/linux/mg_disk.c - * - * Private data for mflash platform driver - * - * (c) 2008 mGine Co.,LTD - * (c) 2008 unsik Kim - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef __MG_DISK_H__ -#define __MG_DISK_H__ - -/* name for platform device */ -#define MG_DEV_NAME "mg_disk" - -/* names of GPIO resource */ -#define MG_RST_PIN "mg_rst" -/* except MG_BOOT_DEV, reset-out pin should be assigned */ -#define MG_RSTOUT_PIN "mg_rstout" - -/* device attribution */ -/* use mflash as boot device */ -#define MG_BOOT_DEV (1 << 0) -/* use mflash as storage device */ -#define MG_STORAGE_DEV (1 << 1) -/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */ -#define MG_STORAGE_DEV_SKIP_RST (1 << 2) - -/* private driver data */ -struct mg_drv_data { - /* disk resource */ - u32 use_polling; - - /* device attribution */ - u32 dev_attr; - - /* internally used */ - void *host; -}; - -#endif -- cgit v1.2.3 From c05e66733788118377c21a913c1bc7b64bccc167 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 14 Apr 2017 00:59:58 -0700 Subject: sbitmap: add sbitmap_get_shallow() operation This operation supports the use case of limiting the number of bits that can be allocated for a given operation. Rather than setting aside some bits at the end of the bitmap, we can set aside bits in each word of the bitmap. This means we can keep the allocation hints spread out and support sbitmap_resize() nicely at the cost of lower granularity for the allowed depth. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 55 ++++++++++++++++++++++++++++++++++++ lib/sbitmap.c | 75 ++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 123 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index d4e0a204c118..a1904aadbc45 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -175,6 +175,25 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth); */ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin); +/** + * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap, + * limiting the depth used from each word. + * @sb: Bitmap to allocate from. + * @alloc_hint: Hint for where to start searching for a free bit. + * @shallow_depth: The maximum number of bits to allocate from a single word. 
+ * + * This rather specific operation allows for having multiple users with + * different allocation limits. E.g., there can be a high-priority class that + * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow() + * with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority + * class can only allocate half of the total bits in the bitmap, preventing it + * from starving out the high-priority class. + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. + */ +int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint, + unsigned long shallow_depth); + /** * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap. * @sb: Bitmap to check. @@ -325,6 +344,19 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth); */ int __sbitmap_queue_get(struct sbitmap_queue *sbq); +/** + * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct + * sbitmap_queue, limiting the depth used from each word, with preemption + * already disabled. + * @sbq: Bitmap queue to allocate from. + * @shallow_depth: The maximum number of bits to allocate from a single word. + * See sbitmap_get_shallow(). + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. + */ +int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, + unsigned int shallow_depth); + /** * sbitmap_queue_get() - Try to allocate a free bit from a &struct * sbitmap_queue. @@ -345,6 +377,29 @@ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, return nr; } +/** + * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct + * sbitmap_queue, limiting the depth used from each word. + * @sbq: Bitmap queue to allocate from. + * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to + * sbitmap_queue_clear()). + * @shallow_depth: The maximum number of bits to allocate from a single word. + * See sbitmap_get_shallow(). + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. + */ +static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, + unsigned int *cpu, + unsigned int shallow_depth) +{ + int nr; + + *cpu = get_cpu(); + nr = __sbitmap_queue_get_shallow(sbq, shallow_depth); + put_cpu(); + return nr; +} + /** * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a * &struct sbitmap_queue. 
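/*
 * Hypothetical usage sketch of the API added above (not taken from this
 * patch): the priority split described in the commit message, assuming an
 * already-initialized sbitmap_queue. The high-priority path may use every
 * bit of each word, while the low-priority path caps itself at half a
 * word, so it can never starve the high-priority class. All "example_*"
 * names are invented.
 */
#include <linux/sbitmap.h>

static int example_get_hiprio_tag(struct sbitmap_queue *sbq, unsigned int *cpu)
{
	return sbitmap_queue_get(sbq, cpu);	/* full per-word depth */
}

static int example_get_loprio_tag(struct sbitmap_queue *sbq, unsigned int *cpu)
{
	/* cap at half of each word: 1 << (shift - 1), per the kernel-doc */
	return sbitmap_queue_get_shallow(sbq, cpu, 1U << (sbq->sb.shift - 1));
}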
diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 60e800e0b5a0..80aa8d5463fa 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -79,15 +79,15 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth) } EXPORT_SYMBOL_GPL(sbitmap_resize); -static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint, - bool wrap) +static int __sbitmap_get_word(unsigned long *word, unsigned long depth, + unsigned int hint, bool wrap) { unsigned int orig_hint = hint; int nr; while (1) { - nr = find_next_zero_bit(&word->word, word->depth, hint); - if (unlikely(nr >= word->depth)) { + nr = find_next_zero_bit(word, depth, hint); + if (unlikely(nr >= depth)) { /* * We started with an offset, and we didn't reset the * offset to 0 in a failure case, so start from 0 to @@ -100,11 +100,11 @@ static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint, return -1; } - if (!test_and_set_bit(nr, &word->word)) + if (!test_and_set_bit(nr, word)) break; hint = nr + 1; - if (hint >= word->depth - 1) + if (hint >= depth - 1) hint = 0; } @@ -119,7 +119,8 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin) index = SB_NR_TO_INDEX(sb, alloc_hint); for (i = 0; i < sb->map_nr; i++) { - nr = __sbitmap_get_word(&sb->map[index], + nr = __sbitmap_get_word(&sb->map[index].word, + sb->map[index].depth, SB_NR_TO_BIT(sb, alloc_hint), !round_robin); if (nr != -1) { @@ -141,6 +142,37 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin) } EXPORT_SYMBOL_GPL(sbitmap_get); +int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint, + unsigned long shallow_depth) +{ + unsigned int i, index; + int nr = -1; + + index = SB_NR_TO_INDEX(sb, alloc_hint); + + for (i = 0; i < sb->map_nr; i++) { + nr = __sbitmap_get_word(&sb->map[index].word, + min(sb->map[index].depth, shallow_depth), + SB_NR_TO_BIT(sb, alloc_hint), true); + if (nr != -1) { + nr += index << sb->shift; + break; + } + + /* Jump to next index. */ + index++; + alloc_hint = index << sb->shift; + + if (index >= sb->map_nr) { + index = 0; + alloc_hint = 0; + } + } + + return nr; +} +EXPORT_SYMBOL_GPL(sbitmap_get_shallow); + bool sbitmap_any_bit_set(const struct sbitmap *sb) { unsigned int i; @@ -342,6 +374,35 @@ int __sbitmap_queue_get(struct sbitmap_queue *sbq) } EXPORT_SYMBOL_GPL(__sbitmap_queue_get); +int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, + unsigned int shallow_depth) +{ + unsigned int hint, depth; + int nr; + + hint = this_cpu_read(*sbq->alloc_hint); + depth = READ_ONCE(sbq->sb.depth); + if (unlikely(hint >= depth)) { + hint = depth ? prandom_u32() % depth : 0; + this_cpu_write(*sbq->alloc_hint, hint); + } + nr = sbitmap_get_shallow(&sbq->sb, hint, shallow_depth); + + if (nr == -1) { + /* If the map is full, a hint won't do us much good. */ + this_cpu_write(*sbq->alloc_hint, 0); + } else if (nr == hint || unlikely(sbq->round_robin)) { + /* Only update the hint if we used it. */ + hint = nr + 1; + if (hint >= depth - 1) + hint = 0; + this_cpu_write(*sbq->alloc_hint, hint); + } + + return nr; +} +EXPORT_SYMBOL_GPL(__sbitmap_queue_get_shallow); + static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) { int i, wake_index; -- cgit v1.2.3 From 5b72727299307e53888277729f980ab03264dac8 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 14 Apr 2017 01:00:00 -0700 Subject: blk-mq: export helpers blk_mq_finish_request() is required for schedulers that define their own put_request(). 
blk_mq_run_hw_queue() is required for schedulers that hold back requests to be run later. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 ++ include/linux/blk-mq.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index e536dacfae4c..7138cd98146e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -368,6 +368,7 @@ void blk_mq_finish_request(struct request *rq) { blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq); } +EXPORT_SYMBOL_GPL(blk_mq_finish_request); void blk_mq_free_request(struct request *rq) { @@ -1183,6 +1184,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { __blk_mq_delay_run_hw_queue(hctx, async, 0); } +EXPORT_SYMBOL(blk_mq_run_hw_queue); void blk_mq_run_hw_queues(struct request_queue *q, bool async) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b90c3d5766cd..d75de612845d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -238,6 +238,7 @@ void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, -- cgit v1.2.3 From c05f8525f67b7d6489b0502211d4ed35622d9beb Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 14 Apr 2017 01:00:01 -0700 Subject: blk-mq-sched: make completed_request() callback more useful Currently, this callback is called right after put_request() and has no distinguishable purpose. Instead, let's call it before put_request() as soon as I/O has completed on the request, before we account it in blk-stat. With this, Kyber can enable stats when it sees a latency outlier and make sure the outlier gets accounted. 
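As a rough sketch of how a scheduler can use the reworked hook — the scheduler name, private data, and the stand-in outlier check below are hypothetical, not part of this series:

#include <linux/blkdev.h>
#include <linux/elevator.h>

/* All "example_*" names are invented; only the hook shape matters. */
struct example_data {
	bool stats_enabled;
};

/* Runs as soon as I/O completes, before the request's tags are freed. */
static void example_completed_request(struct request *rq)
{
	struct example_data *ed = rq->q->elevator->elevator_data;

	/* stand-in for a real latency-outlier check */
	if (!ed->stats_enabled && blk_rq_bytes(rq) == 0)
		ed->stats_enabled = true;
}

static struct elevator_type example_sched = {
	.ops.mq = {
		.completed_request	= example_completed_request,
		/* remaining mq ops omitted in this sketch */
	},
	.elevator_name = "example",
};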
Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-sched.h | 11 +++-------- block/blk-mq.c | 5 ++++- include/linux/elevator.h | 2 +- 3 files changed, 8 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index f4bc186c3440..120c6abc37cc 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -82,17 +82,12 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, return true; } -static inline void -blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq) +static inline void blk_mq_sched_completed_request(struct request *rq) { - struct elevator_queue *e = hctx->queue->elevator; + struct elevator_queue *e = rq->q->elevator; if (e && e->type->ops.mq.completed_request) - e->type->ops.mq.completed_request(hctx, rq); - - BUG_ON(rq->internal_tag == -1); - - blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag); + e->type->ops.mq.completed_request(rq); } static inline void blk_mq_sched_started_request(struct request *rq) diff --git a/block/blk-mq.c b/block/blk-mq.c index 7138cd98146e..e2ef7b460924 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -350,7 +350,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, if (rq->tag != -1) blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); if (sched_tag != -1) - blk_mq_sched_completed_request(hctx, rq); + blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag); blk_mq_sched_restart(hctx); blk_queue_exit(q); } @@ -444,6 +444,9 @@ static void __blk_mq_complete_request(struct request *rq) { struct request_queue *q = rq->q; + if (rq->internal_tag != -1) + blk_mq_sched_completed_request(rq); + blk_mq_stat_add(rq); if (!q->softirq_done_fn) diff --git a/include/linux/elevator.h b/include/linux/elevator.h index b7ec315ee7e7..3a216318ae73 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -106,7 +106,7 @@ struct elevator_mq_ops { void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); bool (*has_work)(struct blk_mq_hw_ctx *); - void (*completed_request)(struct blk_mq_hw_ctx *, struct request *); + void (*completed_request)(struct request *); void (*started_request)(struct request *); void (*requeue_request)(struct request *); struct request *(*former_request)(struct request_queue *, struct request *); -- cgit v1.2.3 From 0e8d6a9336b487a1dd6f1991ff376e669d4c87c6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 12 Apr 2017 22:07:28 +0200 Subject: workqueue: Provide work_on_cpu_safe() work_on_cpu() is not protected against CPU hotplug. For code which requires to be either executed on an online CPU or to fail if the CPU is not available the callsite would have to protect against CPU hotplug. Provide a function which does get/put_online_cpus() around the call to work_on_cpu() and fails the call with -ENODEV if the target CPU is not online. Preparatory patch to convert several racy task affinity manipulations. Signed-off-by: Thomas Gleixner Acked-by: Tejun Heo Cc: Fenghua Yu Cc: Tony Luck Cc: Herbert Xu Cc: "Rafael J. Wysocki" Cc: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Sebastian Siewior Cc: Lai Jiangshan Cc: Viresh Kumar Cc: Michael Ellerman Cc: "David S. 
Miller" Cc: Len Brown Link: http://lkml.kernel.org/r/20170412201042.262610721@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/workqueue.h | 5 +++++ kernel/workqueue.c | 23 +++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index bde063cefd04..c102ef65cb64 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -608,8 +608,13 @@ static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } +static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) +{ + return fn(arg); +} #else long work_on_cpu(int cpu, long (*fn)(void *), void *arg); +long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c0168b7da1ea..5bf1be018628 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4735,6 +4735,29 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); + +/** + * work_on_cpu_safe - run a function in thread context on a particular cpu + * @cpu: the cpu to run on + * @fn: the function to run + * @arg: the function argument + * + * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold + * any locks which would prevent @fn from completing. + * + * Return: The value @fn returns. + */ +long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) +{ + long ret = -ENODEV; + + get_online_cpus(); + if (cpu_online(cpu)) + ret = work_on_cpu(cpu, fn, arg); + put_online_cpus(); + return ret; +} +EXPORT_SYMBOL_GPL(work_on_cpu_safe); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER -- cgit v1.2.3 From 17912c49edfa6ab552329bf63d1b757eb874673b Mon Sep 17 00:00:00 2001 From: Javier González Date: Sat, 15 Apr 2017 20:55:37 +0200 Subject: lightnvm: submit erases using the I/O path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Until now erases have been submitted as synchronous commands through a dedicated erase function. In order to enable targets implementing asynchronous erases, refactor the erase path so that it uses the normal async I/O submission functions. If a target requires sync I/O, it can implement it internally. Also, adapt rrpc to use the new erase path. Signed-off-by: Javier González Fixed spelling error. 
Signed-off-by: Matias Bjørling Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 54 +++++++++++++++++++++++++++----------------- drivers/lightnvm/rrpc.c | 3 +-- drivers/nvme/host/lightnvm.c | 32 ++++++++------------------ include/linux/lightnvm.h | 8 +++---- 4 files changed, 47 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 5262ba66a7a7..95105c47e082 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -590,11 +590,11 @@ int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, memset(&rqd, 0, sizeof(struct nvm_rq)); - nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1); + nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1); nvm_rq_tgt_to_dev(tgt_dev, &rqd); ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type); - nvm_free_rqd_ppalist(dev, &rqd); + nvm_free_rqd_ppalist(tgt_dev, &rqd); if (ret) { pr_err("nvm: failed bb mark\n"); return -EINVAL; @@ -626,34 +626,45 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) } EXPORT_SYMBOL(nvm_submit_io); -int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int flags) +static void nvm_end_io_sync(struct nvm_rq *rqd) { - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_rq rqd; - int ret; + struct completion *waiting = rqd->private; - if (!dev->ops->erase_block) - return 0; + complete(waiting); +} - nvm_map_to_dev(tgt_dev, ppas); +int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, + int nr_ppas) +{ + struct nvm_geo *geo = &tgt_dev->geo; + struct nvm_rq rqd; + int ret; + DECLARE_COMPLETION_ONSTACK(wait); memset(&rqd, 0, sizeof(struct nvm_rq)); - ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, 1, 1); + rqd.opcode = NVM_OP_ERASE; + rqd.end_io = nvm_end_io_sync; + rqd.private = &wait; + rqd.flags = geo->plane_mode >> 1; + + ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1); if (ret) return ret; - nvm_rq_tgt_to_dev(tgt_dev, &rqd); - - rqd.flags = flags; - - ret = dev->ops->erase_block(dev, &rqd); + ret = nvm_submit_io(tgt_dev, &rqd); + if (ret) { + pr_err("rrpr: erase I/O submission failed: %d\n", ret); + goto free_ppa_list; + } + wait_for_completion_io(&wait); - nvm_free_rqd_ppalist(dev, &rqd); +free_ppa_list: + nvm_free_rqd_ppalist(tgt_dev, &rqd); return ret; } -EXPORT_SYMBOL(nvm_erase_blk); +EXPORT_SYMBOL(nvm_erase_sync); int nvm_get_l2p_tbl(struct nvm_tgt_dev *tgt_dev, u64 slba, u32 nlb, nvm_l2p_update_fn *update_l2p, void *priv) @@ -732,10 +743,11 @@ void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin) } EXPORT_SYMBOL(nvm_put_area); -int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd, +int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, const struct ppa_addr *ppas, int nr_ppas, int vblk) { - struct nvm_geo *geo = &dev->geo; + struct nvm_dev *dev = tgt_dev->parent; + struct nvm_geo *geo = &tgt_dev->geo; int i, plane_cnt, pl_idx; struct ppa_addr ppa; @@ -773,12 +785,12 @@ int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd, } EXPORT_SYMBOL(nvm_set_rqd_ppalist); -void nvm_free_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd) +void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) { if (!rqd->ppa_list) return; - nvm_dev_dma_free(dev, rqd->ppa_list, rqd->dma_ppa_list); + nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list); } EXPORT_SYMBOL(nvm_free_rqd_ppalist); diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index 
e68efbcf1188..4e4c2997337c 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -414,7 +414,6 @@ static void rrpc_block_gc(struct work_struct *work) struct rrpc *rrpc = gcb->rrpc; struct rrpc_block *rblk = gcb->rblk; struct rrpc_lun *rlun = rblk->rlun; - struct nvm_tgt_dev *dev = rrpc->dev; struct ppa_addr ppa; mempool_free(gcb, rrpc->gcb_pool); @@ -430,7 +429,7 @@ static void rrpc_block_gc(struct work_struct *work) ppa.g.lun = rlun->bppa.g.lun; ppa.g.blk = rblk->id; - if (nvm_erase_blk(dev, &ppa, 0)) + if (nvm_erase_sync(rrpc->dev, &ppa, 1)) goto put_back; rrpc_put_blk(rrpc, rblk); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index fd9895423f55..4ea9c93fbbe0 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -510,12 +510,16 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) } rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; - rq->ioprio = bio_prio(bio); - if (bio_has_data(bio)) - rq->nr_phys_segments = bio_phys_segments(q, bio); - - rq->__data_len = bio->bi_iter.bi_size; - rq->bio = rq->biotail = bio; + if (bio) { + rq->ioprio = bio_prio(bio); + rq->__data_len = bio->bi_iter.bi_size; + rq->bio = rq->biotail = bio; + if (bio_has_data(bio)) + rq->nr_phys_segments = bio_phys_segments(q, bio); + } else { + rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); + rq->__data_len = 0; + } nvme_nvm_rqtocmd(rq, rqd, ns, cmd); @@ -526,21 +530,6 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) return 0; } -static int nvme_nvm_erase_block(struct nvm_dev *dev, struct nvm_rq *rqd) -{ - struct request_queue *q = dev->q; - struct nvme_ns *ns = q->queuedata; - struct nvme_nvm_command c = {}; - - c.erase.opcode = NVM_OP_ERASE; - c.erase.nsid = cpu_to_le32(ns->ns_id); - c.erase.spba = cpu_to_le64(rqd->ppa_addr.ppa); - c.erase.length = cpu_to_le16(rqd->nr_ppas - 1); - c.erase.control = cpu_to_le16(rqd->flags); - - return nvme_submit_sync_cmd(q, (struct nvme_command *)&c, NULL, 0); -} - static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) { struct nvme_ns *ns = nvmdev->q->queuedata; @@ -576,7 +565,6 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .set_bb_tbl = nvme_nvm_set_bb_tbl, .submit_io = nvme_nvm_submit_io, - .erase_block = nvme_nvm_erase_block, .create_dma_pool = nvme_nvm_create_dma_pool, .destroy_dma_pool = nvme_nvm_destroy_dma_pool, diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index ca45e4a088a9..e11163f9b3b7 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -56,7 +56,6 @@ typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); -typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *); typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); typedef void (nvm_destroy_dma_pool_fn)(void *); typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, @@ -70,7 +69,6 @@ struct nvm_dev_ops { nvm_op_set_bb_fn *set_bb_tbl; nvm_submit_io_fn *submit_io; - nvm_erase_blk_fn *erase_block; nvm_create_dma_pool_fn *create_dma_pool; nvm_destroy_dma_pool_fn *destroy_dma_pool; @@ -479,10 +477,10 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, int, int); extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq 
*); -extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, +extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); +extern int nvm_set_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *, const struct ppa_addr *, int, int); -extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); -extern int nvm_erase_blk(struct nvm_tgt_dev *, struct ppa_addr *, int); +extern void nvm_free_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); -- cgit v1.2.3 From a7737f39c70d9c63ba530d6316724d7be67de541 Mon Sep 17 00:00:00 2001 From: Javier González Date: Sat, 15 Apr 2017 20:55:38 +0200 Subject: lightnvm: rename scrambler controller hint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the OCSSD 1.2 specification, the 0x200 hint enables the media scrambler for the read/write opcode, providing that the controller has been correctly configured by the firmware. Rename the macro to represent this meaning. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index e11163f9b3b7..eff7d1f312a8 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -123,7 +123,7 @@ enum { /* NAND Access Modes */ NVM_IO_SUSPEND = 0x80, NVM_IO_SLC_MODE = 0x100, - NVM_IO_SCRAMBLE_DISABLE = 0x200, + NVM_IO_SCRAMBLE_ENABLE = 0x200, /* Block Types */ NVM_BLK_T_FREE = 0x0, -- cgit v1.2.3 From 4af3f75d7992dd0dc49da95fbc039fa3806fba4f Mon Sep 17 00:00:00 2001 From: Javier González Date: Sat, 15 Apr 2017 20:55:45 +0200 Subject: lightnvm: allow to init targets on factory mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Target initialization has two responsibilities: creating the target partition and instantiating the target. This patch enables to create a factory partition (e.g., do not trigger recovery on the given target). This is useful for target development and for being able to restore the device state at any moment in time without requiring a full-device erase. 
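A hedged userspace sketch of requesting factory mode through the create ioctl — the device, target type, and target name are examples, and the remaining config fields are left at illustrative zeroed defaults:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/lightnvm.h>

int create_factory_target(void)
{
	struct nvm_ioctl_create c;
	int fd, ret;

	fd = open("/dev/lightnvm/control", O_RDWR);
	if (fd < 0)
		return -1;

	memset(&c, 0, sizeof(c));
	strncpy(c.dev, "nvme0n1", DISK_NAME_LEN - 1);
	strncpy(c.tgttype, "rrpc", NVM_TTYPE_NAME_MAX - 1);
	strncpy(c.tgtname, "tgt0", DISK_NAME_LEN - 1);
	c.flags = NVM_TARGET_FACTORY;	/* skip recovery on target init */

	ret = ioctl(fd, NVM_DEV_CREATE, &c);
	close(fd);
	return ret;
}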
Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 14 +++++++++++--- drivers/lightnvm/rrpc.c | 3 ++- include/linux/lightnvm.h | 3 ++- include/uapi/linux/lightnvm.h | 4 ++++ 4 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 5f84d2a418f6..a63b563b1a8a 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -280,7 +280,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) tdisk->fops = &nvm_fops; tdisk->queue = tqueue; - targetdata = tt->init(tgt_dev, tdisk); + targetdata = tt->init(tgt_dev, tdisk, create->flags); if (IS_ERR(targetdata)) goto err_init; @@ -1244,8 +1244,16 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg) create.tgtname[DISK_NAME_LEN - 1] = '\0'; if (create.flags != 0) { - pr_err("nvm: no flags supported\n"); - return -EINVAL; + __u32 flags = create.flags; + + /* Check for valid flags */ + if (flags & NVM_TARGET_FACTORY) + flags &= ~NVM_TARGET_FACTORY; + + if (flags) { + pr_err("nvm: flag not supported\n"); + return -EINVAL; + } } return __nvm_configure_create(&create); diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index a8acf9e06401..5dba54470c2a 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -1506,7 +1506,8 @@ err: static struct nvm_tgt_type tt_rrpc; -static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk) +static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, + int flags) { struct request_queue *bqueue = dev->q; struct request_queue *tqueue = tdisk->queue; diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index eff7d1f312a8..7dfa56ebbc6d 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -436,7 +436,8 @@ static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2) typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); -typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *); +typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, + int flags); typedef void (nvm_tgt_exit_fn)(void *); typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *); typedef void (nvm_tgt_sysfs_exit_fn)(struct gendisk *); diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h index fd19f36b3129..c8aec4b9e73b 100644 --- a/include/uapi/linux/lightnvm.h +++ b/include/uapi/linux/lightnvm.h @@ -85,6 +85,10 @@ struct nvm_ioctl_create_conf { }; }; +enum { + NVM_TARGET_FACTORY = 1 << 0, /* Init target in factory mode */ +}; + struct nvm_ioctl_create { char dev[DISK_NAME_LEN]; /* open-channel SSD device */ char tgttype[NVM_TTYPE_NAME_MAX]; /* target type name */ -- cgit v1.2.3 From 605f8fc2244236f8d6bf15bcc0586644af3a32e7 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 5 Apr 2017 00:03:33 +0200 Subject: i2c: core: Add new i2c_acpi_new_device helper function By default the i2c subsys creates an i2c-client for the first I2cSerialBus resource of an acpi_device, but some acpi_devices have multiple I2cSerialBus resources and we may want to instantiate i2c-clients for the others. This commit adds a new i2c_acpi_new_device function which can be used to create an i2c-client for any I2cSerialBus resource of an acpi_device. Note that the other resources may even be on a different i2c bus, so just retrieving the client address is not enough. 
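In driver code the helper can then be used along these lines — a hypothetical probe, with the resource index and board-info type chosen purely for illustration:

#include <linux/i2c.h>
#include <linux/module.h>

static int example_probe(struct i2c_client *client,
			 const struct i2c_device_id *id)
{
	struct i2c_board_info board_info = {};
	struct i2c_client *second;

	strlcpy(board_info.type, "example-dev", I2C_NAME_SIZE);

	/* instantiate a client for the second (index 1) I2cSerialBus resource */
	second = i2c_acpi_new_device(&client->dev, 1, &board_info);
	if (!second)
		return -EPROBE_DEFER;	/* its adapter may not be bound yet */

	i2c_set_clientdata(client, second);
	return 0;
}

static int example_remove(struct i2c_client *client)
{
	i2c_unregister_device(i2c_get_clientdata(client));
	return 0;
}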
Here is an example DSDT excerpt from such a device: Device (WIDR) { Name (_HID, "INT33FE" /* XPOWER Battery Device */) Name (_CID, "INT33FE" /* XPOWER Battery Device */) Name (_DDN, "WC PMIC Battery Device") Name (RBUF, ResourceTemplate () { I2cSerialBusV2 (0x005E, ControllerInitiated, 0x000186A0, AddressingMode7Bit, "\\_SB.PCI0.I2C7", 0x00, ResourceConsumer, , Exclusive, ) I2cSerialBusV2 (0x0036, ControllerInitiated, 0x000186A0, AddressingMode7Bit, "\\_SB.PCI0.I2C1", 0x00, ResourceConsumer, , Exclusive, ) I2cSerialBusV2 (0x0022, ControllerInitiated, 0x00061A80, AddressingMode7Bit, "\\_SB.PCI0.I2C1", 0x00, ResourceConsumer, , Exclusive, ) I2cSerialBusV2 (0x0054, ControllerInitiated, 0x00061A80, AddressingMode7Bit, "\\_SB.PCI0.I2C1", 0x00, ResourceConsumer, , Exclusive, ) GpioInt (Level, ActiveLow, Exclusive, PullNone, 0x0000, "\\_SB.PCI0.I2C7.PMI5", 0x00, ResourceConsumer, , ) { // Pin list 0x0012 } GpioInt (Edge, ActiveLow, ExclusiveAndWake, PullNone, 0x0000, "\\_SB.GPO1", 0x00, ResourceConsumer, , ) { // Pin list 0x0005 } GpioInt (Level, ActiveLow, Exclusive, PullNone, 0x0000, "\\_SB.PCI0.I2C7.PMI5", 0x00, ResourceConsumer, , ) { // Pin list 0x0013 } }) Method (_CRS, 0, NotSerialized) // _CRS: Current Resource Settings { Return (RBUF) /* \_SB_.PCI0.I2C7.WIDR.RBUF */ } } Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/i2c.h | 7 +++++++ 2 files changed, 56 insertions(+) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index f7faa991a44c..00c4cef716f7 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -421,6 +421,55 @@ static int i2c_acpi_notify(struct notifier_block *nb, unsigned long value, static struct notifier_block i2c_acpi_notifier = { .notifier_call = i2c_acpi_notify, }; + +/** + * i2c_acpi_new_device - Create i2c-client for the Nth I2cSerialBus resource + * @dev: Device owning the ACPI resources to get the client from + * @index: Index of ACPI resource to get + * @info: describes the I2C device; note this is modified (addr gets set) + * Context: can sleep + * + * By default the i2c subsys creates an i2c-client for the first I2cSerialBus + * resource of an acpi_device, but some acpi_devices have multiple I2cSerialBus + * resources, in that case this function can be used to create an i2c-client + * for other I2cSerialBus resources in the Current Resource Settings table. + * + * Also see i2c_new_device, which this function calls to create the i2c-client. + * + * Returns a pointer to the new i2c-client, or NULL if the adapter is not found. 
+ */ +struct i2c_client *i2c_acpi_new_device(struct device *dev, int index, + struct i2c_board_info *info) +{ + struct i2c_acpi_lookup lookup; + struct i2c_adapter *adapter; + struct acpi_device *adev; + LIST_HEAD(resource_list); + int ret; + + adev = ACPI_COMPANION(dev); + if (!adev) + return NULL; + + memset(&lookup, 0, sizeof(lookup)); + lookup.info = info; + lookup.device_handle = acpi_device_handle(adev); + lookup.index = index; + + ret = acpi_dev_get_resources(adev, &resource_list, + i2c_acpi_fill_info, &lookup); + acpi_dev_free_resource_list(&resource_list); + + if (ret < 0 || !info->addr) + return NULL; + + adapter = i2c_acpi_find_adapter_by_handle(lookup.adapter_handle); + if (!adapter) + return NULL; + + return i2c_new_device(adapter, info); +} +EXPORT_SYMBOL_GPL(i2c_acpi_new_device); #else /* CONFIG_ACPI */ static inline void i2c_acpi_register_devices(struct i2c_adapter *adap) { } extern struct notifier_block i2c_acpi_notifier; diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 6b183521c616..53fa50fc63fb 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -824,11 +824,18 @@ static inline const struct of_device_id #if IS_ENABLED(CONFIG_ACPI) u32 i2c_acpi_find_bus_speed(struct device *dev); +struct i2c_client *i2c_acpi_new_device(struct device *dev, int index, + struct i2c_board_info *info); #else static inline u32 i2c_acpi_find_bus_speed(struct device *dev) { return 0; } +static inline struct i2c_client *i2c_acpi_new_device(struct device *dev, + int index, struct i2c_board_info *info) +{ + return NULL; +} #endif /* CONFIG_ACPI */ #endif /* _LINUX_I2C_H */ -- cgit v1.2.3 From d1d84bb95364ed604015c2b788caaf3dbca0262f Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 5 Apr 2017 00:03:34 +0200 Subject: i2c: core: Allow drivers to disable i2c-core irq mapping By default the i2c-core will try to get an irq with index 0 on ACPI / of instantiated devices. This is troublesome on some ACPI systems where the irq info at index 0 in the CRS table may contain nonsense and/or point to an irqchip for which there is no Linux driver. If this happens then before this commit the driver's probe method would never get called because i2c_device_probe will try to get an irq by calling acpi_dev_gpio_irq_get which will always return -EPROBE in this case, as it waits for a matching irqchip driver to load. Thus causing the driver to not get a chance to bind. This commit adds a new disable_i2c_core_irq_mapping flag to struct i2c_driver which a driver can set to tell the core to skip irq mapping. Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core.c | 6 +++--- include/linux/i2c.h | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 00c4cef716f7..7a065c4260f3 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -985,7 +985,9 @@ static int i2c_device_probe(struct device *dev) if (!client) return 0; - if (!client->irq) { + driver = to_i2c_driver(dev->driver); + + if (!client->irq && !driver->disable_i2c_core_irq_mapping) { int irq = -ENOENT; if (client->flags & I2C_CLIENT_HOST_NOTIFY) { @@ -1007,8 +1009,6 @@ static int i2c_device_probe(struct device *dev) client->irq = irq; } - driver = to_i2c_driver(dev->driver); - /* * An I2C ID table is not mandatory, if and only if, a suitable Device * Tree match table entry is supplied for the probing device. 
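/*
 * Hypothetical sketch of a driver setting the new flag to opt out of the
 * core's irq mapping; such a driver would resolve its own interrupt
 * (e.g. from a later GpioInt resource) during probe. The driver name and
 * probe callback are illustrative, not from this patch.
 */
static struct i2c_driver example_driver = {
	.driver = {
		.name = "example",
	},
	.probe = example_probe,
	.disable_i2c_core_irq_mapping = true,
};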
diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 53fa50fc63fb..3a57e3dc9bec 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -149,6 +149,7 @@ enum i2c_alert_protocol { * @detect: Callback for device detection * @address_list: The I2C addresses to probe (for detect) * @clients: List of detected clients we created (for i2c-core use only) + * @disable_i2c_core_irq_mapping: Tell the i2c-core to not do irq-mapping * * The driver.owner field should be set to the module owner of this driver. * The driver.name field should be set to the name of this driver. @@ -212,6 +213,8 @@ struct i2c_driver { int (*detect)(struct i2c_client *, struct i2c_board_info *); const unsigned short *address_list; struct list_head clients; + + bool disable_i2c_core_irq_mapping; }; #define to_i2c_driver(d) container_of(d, struct i2c_driver, driver) -- cgit v1.2.3 From b54807fa52ae21bdf6bad72b0f00fd400af412eb Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 24 Mar 2017 08:38:20 -0500 Subject: sysctl: Remove dead register_sysctl_root The function no longer does anything. The is only a single caller of register_sysctl_root when semantically there should be two. Remove this function so that if someone decides this functionality is needed again it will be obvious all of the callers of setup_sysctl_set need to be audited and modified appropriately. Signed-off-by: "Eric W. Biederman" --- fs/proc/proc_sysctl.c | 4 ---- include/linux/sysctl.h | 1 - net/sysctl_net.c | 1 - 3 files changed, 6 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 8f91ec66baa3..35eed95b26d5 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -408,10 +408,6 @@ static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentr *pentry = entry; } -void register_sysctl_root(struct ctl_table_root *root) -{ -} - /* * sysctl_perm does NOT grant the superuser all rights automatically, because * some sysctl variables are readonly even to root. diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index b7e82049fec7..80d07816def0 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -180,7 +180,6 @@ extern void setup_sysctl_set(struct ctl_table_set *p, int (*is_seen)(struct ctl_table_set *)); extern void retire_sysctl_set(struct ctl_table_set *set); -void register_sysctl_root(struct ctl_table_root *root); struct ctl_table_header *__register_sysctl_table( struct ctl_table_set *set, const char *path, struct ctl_table *table); diff --git a/net/sysctl_net.c b/net/sysctl_net.c index 919981324171..9aed6fe1bf1a 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -106,7 +106,6 @@ __init int net_sysctl_init(void) ret = register_pernet_subsys(&sysctl_pernet_ops); if (ret) goto out1; - register_sysctl_root(&net_sysctl_root); out: return ret; out1: -- cgit v1.2.3 From 500a3d0ded5ee41072d0f084bff938747ee0c125 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Thu, 13 Apr 2017 06:36:51 +0300 Subject: net/mlx5: Add IPoIB enhanced offloads bits to mlx5_ifc New capability bit: ipoib_enhanced_offloads, indicates new ability for UD QP to do RSS and enhanced IPoIB offloads and acceleration. Add underlay_qpn to the TIS and flow_table objects In order to support SET_ROOT command, to connect between IPoIB QPs and flow steering tables. Signed-off-by: Erez Shitrit Signed-off-by: Saeed Mahameed Signed-off-by: David S. 
Miller --- include/linux/mlx5/mlx5_ifc.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1993adbd2c82..7c50bd39b297 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -872,7 +872,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 compact_address_vector[0x1]; u8 striding_rq[0x1]; - u8 reserved_at_202[0x2]; + u8 reserved_at_202[0x1]; + u8 ipoib_enhanced_offloads[0x1]; u8 ipoib_basic_offloads[0x1]; u8 reserved_at_205[0xa]; u8 drain_sigerr[0x1]; @@ -2293,7 +2294,9 @@ struct mlx5_ifc_tisc_bits { u8 reserved_at_120[0x8]; u8 transport_domain[0x18]; - u8 reserved_at_140[0x3c0]; + u8 reserved_at_140[0x8]; + u8 underlay_qpn[0x18]; + u8 reserved_at_160[0x3a0]; }; enum { @@ -8218,7 +8221,9 @@ struct mlx5_ifc_set_flow_table_root_in_bits { u8 reserved_at_a0[0x8]; u8 table_id[0x18]; - u8 reserved_at_c0[0x140]; + u8 reserved_at_c0[0x8]; + u8 underlay_qpn[0x18]; + u8 reserved_at_e0[0x120]; }; enum { -- cgit v1.2.3 From b3ba51498bddd72a526d9067b8b0ecf4932ce57e Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Thu, 13 Apr 2017 06:36:52 +0300 Subject: net/mlx5: Refactor create flow table method to accept underlay QP IB flow tables need the underlay qp to perform flow steering. Here we change the API of the flow tables creation to accept the underlay QP number as a parameter in order to support IB (IPoIB) flow steering. Signed-off-by: Erez Shitrit Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c | 10 ++- drivers/net/ethernet/mellanox/mlx5/core/en_fs.c | 25 +++++-- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 5 +- .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 16 +++-- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 8 +++ drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 84 +++++++++++++--------- drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 1 + include/linux/mlx5/fs.h | 14 ++-- 8 files changed, 113 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c index c4e9cc79f5c7..c8a005326e30 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c @@ -321,10 +321,16 @@ static int arfs_create_table(struct mlx5e_priv *priv, { struct mlx5e_arfs_tables *arfs = &priv->fs.arfs; struct mlx5e_flow_table *ft = &arfs->arfs_tables[type].ft; + struct mlx5_flow_table_attr ft_attr = {}; int err; - ft->t = mlx5_create_flow_table(priv->fs.ns, MLX5E_NIC_PRIO, - MLX5E_ARFS_TABLE_SIZE, MLX5E_ARFS_FT_LEVEL, 0); + ft->num_groups = 0; + + ft_attr.max_fte = MLX5E_ARFS_TABLE_SIZE; + ft_attr.level = MLX5E_ARFS_FT_LEVEL; + ft_attr.prio = MLX5E_NIC_PRIO; + + ft->t = mlx5_create_flow_table(priv->fs.ns, &ft_attr); if (IS_ERR(ft->t)) { err = PTR_ERR(ft->t); ft->t = NULL; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c index 5376d69a6b1a..729904c43801 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -803,11 +803,15 @@ static void mlx5e_destroy_ttc_table(struct mlx5e_priv *priv) static int mlx5e_create_ttc_table(struct mlx5e_priv *priv) { struct mlx5e_ttc_table *ttc = &priv->fs.ttc; + struct mlx5_flow_table_attr ft_attr = {}; struct mlx5e_flow_table *ft = &ttc->ft; int err; - ft->t = 
mlx5_create_flow_table(priv->fs.ns, MLX5E_NIC_PRIO, - MLX5E_TTC_TABLE_SIZE, MLX5E_TTC_FT_LEVEL, 0); + ft_attr.max_fte = MLX5E_TTC_TABLE_SIZE; + ft_attr.level = MLX5E_TTC_FT_LEVEL; + ft_attr.prio = MLX5E_NIC_PRIO; + + ft->t = mlx5_create_flow_table(priv->fs.ns, &ft_attr); if (IS_ERR(ft->t)) { err = PTR_ERR(ft->t); ft->t = NULL; @@ -973,12 +977,16 @@ static int mlx5e_create_l2_table(struct mlx5e_priv *priv) { struct mlx5e_l2_table *l2_table = &priv->fs.l2; struct mlx5e_flow_table *ft = &l2_table->ft; + struct mlx5_flow_table_attr ft_attr = {}; int err; ft->num_groups = 0; - ft->t = mlx5_create_flow_table(priv->fs.ns, MLX5E_NIC_PRIO, - MLX5E_L2_TABLE_SIZE, MLX5E_L2_FT_LEVEL, 0); + ft_attr.max_fte = MLX5E_L2_TABLE_SIZE; + ft_attr.level = MLX5E_L2_FT_LEVEL; + ft_attr.prio = MLX5E_NIC_PRIO; + + ft->t = mlx5_create_flow_table(priv->fs.ns, &ft_attr); if (IS_ERR(ft->t)) { err = PTR_ERR(ft->t); ft->t = NULL; @@ -1076,11 +1084,16 @@ static int mlx5e_create_vlan_table_groups(struct mlx5e_flow_table *ft) static int mlx5e_create_vlan_table(struct mlx5e_priv *priv) { struct mlx5e_flow_table *ft = &priv->fs.vlan.ft; + struct mlx5_flow_table_attr ft_attr = {}; int err; ft->num_groups = 0; - ft->t = mlx5_create_flow_table(priv->fs.ns, MLX5E_NIC_PRIO, - MLX5E_VLAN_TABLE_SIZE, MLX5E_VLAN_FT_LEVEL, 0); + + ft_attr.max_fte = MLX5E_VLAN_TABLE_SIZE; + ft_attr.level = MLX5E_VLAN_FT_LEVEL; + ft_attr.prio = MLX5E_NIC_PRIO; + + ft->t = mlx5_create_flow_table(priv->fs.ns, &ft_attr); if (IS_ERR(ft->t)) { err = PTR_ERR(ft->t); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index fcd5bc7e31db..b3281d1118b3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -337,6 +337,7 @@ esw_fdb_set_vport_promisc_rule(struct mlx5_eswitch *esw, u32 vport) static int esw_create_legacy_fdb_table(struct mlx5_eswitch *esw, int nvports) { int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_core_dev *dev = esw->dev; struct mlx5_flow_namespace *root_ns; struct mlx5_flow_table *fdb; @@ -362,7 +363,9 @@ static int esw_create_legacy_fdb_table(struct mlx5_eswitch *esw, int nvports) memset(flow_group_in, 0, inlen); table_size = BIT(MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size)); - fdb = mlx5_create_flow_table(root_ns, 0, table_size, 0, 0); + + ft_attr.max_fte = table_size; + fdb = mlx5_create_flow_table(root_ns, &ft_attr); if (IS_ERR(fdb)) { err = PTR_ERR(fdb); esw_warn(dev, "Failed to create FDB Table err %d\n", err); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index fff962dac8e3..992b380d36be 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -432,6 +432,7 @@ out: static int esw_create_offloads_fdb_table(struct mlx5_eswitch *esw, int nvports) { int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_table_attr ft_attr = {}; int table_size, ix, esw_size, err = 0; struct mlx5_core_dev *dev = esw->dev; struct mlx5_flow_namespace *root_ns; @@ -475,7 +476,11 @@ static int esw_create_offloads_fdb_table(struct mlx5_eswitch *esw, int nvports) esw->fdb_table.fdb = fdb; table_size = nvports + MAX_PF_SQ + 1; - fdb = mlx5_create_flow_table(root_ns, FDB_SLOW_PATH, table_size, 0, 0); + + ft_attr.max_fte = table_size; + ft_attr.prio = FDB_SLOW_PATH; + + fdb = 
mlx5_create_flow_table(root_ns, &ft_attr); if (IS_ERR(fdb)) { err = PTR_ERR(fdb); esw_warn(dev, "Failed to create slow path FDB Table err %d\n", err); @@ -556,9 +561,10 @@ static void esw_destroy_offloads_fdb_table(struct mlx5_eswitch *esw) static int esw_create_offloads_table(struct mlx5_eswitch *esw) { - struct mlx5_flow_namespace *ns; - struct mlx5_flow_table *ft_offloads; + struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_core_dev *dev = esw->dev; + struct mlx5_flow_table *ft_offloads; + struct mlx5_flow_namespace *ns; int err = 0; ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_OFFLOADS); @@ -567,7 +573,9 @@ static int esw_create_offloads_table(struct mlx5_eswitch *esw) return -EOPNOTSUPP; } - ft_offloads = mlx5_create_flow_table(ns, 0, dev->priv.sriov.num_vfs + 2, 0, 0); + ft_attr.max_fte = dev->priv.sriov.num_vfs + 2; + + ft_offloads = mlx5_create_flow_table(ns, &ft_attr); if (IS_ERR(ft_offloads)) { err = PTR_ERR(ft_offloads); esw_warn(esw->dev, "Failed to create offloads table, err %d\n", err); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index c6178ea1a461..19e3d2fc2099 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -45,6 +45,10 @@ int mlx5_cmd_update_root_ft(struct mlx5_core_dev *dev, u32 in[MLX5_ST_SZ_DW(set_flow_table_root_in)] = {0}; u32 out[MLX5_ST_SZ_DW(set_flow_table_root_out)] = {0}; + if ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) && + ft->underlay_qpn == 0) + return 0; + MLX5_SET(set_flow_table_root_in, in, opcode, MLX5_CMD_OP_SET_FLOW_TABLE_ROOT); MLX5_SET(set_flow_table_root_in, in, table_type, ft->type); @@ -54,6 +58,10 @@ int mlx5_cmd_update_root_ft(struct mlx5_core_dev *dev, MLX5_SET(set_flow_table_root_in, in, other_vport, 1); } + if ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) && + ft->underlay_qpn != 0) + MLX5_SET(set_flow_table_root_in, in, underlay_qpn, ft->underlay_qpn); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 27ff815600f7..55182d0b06e8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -778,18 +778,16 @@ static void list_add_flow_table(struct mlx5_flow_table *ft, } static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespace *ns, + struct mlx5_flow_table_attr *ft_attr, enum fs_flow_table_op_mod op_mod, - u16 vport, int prio, - int max_fte, u32 level, - u32 flags) + u16 vport) { + struct mlx5_flow_root_namespace *root = find_root(&ns->node); struct mlx5_flow_table *next_ft = NULL; + struct fs_prio *fs_prio = NULL; struct mlx5_flow_table *ft; - int err; int log_table_sz; - struct mlx5_flow_root_namespace *root = - find_root(&ns->node); - struct fs_prio *fs_prio = NULL; + int err; if (!root) { pr_err("mlx5: flow steering failed to find root of namespace\n"); @@ -797,29 +795,31 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa } mutex_lock(&root->chain_lock); - fs_prio = find_prio(ns, prio); + fs_prio = find_prio(ns, ft_attr->prio); if (!fs_prio) { err = -EINVAL; goto unlock_root; } - if (level >= fs_prio->num_levels) { + if (ft_attr->level >= fs_prio->num_levels) { err = -ENOSPC; goto unlock_root; } /* The level is related to the * priority level range. 
*/ - level += fs_prio->start_level; - ft = alloc_flow_table(level, + ft_attr->level += fs_prio->start_level; + ft = alloc_flow_table(ft_attr->level, vport, - max_fte ? roundup_pow_of_two(max_fte) : 0, + ft_attr->max_fte ? roundup_pow_of_two(ft_attr->max_fte) : 0, root->table_type, - op_mod, flags); + op_mod, ft_attr->flags); if (!ft) { err = -ENOMEM; goto unlock_root; } + ft->underlay_qpn = ft_attr->underlay_qpn; + tree_init_node(&ft->node, 1, del_flow_table); log_table_sz = ft->max_fte ? ilog2(ft->max_fte) : 0; next_ft = find_next_chained_ft(fs_prio); @@ -849,44 +849,56 @@ unlock_root: } struct mlx5_flow_table *mlx5_create_flow_table(struct mlx5_flow_namespace *ns, - int prio, int max_fte, - u32 level, - u32 flags) + struct mlx5_flow_table_attr *ft_attr) { - return __mlx5_create_flow_table(ns, FS_FT_OP_MOD_NORMAL, 0, prio, - max_fte, level, flags); + return __mlx5_create_flow_table(ns, ft_attr, FS_FT_OP_MOD_NORMAL, 0); } struct mlx5_flow_table *mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns, int prio, int max_fte, u32 level, u16 vport) { - return __mlx5_create_flow_table(ns, FS_FT_OP_MOD_NORMAL, vport, prio, - max_fte, level, 0); + struct mlx5_flow_table_attr ft_attr = {}; + + ft_attr.max_fte = max_fte; + ft_attr.level = level; + ft_attr.prio = prio; + + return __mlx5_create_flow_table(ns, &ft_attr, FS_FT_OP_MOD_NORMAL, 0); } -struct mlx5_flow_table *mlx5_create_lag_demux_flow_table( - struct mlx5_flow_namespace *ns, - int prio, u32 level) +struct mlx5_flow_table* +mlx5_create_lag_demux_flow_table(struct mlx5_flow_namespace *ns, + int prio, u32 level) { - return __mlx5_create_flow_table(ns, FS_FT_OP_MOD_LAG_DEMUX, 0, prio, 0, - level, 0); + struct mlx5_flow_table_attr ft_attr = {}; + + ft_attr.level = level; + ft_attr.prio = prio; + return __mlx5_create_flow_table(ns, &ft_attr, FS_FT_OP_MOD_LAG_DEMUX, 0); } EXPORT_SYMBOL(mlx5_create_lag_demux_flow_table); -struct mlx5_flow_table *mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns, - int prio, - int num_flow_table_entries, - int max_num_groups, - u32 level, - u32 flags) +struct mlx5_flow_table* +mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns, + int prio, + int num_flow_table_entries, + int max_num_groups, + u32 level, + u32 flags) { + struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_table *ft; if (max_num_groups > num_flow_table_entries) return ERR_PTR(-EINVAL); - ft = mlx5_create_flow_table(ns, prio, num_flow_table_entries, level, flags); + ft_attr.max_fte = num_flow_table_entries; + ft_attr.prio = prio; + ft_attr.level = level; + ft_attr.flags = flags; + + ft = mlx5_create_flow_table(ns, &ft_attr); if (IS_ERR(ft)) return ft; @@ -1828,12 +1840,18 @@ static void set_prio_attrs(struct mlx5_flow_root_namespace *root_ns) static int create_anchor_flow_table(struct mlx5_flow_steering *steering) { struct mlx5_flow_namespace *ns = NULL; + struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_table *ft; ns = mlx5_get_flow_namespace(steering->dev, MLX5_FLOW_NAMESPACE_ANCHOR); if (WARN_ON(!ns)) return -EINVAL; - ft = mlx5_create_flow_table(ns, ANCHOR_PRIO, ANCHOR_SIZE, ANCHOR_LEVEL, 0); + + ft_attr.max_fte = ANCHOR_SIZE; + ft_attr.level = ANCHOR_LEVEL; + ft_attr.prio = ANCHOR_PRIO; + + ft = mlx5_create_flow_table(ns, &ft_attr); if (IS_ERR(ft)) { mlx5_core_err(steering->dev, "Failed to create last anchor flow table"); return PTR_ERR(ft); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 03af2e7989f3..577d056bf3df 
100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -118,6 +118,7 @@ struct mlx5_flow_table { /* FWD rules that point on this flow table */ struct list_head fwd_rules; u32 flags; + u32 underlay_qpn; }; struct mlx5_fc_cache { diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index ae91a4bda1a3..1b166d2e19c5 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -104,12 +104,18 @@ mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns, u32 level, u32 flags); +struct mlx5_flow_table_attr { + int prio; + int max_fte; + u32 level; + u32 flags; + u32 underlay_qpn; +}; + struct mlx5_flow_table * mlx5_create_flow_table(struct mlx5_flow_namespace *ns, - int prio, - int num_flow_table_entries, - u32 level, - u32 flags); + struct mlx5_flow_table_attr *ft_attr); + struct mlx5_flow_table * mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns, int prio, -- cgit v1.2.3 From 258545449b7b410727b516b782256f8a3bde8bf2 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 13 Apr 2017 06:37:02 +0300 Subject: net/mlx5e: IPoIB, Xmit flow Implement mlx5e's IPoIB SKB transmit using the helper functions provided by the mlx5e ethernet tx flow; the only difference in the code between mlx5e_xmit and mlx5i_xmit is that IPoIB has some extra fields to fill (the UD datagram segment) in the TX descriptor (WQE), and it does not need any vlan handling. Signed-off-by: Saeed Mahameed Reviewed-by: Erez Shitrit Signed-off-by: David S. Miller --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 10 --- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 87 +++++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/ipoib.c | 10 +++ drivers/net/ethernet/mellanox/mlx5/core/ipoib.h | 3 + include/linux/mlx5/qp.h | 10 +++ 5 files changed, 110 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 3cd064b5f0bf..ce8ba617d46e 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -729,16 +729,6 @@ static inline struct mlx5_ib_mw *to_mmw(struct ib_mw *ibmw) return container_of(ibmw, struct mlx5_ib_mw, ibmw); } -struct mlx5_ib_ah { - struct ib_ah ibah; - struct mlx5_av av; -}; - -static inline struct mlx5_ib_ah *to_mah(struct ib_ah *ibah) -{ - return container_of(ibah, struct mlx5_ib_ah, ibah); -} - int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, struct mlx5_db *db); void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index ba664a1126cf..dda7db503043 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -503,3 +503,90 @@ void mlx5e_free_txqsq_descs(struct mlx5e_txqsq *sq) sq->cc += wi->num_wqebbs; } } + +#ifdef CONFIG_MLX5_CORE_IPOIB + +struct mlx5_wqe_eth_pad { + u8 rsvd0[16]; +}; + +struct mlx5i_tx_wqe { + struct mlx5_wqe_ctrl_seg ctrl; + struct mlx5_wqe_datagram_seg datagram; + struct mlx5_wqe_eth_pad pad; + struct mlx5_wqe_eth_seg eth; +}; + +static inline void +mlx5i_txwqe_build_datagram(struct mlx5_av *av, u32 dqpn, u32 dqkey, + struct mlx5_wqe_datagram_seg *dseg) +{ + memcpy(&dseg->av, av, sizeof(struct mlx5_av)); + dseg->av.dqp_dct = cpu_to_be32(dqpn | MLX5_EXTENDED_UD_AV); + dseg->av.key.qkey.qkey = cpu_to_be32(dqkey); +} + +netdev_tx_t
mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5_av *av, u32 dqpn, u32 dqkey) +{ + struct mlx5_wq_cyc *wq = &sq->wq; + u16 pi = sq->pc & wq->sz_m1; + struct mlx5i_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(wq, pi); + struct mlx5e_tx_wqe_info *wi = &sq->db.wqe_info[pi]; + + struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; + struct mlx5_wqe_datagram_seg *datagram = &wqe->datagram; + struct mlx5_wqe_eth_seg *eseg = &wqe->eth; + + unsigned char *skb_data = skb->data; + unsigned int skb_len = skb->len; + u8 opcode = MLX5_OPCODE_SEND; + unsigned int num_bytes; + int num_dma; + u16 headlen; + u16 ds_cnt; + u16 ihs; + + memset(wqe, 0, sizeof(*wqe)); + + mlx5i_txwqe_build_datagram(av, dqpn, dqkey, datagram); + + mlx5e_txwqe_build_eseg_csum(sq, skb, eseg); + + if (skb_is_gso(skb)) { + opcode = MLX5_OPCODE_LSO; + ihs = mlx5e_txwqe_build_eseg_gso(sq, skb, eseg, &num_bytes); + } else { + ihs = mlx5e_calc_min_inline(sq->min_inline_mode, skb); + num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN); + } + + ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS; + if (ihs) { + memcpy(eseg->inline_hdr.start, skb_data, ihs); + mlx5e_tx_skb_pull_inline(&skb_data, &skb_len, ihs); + eseg->inline_hdr.sz = cpu_to_be16(ihs); + ds_cnt += DIV_ROUND_UP(ihs - sizeof(eseg->inline_hdr.start), MLX5_SEND_WQE_DS); + } + + headlen = skb_len - skb->data_len; + num_dma = mlx5e_txwqe_build_dsegs(sq, skb, skb_data, headlen, + (struct mlx5_wqe_data_seg *)cseg + ds_cnt); + if (unlikely(num_dma < 0)) + goto dma_unmap_wqe_err; + + mlx5e_txwqe_complete(sq, skb, opcode, ds_cnt + num_dma, + num_bytes, num_dma, wi, cseg); + + return NETDEV_TX_OK; + +dma_unmap_wqe_err: + sq->stats.dropped++; + mlx5e_dma_unmap_wqe_err(sq, wi->num_dma); + + dev_kfree_skb_any(skb); + + return NETDEV_TX_OK; +} + +#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib.c index bd56f36066b3..c468aaedf0a6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib.c @@ -393,6 +393,16 @@ int mlx5i_detach_mcast(struct net_device *netdev, struct ib_device *hca, return err; } +int mlx5i_xmit(struct net_device *dev, struct sk_buff *skb, + struct ib_ah *address, u32 dqpn, u32 dqkey) +{ + struct mlx5e_priv *epriv = mlx5i_epriv(dev); + struct mlx5e_txqsq *sq = epriv->txq2sq[skb_get_queue_mapping(skb)]; + struct mlx5_ib_ah *mah = to_mah(address); + + return mlx5i_sq_xmit(sq, skb, &mah->av, dqpn, dqkey); +} + static int mlx5i_check_required_hca_cap(struct mlx5_core_dev *mdev) { if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_IB) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib.h b/drivers/net/ethernet/mellanox/mlx5/core/ipoib.h index 25b8b8a33e24..89bca182464c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib.h @@ -47,4 +47,7 @@ struct mlx5i_priv { /* Extract mlx5e_priv from IPoIB netdev */ #define mlx5i_epriv(netdev) ((void *)(((struct mlx5i_priv *)netdev_priv(netdev))->mlx5e_priv)) +netdev_tx_t mlx5i_sq_xmit(struct mlx5e_txqsq *sq, struct sk_buff *skb, + struct mlx5_av *av, u32 dqpn, u32 dqkey); + #endif /* __MLX5E_IPOB_H__ */ diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 3096370fe831..bef80d0a0e30 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -295,6 +295,16 @@ struct mlx5_av { u8 rgid[16]; }; +struct mlx5_ib_ah { + struct ib_ah ibah; + struct mlx5_av av; +}; + +static inline struct mlx5_ib_ah *to_mah(struct ib_ah *ibah) +{ + return 
container_of(ibah, struct mlx5_ib_ah, ibah); +} + struct mlx5_wqe_datagram_seg { struct mlx5_av av; }; -- cgit v1.2.3 From 1b72e7fd304639f1cd49d1e11955c4974936d88c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 11 Apr 2017 00:20:41 +0200 Subject: cpufreq: schedutil: Use policy-dependent transition delays Make the schedutil governor take the initial (default) value of the rate_limit_us sysfs attribute from the (new) transition_delay_us policy parameter (to be set by the scaling driver). That will allow scaling drivers to make schedutil use smaller default values of rate_limit_us and reduce the default average time interval between consecutive frequency changes. Make intel_pstate set transition_delay_us to 500. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/intel_pstate.c | 2 ++ include/linux/cpufreq.h | 7 +++++++ kernel/sched/cpufreq_schedutil.c | 15 ++++++++++----- 3 files changed, 19 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index c31b72b16c2b..b7de5bd76a31 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -41,6 +41,7 @@ #define INTEL_PSTATE_HWP_SAMPLING_INTERVAL (50 * NSEC_PER_MSEC) #define INTEL_CPUFREQ_TRANSITION_LATENCY 20000 +#define INTEL_CPUFREQ_TRANSITION_DELAY 500 #ifdef CONFIG_ACPI #include @@ -2237,6 +2238,7 @@ static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy) return ret; policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY; + policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY; /* This reflects the intel_pstate_get_cpu_pstates() setting. */ policy->cur = policy->cpuinfo.min_freq; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 87165f06a307..a5ce0bbeadb5 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -120,6 +120,13 @@ struct cpufreq_policy { bool fast_switch_possible; bool fast_switch_enabled; + /* + * Preferred average time interval between consecutive invocations of + * the driver to set the frequency for this policy. To be set by the + * scaling driver (0, which is the default, means no preference). + */ + unsigned int transition_delay_us; + /* Cached frequency lookup from cpufreq_driver_resolve_freq. 
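To make the effect concrete, here is the new rate_limit_us selection from sugov_init() (shown in the schedutil hunk below) with intel_pstate's value plugged in; a sketch, assuming LATENCY_MULTIPLIER keeps its historical value of 1000:

	/* With transition_delay_us = 500 (intel_pstate above):
	 *   rate_limit_us = 500 us
	 * Without it, for a 20000 ns transition latency:
	 *   rate_limit_us = LATENCY_MULTIPLIER * (20000 / NSEC_PER_USEC)
	 *                 = 1000 * 20 = 20000 us, i.e. 20 ms.
	 */
	if (policy->transition_delay_us)
		rate_limit_us = policy->transition_delay_us;
	else
		rate_limit_us = LATENCY_MULTIPLIER *
			(policy->cpuinfo.transition_latency / NSEC_PER_USEC);
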
*/ unsigned int cached_target_freq; int cached_resolved_idx; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index b1fedf9932d6..76877a62b5fa 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -494,7 +494,6 @@ static int sugov_init(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy; struct sugov_tunables *tunables; - unsigned int lat; int ret = 0; /* State should be equivalent to EXIT */ @@ -533,10 +532,16 @@ static int sugov_init(struct cpufreq_policy *policy) goto stop_kthread; } - tunables->rate_limit_us = LATENCY_MULTIPLIER; - lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; - if (lat) - tunables->rate_limit_us *= lat; + if (policy->transition_delay_us) { + tunables->rate_limit_us = policy->transition_delay_us; + } else { + unsigned int lat; + + tunables->rate_limit_us = LATENCY_MULTIPLIER; + lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; + if (lat) + tunables->rate_limit_us *= lat; + } policy->governor_data = sg_policy; sg_policy->tunables = tunables; -- cgit v1.2.3 From 2611dc1939569718c65ffd59c8fb9ba7474d026c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 8 Apr 2017 14:34:51 -0400 Subject: Remove compat_sys_getdents64() Unlike normal compat syscall variants, it is needed only for biarch architectures that have different alignment requirements for u64 in 32bit and 64bit ABI *and* have __put_user() that won't handle a store of a 64bit value at a 32bit-aligned address. We used to have one such (ia64), but its biarch support has been gone since 2010 (after being broken in 2008, which went unnoticed since nobody had been using it). It had escaped removal at the same time only because back in 2004 a patch that switched several syscalls on amd64 from private wrappers to generic compat ones had switched to use of compat_sys_getdents64(), which hadn't needed (or used) a compat wrapper on amd64. Let's bury it - it's at least 7 years overdue. Signed-off-by: Al Viro --- arch/arm64/include/asm/unistd.h | 1 - arch/arm64/include/asm/unistd32.h | 2 +- arch/x86/entry/syscalls/syscall_32.tbl | 2 +- arch/x86/include/asm/unistd.h | 1 - fs/compat.c | 91 ---------------------------------- include/linux/compat.h | 5 -- include/uapi/asm-generic/unistd.h | 3 +- 7 files changed, 3 insertions(+), 102 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index bdbeb06dc11e..a0baa9af5487 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -14,7 +14,6 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>.
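The alignment point above is easy to demonstrate from userspace; a hedged stand-alone sketch (not part of the patch) that prints 4 when built for i386 and 8 when built for x86-64:

	#include <stdio.h>
	#include <stddef.h>
	#include <stdint.h>

	/* u64 packs to 4-byte alignment inside i386 structs but 8-byte on
	 * x86-64, so a 64-bit kernel can be handed a 32-bit task's dirent
	 * buffer in which the 64-bit d_off field is only 4-byte aligned --
	 * hence the __put_user_unaligned() in the code removed below.
	 */
	struct probe {
		uint32_t pad;
		uint64_t val;
	};

	int main(void)
	{
		printf("offsetof(val) = %zu\n", offsetof(struct probe, val));
		return 0;
	}
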
*/ #ifdef CONFIG_COMPAT -#define __ARCH_WANT_COMPAT_SYS_GETDENTS64 #define __ARCH_WANT_COMPAT_STAT64 #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index c66b51aab195..ef292160748c 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -456,7 +456,7 @@ __SYSCALL(__NR_setfsuid32, sys_setfsuid) #define __NR_setfsgid32 216 __SYSCALL(__NR_setfsgid32, sys_setfsgid) #define __NR_getdents64 217 -__SYSCALL(__NR_getdents64, compat_sys_getdents64) +__SYSCALL(__NR_getdents64, sys_getdents64) #define __NR_pivot_root 218 __SYSCALL(__NR_pivot_root, sys_pivot_root) #define __NR_mincore 219 diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 9ba050fe47f3..b1a63f6f53c0 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -226,7 +226,7 @@ 217 i386 pivot_root sys_pivot_root 218 i386 mincore sys_mincore 219 i386 madvise sys_madvise -220 i386 getdents64 sys_getdents64 compat_sys_getdents64 +220 i386 getdents64 sys_getdents64 221 i386 fcntl64 sys_fcntl64 compat_sys_fcntl64 # 222 is unused # 223 is unused diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 32712a925f26..1ba1536f627e 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -23,7 +23,6 @@ # include # include # define __ARCH_WANT_COMPAT_SYS_TIME -# define __ARCH_WANT_COMPAT_SYS_GETDENTS64 # define __ARCH_WANT_COMPAT_SYS_PREADV64 # define __ARCH_WANT_COMPAT_SYS_PWRITEV64 # define __ARCH_WANT_COMPAT_SYS_PREADV64V2 diff --git a/fs/compat.c b/fs/compat.c index c61b506f5bc9..54e5855e291a 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -907,97 +907,6 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, return error; } -#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64 - -struct compat_getdents_callback64 { - struct dir_context ctx; - struct linux_dirent64 __user *current_dir; - struct linux_dirent64 __user *previous; - int count; - int error; -}; - -static int compat_filldir64(struct dir_context *ctx, const char *name, - int namlen, loff_t offset, u64 ino, - unsigned int d_type) -{ - struct linux_dirent64 __user *dirent; - struct compat_getdents_callback64 *buf = - container_of(ctx, struct compat_getdents_callback64, ctx); - int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, - sizeof(u64)); - u64 off; - - buf->error = -EINVAL; /* only used if we fail.. 
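Worked through with concrete numbers, the record-length computation above goes like this (a sketch using the usual linux_dirent64 layout, where d_name starts at offset 19):

	/* d_ino (8) + d_off (8) + d_reclen (2) + d_type (1) => d_name at 19.
	 * For a 5-character name ("hello", plus its NUL terminator):
	 *   reclen = ALIGN(19 + 5 + 1, sizeof(u64)) = ALIGN(25, 8) = 32
	 * so every record is padded out to a multiple of 8 bytes.
	 */
	int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + 5 + 1,
			   sizeof(u64));	/* -> 32 */
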
*/ - if (reclen > buf->count) - return -EINVAL; - dirent = buf->previous; - - if (dirent) { - if (signal_pending(current)) - return -EINTR; - if (__put_user_unaligned(offset, &dirent->d_off)) - goto efault; - } - dirent = buf->current_dir; - if (__put_user_unaligned(ino, &dirent->d_ino)) - goto efault; - off = 0; - if (__put_user_unaligned(off, &dirent->d_off)) - goto efault; - if (__put_user(reclen, &dirent->d_reclen)) - goto efault; - if (__put_user(d_type, &dirent->d_type)) - goto efault; - if (copy_to_user(dirent->d_name, name, namlen)) - goto efault; - if (__put_user(0, dirent->d_name + namlen)) - goto efault; - buf->previous = dirent; - dirent = (void __user *)dirent + reclen; - buf->current_dir = dirent; - buf->count -= reclen; - return 0; -efault: - buf->error = -EFAULT; - return -EFAULT; -} - -COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd, - struct linux_dirent64 __user *, dirent, unsigned int, count) -{ - struct fd f; - struct linux_dirent64 __user * lastdirent; - struct compat_getdents_callback64 buf = { - .ctx.actor = compat_filldir64, - .current_dir = dirent, - .count = count - }; - int error; - - if (!access_ok(VERIFY_WRITE, dirent, count)) - return -EFAULT; - - f = fdget_pos(fd); - if (!f.file) - return -EBADF; - - error = iterate_dir(f.file, &buf.ctx); - if (error >= 0) - error = buf.error; - lastdirent = buf.previous; - if (lastdirent) { - typeof(lastdirent->d_off) d_off = buf.ctx.pos; - if (__put_user_unaligned(d_off, &lastdirent->d_off)) - error = -EFAULT; - else - error = count - buf.count; - } - fdput_pos(f); - return error; -} -#endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */ - /* * Exactly like fs/open.c:sys_open(), except that it doesn't set the * O_LARGEFILE flag. diff --git a/include/linux/compat.h b/include/linux/compat.h index aef47be2a5c1..54d65eb3d1e7 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -528,11 +528,6 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd, asmlinkage long compat_sys_getdents(unsigned int fd, struct compat_linux_dirent __user *dirent, unsigned int count); -#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64 -asmlinkage long compat_sys_getdents64(unsigned int fd, - struct linux_dirent64 __user *dirent, - unsigned int count); -#endif asmlinkage long compat_sys_vmsplice(int fd, const struct compat_iovec __user *, unsigned int nr_segs, unsigned int flags); asmlinkage long compat_sys_open(const char __user *filename, int flags, diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index a076cf1a3a23..061185a5eb51 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -194,8 +194,7 @@ __SYSCALL(__NR_quotactl, sys_quotactl) /* fs/readdir.c */ #define __NR_getdents64 61 -#define __ARCH_WANT_COMPAT_SYS_GETDENTS64 -__SC_COMP(__NR_getdents64, sys_getdents64, compat_sys_getdents64) +__SYSCALL(__NR_getdents64, sys_getdents64) /* fs/read_write.c */ #define __NR3264_lseek 62 -- cgit v1.2.3 From e99ca56ce03dd90991025878152bae8b53484147 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 8 Apr 2017 16:50:24 -0400 Subject: move compat select-related syscalls to fs/select.c Signed-off-by: Al Viro --- fs/compat.c | 368 -------------------------------------------- fs/select.c | 421 ++++++++++++++++++++++++++++++++++++++++++++++++++- include/linux/poll.h | 56 ------- 3 files changed, 419 insertions(+), 426 deletions(-) (limited to 'include/linux') diff --git a/fs/compat.c b/fs/compat.c index 54e5855e291a..bc15c23fae55 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -43,7 +43,6 @@ 
#include #include #include -#include #include #include #include @@ -925,373 +924,6 @@ COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, fla return do_sys_open(dfd, filename, flags, mode); } -#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) - -static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, - int timeval, int ret) -{ - struct timespec ts; - - if (!p) - return ret; - - if (current->personality & STICKY_TIMEOUTS) - goto sticky; - - /* No update for zero timeout */ - if (!end_time->tv_sec && !end_time->tv_nsec) - return ret; - - ktime_get_ts(&ts); - ts = timespec_sub(*end_time, ts); - if (ts.tv_sec < 0) - ts.tv_sec = ts.tv_nsec = 0; - - if (timeval) { - struct compat_timeval rtv; - - rtv.tv_sec = ts.tv_sec; - rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC; - - if (!copy_to_user(p, &rtv, sizeof(rtv))) - return ret; - } else { - struct compat_timespec rts; - - rts.tv_sec = ts.tv_sec; - rts.tv_nsec = ts.tv_nsec; - - if (!copy_to_user(p, &rts, sizeof(rts))) - return ret; - } - /* - * If an application puts its timeval in read-only memory, we - * don't want the Linux-specific update to the timeval to - * cause a fault after the select has completed - * successfully. However, because we're not updating the - * timeval, we can't restart the system call. - */ - -sticky: - if (ret == -ERESTARTNOHAND) - ret = -EINTR; - return ret; -} - -/* - * Ooo, nasty. We need here to frob 32-bit unsigned longs to - * 64-bit unsigned longs. - */ -static -int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, - unsigned long *fdset) -{ - nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); - if (ufdset) { - unsigned long odd; - - if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t))) - return -EFAULT; - - odd = nr & 1UL; - nr &= ~1UL; - while (nr) { - unsigned long h, l; - if (__get_user(l, ufdset) || __get_user(h, ufdset+1)) - return -EFAULT; - ufdset += 2; - *fdset++ = h << 32 | l; - nr -= 2; - } - if (odd && __get_user(*fdset, ufdset)) - return -EFAULT; - } else { - /* Tricky, must clear full unsigned long in the - * kernel fdset at the end, this makes sure that - * actually happens. - */ - memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t)); - } - return 0; -} - -static -int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, - unsigned long *fdset) -{ - unsigned long odd; - nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); - - if (!ufdset) - return 0; - - odd = nr & 1UL; - nr &= ~1UL; - while (nr) { - unsigned long h, l; - l = *fdset++; - h = l >> 32; - if (__put_user(l, ufdset) || __put_user(h, ufdset+1)) - return -EFAULT; - ufdset += 2; - nr -= 2; - } - if (odd && __put_user(*fdset, ufdset)) - return -EFAULT; - return 0; -} - - -/* - * This is a virtual copy of sys_select from fs/select.c and probably - * should be compared to it from time to time - */ - -/* - * We can actually return ERESTARTSYS instead of EINTR, but I'd - * like to be certain this leads to no problems. So I return - * EINTR just for safety. - * - * Update: ERESTARTSYS breaks at least the xview clock binary, so - * I'm trying ERESTARTNOHAND which restart only when you want to. 
- */ -int compat_core_sys_select(int n, compat_ulong_t __user *inp, - compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct timespec *end_time) -{ - fd_set_bits fds; - void *bits; - int size, max_fds, ret = -EINVAL; - struct fdtable *fdt; - long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; - - if (n < 0) - goto out_nofds; - - /* max_fds can increase, so grab it once to avoid race */ - rcu_read_lock(); - fdt = files_fdtable(current->files); - max_fds = fdt->max_fds; - rcu_read_unlock(); - if (n > max_fds) - n = max_fds; - - /* - * We need 6 bitmaps (in/out/ex for both incoming and outgoing), - * since we used fdset we need to allocate memory in units of - * long-words. - */ - size = FDS_BYTES(n); - bits = stack_fds; - if (size > sizeof(stack_fds) / 6) { - bits = kmalloc(6 * size, GFP_KERNEL); - ret = -ENOMEM; - if (!bits) - goto out_nofds; - } - fds.in = (unsigned long *) bits; - fds.out = (unsigned long *) (bits + size); - fds.ex = (unsigned long *) (bits + 2*size); - fds.res_in = (unsigned long *) (bits + 3*size); - fds.res_out = (unsigned long *) (bits + 4*size); - fds.res_ex = (unsigned long *) (bits + 5*size); - - if ((ret = compat_get_fd_set(n, inp, fds.in)) || - (ret = compat_get_fd_set(n, outp, fds.out)) || - (ret = compat_get_fd_set(n, exp, fds.ex))) - goto out; - zero_fd_set(n, fds.res_in); - zero_fd_set(n, fds.res_out); - zero_fd_set(n, fds.res_ex); - - ret = do_select(n, &fds, end_time); - - if (ret < 0) - goto out; - if (!ret) { - ret = -ERESTARTNOHAND; - if (signal_pending(current)) - goto out; - ret = 0; - } - - if (compat_set_fd_set(n, inp, fds.res_in) || - compat_set_fd_set(n, outp, fds.res_out) || - compat_set_fd_set(n, exp, fds.res_ex)) - ret = -EFAULT; -out: - if (bits != stack_fds) - kfree(bits); -out_nofds: - return ret; -} - -COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, - compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, - struct compat_timeval __user *, tvp) -{ - struct timespec end_time, *to = NULL; - struct compat_timeval tv; - int ret; - - if (tvp) { - if (copy_from_user(&tv, tvp, sizeof(tv))) - return -EFAULT; - - to = &end_time; - if (poll_select_set_timeout(to, - tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), - (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) - return -EINVAL; - } - - ret = compat_core_sys_select(n, inp, outp, exp, to); - ret = poll_select_copy_remaining(&end_time, tvp, 1, ret); - - return ret; -} - -struct compat_sel_arg_struct { - compat_ulong_t n; - compat_uptr_t inp; - compat_uptr_t outp; - compat_uptr_t exp; - compat_uptr_t tvp; -}; - -COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) -{ - struct compat_sel_arg_struct a; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), - compat_ptr(a.exp), compat_ptr(a.tvp)); -} - -static long do_compat_pselect(int n, compat_ulong_t __user *inp, - compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, - compat_size_t sigsetsize) -{ - compat_sigset_t ss32; - sigset_t ksigmask, sigsaved; - struct compat_timespec ts; - struct timespec end_time, *to = NULL; - int ret; - - if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) - return -EFAULT; - - to = &end_time; - if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) - return -EINVAL; - } - - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) - return -EFAULT; 
- sigset_from_compat(&ksigmask, &ss32); - - sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } - - ret = compat_core_sys_select(n, inp, outp, exp, to); - ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); - - if (ret == -ERESTARTNOHAND) { - /* - * Don't restore the signal mask yet. Let do_signal() deliver - * the signal on the way back to userspace, before the signal - * mask is restored. - */ - if (sigmask) { - memcpy(&current->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } - } else if (sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - - return ret; -} - -COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, - compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, - struct compat_timespec __user *, tsp, void __user *, sig) -{ - compat_size_t sigsetsize = 0; - compat_uptr_t up = 0; - - if (sig) { - if (!access_ok(VERIFY_READ, sig, - sizeof(compat_uptr_t)+sizeof(compat_size_t)) || - __get_user(up, (compat_uptr_t __user *)sig) || - __get_user(sigsetsize, - (compat_size_t __user *)(sig+sizeof(up)))) - return -EFAULT; - } - return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up), - sigsetsize); -} - -COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, - unsigned int, nfds, struct compat_timespec __user *, tsp, - const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) -{ - compat_sigset_t ss32; - sigset_t ksigmask, sigsaved; - struct compat_timespec ts; - struct timespec end_time, *to = NULL; - int ret; - - if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) - return -EFAULT; - - to = &end_time; - if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) - return -EINVAL; - } - - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) - return -EFAULT; - sigset_from_compat(&ksigmask, &ss32); - - sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } - - ret = do_sys_poll(ufds, nfds, to); - - /* We can restart this syscall, usually */ - if (ret == -EINTR) { - /* - * Don't restore the signal mask yet. Let do_signal() deliver - * the signal on the way back to userspace, before the signal - * mask is restored. - */ - if (sigmask) { - memcpy(&current->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } - ret = -ERESTARTNOHAND; - } else if (sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - - ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); - - return ret; -} - #ifdef CONFIG_FHANDLE /* * Exactly like fs/open.c:sys_open_by_handle_at(), except that it diff --git a/fs/select.c b/fs/select.c index e2112270d75a..dd70937ddb60 100644 --- a/fs/select.c +++ b/fs/select.c @@ -338,6 +338,53 @@ sticky: return ret; } +/* + * Scalable version of the fd_set. + */ + +typedef struct { + unsigned long *in, *out, *ex; + unsigned long *res_in, *res_out, *res_ex; +} fd_set_bits; + +/* + * How many longwords for "nr" bits? + */ +#define FDS_BITPERLONG (8*sizeof(long)) +#define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG) +#define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long)) + +/* + * We do a VERIFY_WRITE here even though we are only reading this time: + * we'll write to it eventually.. + * + * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned.
+ */ +static inline +int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) +{ + nr = FDS_BYTES(nr); + if (ufdset) + return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0; + + memset(fdset, 0, nr); + return 0; +} + +static inline unsigned long __must_check +set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) +{ + if (ufdset) + return __copy_to_user(ufdset, fdset, FDS_BYTES(nr)); + return 0; +} + +static inline +void zero_fd_set(unsigned long nr, unsigned long *fdset) +{ + memset(fdset, 0, FDS_BYTES(nr)); +} + #define FDS_IN(fds, n) (fds->in + n) #define FDS_OUT(fds, n) (fds->out + n) #define FDS_EX(fds, n) (fds->ex + n) @@ -401,7 +448,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in, wait->_key |= POLLOUT_SET; } -int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) +static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; @@ -881,7 +928,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \ sizeof(struct pollfd)) -int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, +static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; @@ -1053,3 +1100,373 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, return ret; } + +#ifdef CONFIG_COMPAT +#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) + +static +int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p, + int timeval, int ret) +{ + struct timespec ts; + + if (!p) + return ret; + + if (current->personality & STICKY_TIMEOUTS) + goto sticky; + + /* No update for zero timeout */ + if (!end_time->tv_sec && !end_time->tv_nsec) + return ret; + + ktime_get_ts(&ts); + ts = timespec_sub(*end_time, ts); + if (ts.tv_sec < 0) + ts.tv_sec = ts.tv_nsec = 0; + + if (timeval) { + struct compat_timeval rtv; + + rtv.tv_sec = ts.tv_sec; + rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC; + + if (!copy_to_user(p, &rtv, sizeof(rtv))) + return ret; + } else { + struct compat_timespec rts; + + rts.tv_sec = ts.tv_sec; + rts.tv_nsec = ts.tv_nsec; + + if (!copy_to_user(p, &rts, sizeof(rts))) + return ret; + } + /* + * If an application puts its timeval in read-only memory, we + * don't want the Linux-specific update to the timeval to + * cause a fault after the select has completed + * successfully. However, because we're not updating the + * timeval, we can't restart the system call. + */ + +sticky: + if (ret == -ERESTARTNOHAND) + ret = -EINTR; + return ret; +} + +/* + * Ooo, nasty. We need here to frob 32-bit unsigned longs to + * 64-bit unsigned longs. + */ +static +int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, + unsigned long *fdset) +{ + nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); + if (ufdset) { + unsigned long odd; + + if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t))) + return -EFAULT; + + odd = nr & 1UL; + nr &= ~1UL; + while (nr) { + unsigned long h, l; + if (__get_user(l, ufdset) || __get_user(h, ufdset+1)) + return -EFAULT; + ufdset += 2; + *fdset++ = h << 32 | l; + nr -= 2; + } + if (odd && __get_user(*fdset, ufdset)) + return -EFAULT; + } else { + /* Tricky, must clear full unsigned long in the + * kernel fdset at the end, this makes sure that + * actually happens. 
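The word-frobbing in the loop above is easier to see with values filled in; a hedged stand-alone sketch of the kernel-side result on an LP64 host:

	#include <stdio.h>

	int main(void)
	{
		/* Two consecutive 32-bit compat words become one kernel long:
		 * l carries fds 0-31, h carries fds 32-63.
		 */
		unsigned long l = 0x000000ffUL;		/* fds 0-7 set */
		unsigned long h = 0x00000001UL;		/* fd 32 set */
		unsigned long merged = h << 32 | l;

		printf("merged = %#lx\n", merged);	/* 0x1000000ff */
		return 0;
	}
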
+ */ + memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t)); + } + return 0; +} + +static +int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, + unsigned long *fdset) +{ + unsigned long odd; + nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); + + if (!ufdset) + return 0; + + odd = nr & 1UL; + nr &= ~1UL; + while (nr) { + unsigned long h, l; + l = *fdset++; + h = l >> 32; + if (__put_user(l, ufdset) || __put_user(h, ufdset+1)) + return -EFAULT; + ufdset += 2; + nr -= 2; + } + if (odd && __put_user(*fdset, ufdset)) + return -EFAULT; + return 0; +} + + +/* + * This is a virtual copy of sys_select from fs/select.c and probably + * should be compared to it from time to time + */ + +/* + * We can actually return ERESTARTSYS instead of EINTR, but I'd + * like to be certain this leads to no problems. So I return + * EINTR just for safety. + * + * Update: ERESTARTSYS breaks at least the xview clock binary, so + * I'm trying ERESTARTNOHAND which restart only when you want to. + */ +static int compat_core_sys_select(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct timespec *end_time) +{ + fd_set_bits fds; + void *bits; + int size, max_fds, ret = -EINVAL; + struct fdtable *fdt; + long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; + + if (n < 0) + goto out_nofds; + + /* max_fds can increase, so grab it once to avoid race */ + rcu_read_lock(); + fdt = files_fdtable(current->files); + max_fds = fdt->max_fds; + rcu_read_unlock(); + if (n > max_fds) + n = max_fds; + + /* + * We need 6 bitmaps (in/out/ex for both incoming and outgoing), + * since we used fdset we need to allocate memory in units of + * long-words. + */ + size = FDS_BYTES(n); + bits = stack_fds; + if (size > sizeof(stack_fds) / 6) { + bits = kmalloc(6 * size, GFP_KERNEL); + ret = -ENOMEM; + if (!bits) + goto out_nofds; + } + fds.in = (unsigned long *) bits; + fds.out = (unsigned long *) (bits + size); + fds.ex = (unsigned long *) (bits + 2*size); + fds.res_in = (unsigned long *) (bits + 3*size); + fds.res_out = (unsigned long *) (bits + 4*size); + fds.res_ex = (unsigned long *) (bits + 5*size); + + if ((ret = compat_get_fd_set(n, inp, fds.in)) || + (ret = compat_get_fd_set(n, outp, fds.out)) || + (ret = compat_get_fd_set(n, exp, fds.ex))) + goto out; + zero_fd_set(n, fds.res_in); + zero_fd_set(n, fds.res_out); + zero_fd_set(n, fds.res_ex); + + ret = do_select(n, &fds, end_time); + + if (ret < 0) + goto out; + if (!ret) { + ret = -ERESTARTNOHAND; + if (signal_pending(current)) + goto out; + ret = 0; + } + + if (compat_set_fd_set(n, inp, fds.res_in) || + compat_set_fd_set(n, outp, fds.res_out) || + compat_set_fd_set(n, exp, fds.res_ex)) + ret = -EFAULT; +out: + if (bits != stack_fds) + kfree(bits); +out_nofds: + return ret; +} + +COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, + compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, + struct compat_timeval __user *, tvp) +{ + struct timespec end_time, *to = NULL; + struct compat_timeval tv; + int ret; + + if (tvp) { + if (copy_from_user(&tv, tvp, sizeof(tv))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, + tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), + (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) + return -EINVAL; + } + + ret = compat_core_sys_select(n, inp, outp, exp, to); + ret = compat_poll_select_copy_remaining(&end_time, tvp, 1, ret); + + return ret; +} + +struct compat_sel_arg_struct { + compat_ulong_t n; + compat_uptr_t inp; + compat_uptr_t outp; + compat_uptr_t exp; + 
compat_uptr_t tvp; +}; + +COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) +{ + struct compat_sel_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), + compat_ptr(a.exp), compat_ptr(a.tvp)); +} + +static long do_compat_pselect(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, + compat_size_t sigsetsize) +{ + compat_sigset_t ss32; + sigset_t ksigmask, sigsaved; + struct compat_timespec ts; + struct timespec end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&ksigmask, &ss32); + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = compat_core_sys_select(n, inp, outp, exp, to); + ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret); + + if (ret == -ERESTARTNOHAND) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. + */ + if (sigmask) { + memcpy(&current->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + return ret; +} + +COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, + compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, + struct compat_timespec __user *, tsp, void __user *, sig) +{ + compat_size_t sigsetsize = 0; + compat_uptr_t up = 0; + + if (sig) { + if (!access_ok(VERIFY_READ, sig, + sizeof(compat_uptr_t)+sizeof(compat_size_t)) || + __get_user(up, (compat_uptr_t __user *)sig) || + __get_user(sigsetsize, + (compat_size_t __user *)(sig+sizeof(up)))) + return -EFAULT; + } + return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up), + sigsetsize); +} + +COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, + unsigned int, nfds, struct compat_timespec __user *, tsp, + const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) +{ + compat_sigset_t ss32; + sigset_t ksigmask, sigsaved; + struct compat_timespec ts; + struct timespec end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&ksigmask, &ss32); + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = do_sys_poll(ufds, nfds, to); + + /* We can restart this syscall, usually */ + if (ret == -EINTR) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored.
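Seen from the other side of the ABI, the sig argument unpacked in pselect6 above is a pointer/size pair; a hedged userspace sketch of a raw call (illustrative only -- glibc's pselect(2) wrapper normally builds this for you, and the 8 assumes the x86-64 kernel sigset size):

	struct {
		const sigset_t *ss;	/* read kernel-side via __get_user(up, ...) */
		size_t ss_len;		/* read kernel-side as sigsetsize */
	} sig = { &mask, 8 };

	ret = syscall(__NR_pselect6, nfds, &rfds, NULL, NULL, &ts, &sig);
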
+ */ + if (sigmask) { + memcpy(&current->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + ret = -ERESTARTNOHAND; + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret); + + return ret; +} +#endif diff --git a/include/linux/poll.h b/include/linux/poll.h index a46d6755035e..75ffc5729e4c 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -98,64 +98,8 @@ extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack); extern u64 select_estimate_accuracy(struct timespec64 *tv); - -static inline int poll_schedule(struct poll_wqueues *pwq, int state) -{ - return poll_schedule_timeout(pwq, state, NULL, 0); -} - -/* - * Scalable version of the fd_set. - */ - -typedef struct { - unsigned long *in, *out, *ex; - unsigned long *res_in, *res_out, *res_ex; -} fd_set_bits; - -/* - * How many longwords for "nr" bits? - */ -#define FDS_BITPERLONG (8*sizeof(long)) -#define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG) -#define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long)) - -/* - * We do a VERIFY_WRITE here even though we are only reading this time: - * we'll write to it eventually.. - * - * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned. - */ -static inline -int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) -{ - nr = FDS_BYTES(nr); - if (ufdset) - return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0; - - memset(fdset, 0, nr); - return 0; -} - -static inline unsigned long __must_check -set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) -{ - if (ufdset) - return __copy_to_user(ufdset, fdset, FDS_BYTES(nr)); - return 0; -} - -static inline -void zero_fd_set(unsigned long nr, unsigned long *fdset) -{ - memset(fdset, 0, FDS_BYTES(nr)); -} - #define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1) -extern int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time); -extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds, - struct timespec64 *end_time); extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec64 *end_time); -- cgit v1.2.3 From 57240d007816486131bee88cd474c2a71f0fe224 Mon Sep 17 00:00:00 2001 From: "R. Parameswaran" Date: Wed, 12 Apr 2017 18:31:04 -0700 Subject: l2tp: device MTU setup, tunnel socket needs a lock The MTU overhead calculation in L2TP device set-up merged via commit b784e7ebfce8cfb16c6f95e14e8532d0768ab7ff needs to be adjusted to lock the tunnel socket while referencing the sub-data structures to derive the socket's IP overhead. Reported-by: Guillaume Nault Tested-by: Guillaume Nault Signed-off-by: R. Parameswaran Signed-off-by: David S. Miller --- include/linux/net.h | 2 +- net/l2tp/l2tp_eth.c | 2 ++ net/socket.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index a42fab24c8af..abcfa46a2bd9 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -298,7 +298,7 @@ int kernel_sendpage(struct socket *sock, struct page *page, int offset, int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg); int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how); -/* Following routine returns the IP overhead imposed by a socket. */ +/* Routine returns the IP overhead imposed by a (caller-protected) socket.
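The calling convention this fix establishes is simply the following (a minimal sketch, mirroring the l2tp_eth.c hunk below):

	lock_sock(sk);		/* protect the socket's sub-structures */
	overhead = kernel_sock_ip_overhead(sk);
	release_sock(sk);
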
*/ u32 kernel_sock_ip_overhead(struct sock *sk); #define MODULE_ALIAS_NETPROTO(proto) \ diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 138566a63123..b722d559c544 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -225,7 +225,9 @@ static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel, dev->needed_headroom += session->hdr_len; return; } + lock_sock(tunnel->sock); l3_overhead = kernel_sock_ip_overhead(tunnel->sock); + release_sock(tunnel->sock); if (l3_overhead == 0) { /* L3 Overhead couldn't be identified, this could be * because tunnel->sock was NULL or the socket's diff --git a/net/socket.c b/net/socket.c index eea997036ada..c2564eb25c6b 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3360,7 +3360,7 @@ EXPORT_SYMBOL(kernel_sock_shutdown); /* This routine returns the IP overhead imposed by a socket i.e. * the length of the underlying IP header, depending on whether * this is an IPv4 or IPv6 socket and the length from IP options turned - * on at the socket. + * on at the socket. Assumes that the caller has a lock on the socket. */ u32 kernel_sock_ip_overhead(struct sock *sk) { -- cgit v1.2.3 From d51e4af5c2092c48a06ceaf2323b13a39a2df4ee Mon Sep 17 00:00:00 2001 From: "Chopra, Manish" Date: Thu, 13 Apr 2017 04:54:44 -0700 Subject: qed: aRFS infrastructure support This patch adds the necessary APIs to interface with qede aRFS support, which arrives in a successive patch. It also reserves a separate PTT entry for aRFS; being on the fastpath, it uses that entry for hardware access instead of trying to acquire one at run time from the PTT pool. Signed-off-by: Manish Chopra Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed.h | 2 + drivers/net/ethernet/qlogic/qed/qed_cxt.c | 13 +- drivers/net/ethernet/qlogic/qed/qed_hsi.h | 187 +++++++++++++++++++++ .../net/ethernet/qlogic/qed/qed_init_fw_funcs.c | 129 ++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_l2.c | 133 +++++++++++++++ drivers/net/ethernet/qlogic/qed/qed_l2.h | 8 + drivers/net/ethernet/qlogic/qed/qed_main.c | 26 +++ drivers/net/ethernet/qlogic/qed/qed_reg_addr.h | 8 + drivers/net/ethernet/qlogic/qed/qed_sp.h | 1 + include/linux/qed/qed_eth_if.h | 8 + include/linux/qed/qed_if.h | 7 + 11 files changed, 519 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h index 4896ee0cc458..c539ba138db9 100644 --- a/drivers/net/ethernet/qlogic/qed/qed.h +++ b/drivers/net/ethernet/qlogic/qed/qed.h @@ -504,6 +504,8 @@ struct qed_hwfn { u8 dcbx_no_edpm; u8 db_bar_no_edpm; + struct qed_ptt *p_arfs_ptt; + /* p_ptp_ptt is valid for leading HWFN only */ struct qed_ptt *p_ptp_ptt; struct qed_simd_fp_handler simd_proto_handler[64]; diff --git a/drivers/net/ethernet/qlogic/qed/qed_cxt.c b/drivers/net/ethernet/qlogic/qed/qed_cxt.c index 15ef6ebed6bb..b3aaa985956e 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_cxt.c +++ b/drivers/net/ethernet/qlogic/qed/qed_cxt.c @@ -219,9 +219,6 @@ struct qed_cxt_mngr { */ u32 vf_count; - /* total number of SRQ's for this hwfn */ - u32 srq_count; - /* Acquired CIDs */ struct qed_cid_acquired_map acquired[MAX_CONN_TYPES]; @@ -237,6 +234,12 @@ struct qed_cxt_mngr { u32 t2_num_pages; u64 first_free; u64 last_free; + + /* total number of SRQ's for this hwfn */ + u32 srq_count; + + /* Maximal number of L2 steering filters */ + u32 arfs_count; }; static bool src_proto(enum protocol_type type) { @@ -291,6 +294,9 @@ static void qed_cxt_src_iids(struct qed_cxt_mngr *p_mngr, iids->pf_cids +=
p_mngr->conn_cfg[i].cid_count; iids->per_vf_cids += p_mngr->conn_cfg[i].cids_per_vf; } + + /* Add L2 filtering filters in addition */ + iids->pf_cids += p_mngr->arfs_count; } /* counts the iids for the Timers block configuration */ @@ -2007,6 +2013,7 @@ int qed_cxt_set_pf_params(struct qed_hwfn *p_hwfn, u32 rdma_tasks) qed_cxt_set_proto_cid_count(p_hwfn, PROTOCOLID_ETH, p_params->num_cons, 1); + p_hwfn->p_cxt_mngr->arfs_count = p_params->num_arfs_filters; break; } case QED_PCI_FCOE: diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h index 815c4ec5b458..858a57a73589 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h +++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h @@ -3473,6 +3473,11 @@ void qed_set_geneve_dest_port(struct qed_hwfn *p_hwfn, void qed_set_geneve_enable(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, bool eth_geneve_enable, bool ip_geneve_enable); +void qed_set_rfs_mode_disable(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u16 pf_id); +void qed_set_rfs_mode_enable(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + u16 pf_id, bool tcp, bool udp, + bool ipv4, bool ipv6); #define YSTORM_FLOW_CONTROL_MODE_OFFSET (IRO[0].base) #define YSTORM_FLOW_CONTROL_MODE_SIZE (IRO[0].size) @@ -4862,6 +4867,18 @@ struct eth_vport_tx_mode { __le16 reserved2[3]; }; +enum gft_filter_update_action { + GFT_ADD_FILTER, + GFT_DELETE_FILTER, + MAX_GFT_FILTER_UPDATE_ACTION +}; + +enum gft_logic_filter_type { + GFT_FILTER_TYPE, + RFS_FILTER_TYPE, + MAX_GFT_LOGIC_FILTER_TYPE +}; + /* Ramrod data for rx queue start ramrod */ struct rx_queue_start_ramrod_data { __le16 rx_queue_id; @@ -4932,6 +4949,16 @@ struct rx_udp_filter_data { __le32 tenant_id; }; +struct rx_update_gft_filter_data { + struct regpair pkt_hdr_addr; + __le16 pkt_hdr_length; + __le16 rx_qid_or_action_icid; + u8 vport_id; + u8 filter_type; + u8 filter_action; + u8 reserved; +}; + /* Ramrod data for rx queue start ramrod */ struct tx_queue_start_ramrod_data { __le16 sb_id; @@ -5075,6 +5102,166 @@ struct vport_update_ramrod_data { struct eth_vport_rss_config rss_config; }; +struct gft_cam_line { + __le32 camline; +#define GFT_CAM_LINE_VALID_MASK 0x1 +#define GFT_CAM_LINE_VALID_SHIFT 0 +#define GFT_CAM_LINE_DATA_MASK 0x3FFF +#define GFT_CAM_LINE_DATA_SHIFT 1 +#define GFT_CAM_LINE_MASK_BITS_MASK 0x3FFF +#define GFT_CAM_LINE_MASK_BITS_SHIFT 15 +#define GFT_CAM_LINE_RESERVED1_MASK 0x7 +#define GFT_CAM_LINE_RESERVED1_SHIFT 29 +}; + +struct gft_cam_line_mapped { + __le32 camline; +#define GFT_CAM_LINE_MAPPED_VALID_MASK 0x1 +#define GFT_CAM_LINE_MAPPED_VALID_SHIFT 0 +#define GFT_CAM_LINE_MAPPED_IP_VERSION_MASK 0x1 +#define GFT_CAM_LINE_MAPPED_IP_VERSION_SHIFT 1 +#define GFT_CAM_LINE_MAPPED_TUNNEL_IP_VERSION_MASK 0x1 +#define GFT_CAM_LINE_MAPPED_TUNNEL_IP_VERSION_SHIFT 2 +#define GFT_CAM_LINE_MAPPED_UPPER_PROTOCOL_TYPE_MASK 0xF +#define GFT_CAM_LINE_MAPPED_UPPER_PROTOCOL_TYPE_SHIFT 3 +#define GFT_CAM_LINE_MAPPED_TUNNEL_TYPE_MASK 0xF +#define GFT_CAM_LINE_MAPPED_TUNNEL_TYPE_SHIFT 7 +#define GFT_CAM_LINE_MAPPED_PF_ID_MASK 0xF +#define GFT_CAM_LINE_MAPPED_PF_ID_SHIFT 11 +#define GFT_CAM_LINE_MAPPED_IP_VERSION_MASK_MASK 0x1 +#define GFT_CAM_LINE_MAPPED_IP_VERSION_MASK_SHIFT 15 +#define GFT_CAM_LINE_MAPPED_TUNNEL_IP_VERSION_MASK_MASK 0x1 +#define GFT_CAM_LINE_MAPPED_TUNNEL_IP_VERSION_MASK_SHIFT 16 +#define GFT_CAM_LINE_MAPPED_UPPER_PROTOCOL_TYPE_MASK_MASK 0xF +#define GFT_CAM_LINE_MAPPED_UPPER_PROTOCOL_TYPE_MASK_SHIFT 17 +#define GFT_CAM_LINE_MAPPED_TUNNEL_TYPE_MASK_MASK 0xF +#define 
GFT_CAM_LINE_MAPPED_TUNNEL_TYPE_MASK_SHIFT 21 +#define GFT_CAM_LINE_MAPPED_PF_ID_MASK_MASK 0xF +#define GFT_CAM_LINE_MAPPED_PF_ID_MASK_SHIFT 25 +#define GFT_CAM_LINE_MAPPED_RESERVED1_MASK 0x7 +#define GFT_CAM_LINE_MAPPED_RESERVED1_SHIFT 29 +}; + +union gft_cam_line_union { + struct gft_cam_line cam_line; + struct gft_cam_line_mapped cam_line_mapped; +}; + +enum gft_profile_ip_version { + GFT_PROFILE_IPV4 = 0, + GFT_PROFILE_IPV6 = 1, + MAX_GFT_PROFILE_IP_VERSION +}; + +enum gft_profile_upper_protocol_type { + GFT_PROFILE_ROCE_PROTOCOL = 0, + GFT_PROFILE_RROCE_PROTOCOL = 1, + GFT_PROFILE_FCOE_PROTOCOL = 2, + GFT_PROFILE_ICMP_PROTOCOL = 3, + GFT_PROFILE_ARP_PROTOCOL = 4, + GFT_PROFILE_USER_TCP_SRC_PORT_1_INNER = 5, + GFT_PROFILE_USER_TCP_DST_PORT_1_INNER = 6, + GFT_PROFILE_TCP_PROTOCOL = 7, + GFT_PROFILE_USER_UDP_DST_PORT_1_INNER = 8, + GFT_PROFILE_USER_UDP_DST_PORT_2_OUTER = 9, + GFT_PROFILE_UDP_PROTOCOL = 10, + GFT_PROFILE_USER_IP_1_INNER = 11, + GFT_PROFILE_USER_IP_2_OUTER = 12, + GFT_PROFILE_USER_ETH_1_INNER = 13, + GFT_PROFILE_USER_ETH_2_OUTER = 14, + GFT_PROFILE_RAW = 15, + MAX_GFT_PROFILE_UPPER_PROTOCOL_TYPE +}; + +struct gft_ram_line { + __le32 low32bits; +#define GFT_RAM_LINE_VLAN_SELECT_MASK 0x3 +#define GFT_RAM_LINE_VLAN_SELECT_SHIFT 0 +#define GFT_RAM_LINE_TUNNEL_ENTROPHY_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_ENTROPHY_SHIFT 2 +#define GFT_RAM_LINE_TUNNEL_TTL_EQUAL_ONE_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_TTL_EQUAL_ONE_SHIFT 3 +#define GFT_RAM_LINE_TUNNEL_TTL_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_TTL_SHIFT 4 +#define GFT_RAM_LINE_TUNNEL_ETHERTYPE_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_ETHERTYPE_SHIFT 5 +#define GFT_RAM_LINE_TUNNEL_DST_PORT_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_DST_PORT_SHIFT 6 +#define GFT_RAM_LINE_TUNNEL_SRC_PORT_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_SRC_PORT_SHIFT 7 +#define GFT_RAM_LINE_TUNNEL_DSCP_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_DSCP_SHIFT 8 +#define GFT_RAM_LINE_TUNNEL_OVER_IP_PROTOCOL_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_OVER_IP_PROTOCOL_SHIFT 9 +#define GFT_RAM_LINE_TUNNEL_DST_IP_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_DST_IP_SHIFT 10 +#define GFT_RAM_LINE_TUNNEL_SRC_IP_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_SRC_IP_SHIFT 11 +#define GFT_RAM_LINE_TUNNEL_PRIORITY_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_PRIORITY_SHIFT 12 +#define GFT_RAM_LINE_TUNNEL_PROVIDER_VLAN_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_PROVIDER_VLAN_SHIFT 13 +#define GFT_RAM_LINE_TUNNEL_VLAN_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_VLAN_SHIFT 14 +#define GFT_RAM_LINE_TUNNEL_DST_MAC_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_DST_MAC_SHIFT 15 +#define GFT_RAM_LINE_TUNNEL_SRC_MAC_MASK 0x1 +#define GFT_RAM_LINE_TUNNEL_SRC_MAC_SHIFT 16 +#define GFT_RAM_LINE_TTL_EQUAL_ONE_MASK 0x1 +#define GFT_RAM_LINE_TTL_EQUAL_ONE_SHIFT 17 +#define GFT_RAM_LINE_TTL_MASK 0x1 +#define GFT_RAM_LINE_TTL_SHIFT 18 +#define GFT_RAM_LINE_ETHERTYPE_MASK 0x1 +#define GFT_RAM_LINE_ETHERTYPE_SHIFT 19 +#define GFT_RAM_LINE_RESERVED0_MASK 0x1 +#define GFT_RAM_LINE_RESERVED0_SHIFT 20 +#define GFT_RAM_LINE_TCP_FLAG_FIN_MASK 0x1 +#define GFT_RAM_LINE_TCP_FLAG_FIN_SHIFT 21 +#define GFT_RAM_LINE_TCP_FLAG_SYN_MASK 0x1 +#define GFT_RAM_LINE_TCP_FLAG_SYN_SHIFT 22 +#define GFT_RAM_LINE_TCP_FLAG_RST_MASK 0x1 +#define GFT_RAM_LINE_TCP_FLAG_RST_SHIFT 23 +#define GFT_RAM_LINE_TCP_FLAG_PSH_MASK 0x1 +#define GFT_RAM_LINE_TCP_FLAG_PSH_SHIFT 24 +#define GFT_RAM_LINE_TCP_FLAG_ACK_MASK 0x1 +#define GFT_RAM_LINE_TCP_FLAG_ACK_SHIFT 25 +#define GFT_RAM_LINE_TCP_FLAG_URG_MASK 0x1 +#define GFT_RAM_LINE_TCP_FLAG_URG_SHIFT 26 +#define 
GFT_RAM_LINE_TCP_FLAG_ECE_MASK 0x1 +#define GFT_RAM_LINE_TCP_FLAG_ECE_SHIFT 27 +#define GFT_RAM_LINE_TCP_FLAG_CWR_MASK 0x1 +#define GFT_RAM_LINE_TCP_FLAG_CWR_SHIFT 28 +#define GFT_RAM_LINE_TCP_FLAG_NS_MASK 0x1 +#define GFT_RAM_LINE_TCP_FLAG_NS_SHIFT 29 +#define GFT_RAM_LINE_DST_PORT_MASK 0x1 +#define GFT_RAM_LINE_DST_PORT_SHIFT 30 +#define GFT_RAM_LINE_SRC_PORT_MASK 0x1 +#define GFT_RAM_LINE_SRC_PORT_SHIFT 31 + __le32 high32bits; +#define GFT_RAM_LINE_DSCP_MASK 0x1 +#define GFT_RAM_LINE_DSCP_SHIFT 0 +#define GFT_RAM_LINE_OVER_IP_PROTOCOL_MASK 0x1 +#define GFT_RAM_LINE_OVER_IP_PROTOCOL_SHIFT 1 +#define GFT_RAM_LINE_DST_IP_MASK 0x1 +#define GFT_RAM_LINE_DST_IP_SHIFT 2 +#define GFT_RAM_LINE_SRC_IP_MASK 0x1 +#define GFT_RAM_LINE_SRC_IP_SHIFT 3 +#define GFT_RAM_LINE_PRIORITY_MASK 0x1 +#define GFT_RAM_LINE_PRIORITY_SHIFT 4 +#define GFT_RAM_LINE_PROVIDER_VLAN_MASK 0x1 +#define GFT_RAM_LINE_PROVIDER_VLAN_SHIFT 5 +#define GFT_RAM_LINE_VLAN_MASK 0x1 +#define GFT_RAM_LINE_VLAN_SHIFT 6 +#define GFT_RAM_LINE_DST_MAC_MASK 0x1 +#define GFT_RAM_LINE_DST_MAC_SHIFT 7 +#define GFT_RAM_LINE_SRC_MAC_MASK 0x1 +#define GFT_RAM_LINE_SRC_MAC_SHIFT 8 +#define GFT_RAM_LINE_TENANT_ID_MASK 0x1 +#define GFT_RAM_LINE_TENANT_ID_SHIFT 9 +#define GFT_RAM_LINE_RESERVED1_MASK 0x3FFFFF +#define GFT_RAM_LINE_RESERVED1_SHIFT 10 +}; + struct mstorm_eth_conn_ag_ctx { u8 byte0; u8 byte1; diff --git a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c index 2a50e2b7568f..67200c5498ab 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c +++ b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c @@ -961,3 +961,132 @@ void qed_set_geneve_enable(struct qed_hwfn *p_hwfn, qed_wr(p_hwfn, p_ptt, DORQ_REG_L2_EDPM_TUNNEL_NGE_IP_EN, ip_geneve_enable ? 
1 : 0); } + +#define T_ETH_PACKET_MATCH_RFS_EVENTID 25 +#define PARSER_ETH_CONN_CM_HDR (0x0) +#define CAM_LINE_SIZE sizeof(u32) +#define RAM_LINE_SIZE sizeof(u64) +#define REG_SIZE sizeof(u32) + +void qed_set_rfs_mode_disable(struct qed_hwfn *p_hwfn, + struct qed_ptt *p_ptt, u16 pf_id) +{ + union gft_cam_line_union camline; + struct gft_ram_line ramline; + u32 *p_ramline, i; + + p_ramline = (u32 *)&ramline; + + /*stop using gft logic */ + qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_GFT, 0); + qed_wr(p_hwfn, p_ptt, PRS_REG_CM_HDR_GFT, 0x0); + memset(&camline, 0, sizeof(union gft_cam_line_union)); + qed_wr(p_hwfn, p_ptt, PRS_REG_GFT_CAM + CAM_LINE_SIZE * pf_id, + camline.cam_line_mapped.camline); + memset(&ramline, 0, sizeof(union gft_cam_line_union)); + + for (i = 0; i < RAM_LINE_SIZE / REG_SIZE; i++) { + u32 hw_addr = PRS_REG_GFT_PROFILE_MASK_RAM; + + hw_addr += (RAM_LINE_SIZE * pf_id + i * REG_SIZE); + + qed_wr(p_hwfn, p_ptt, hw_addr, *(p_ramline + i)); + } +} + +void qed_set_rfs_mode_enable(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + u16 pf_id, bool tcp, bool udp, + bool ipv4, bool ipv6) +{ + u32 rfs_cm_hdr_event_id, *p_ramline; + union gft_cam_line_union camline; + struct gft_ram_line ramline; + int i; + + rfs_cm_hdr_event_id = qed_rd(p_hwfn, p_ptt, PRS_REG_CM_HDR_GFT); + p_ramline = (u32 *)&ramline; + + if (!ipv6 && !ipv4) + DP_NOTICE(p_hwfn, + "set_rfs_mode_enable: must accept at least on of - ipv4 or ipv6"); + if (!tcp && !udp) + DP_NOTICE(p_hwfn, + "set_rfs_mode_enable: must accept at least on of - udp or tcp"); + + rfs_cm_hdr_event_id |= T_ETH_PACKET_MATCH_RFS_EVENTID << + PRS_REG_CM_HDR_GFT_EVENT_ID_SHIFT; + rfs_cm_hdr_event_id |= PARSER_ETH_CONN_CM_HDR << + PRS_REG_CM_HDR_GFT_CM_HDR_SHIFT; + qed_wr(p_hwfn, p_ptt, PRS_REG_CM_HDR_GFT, rfs_cm_hdr_event_id); + + /* Configure Registers for RFS mode */ + qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_GFT, 1); + qed_wr(p_hwfn, p_ptt, PRS_REG_LOAD_L2_FILTER, 0); + camline.cam_line_mapped.camline = 0; + + /* cam line is now valid!! */ + SET_FIELD(camline.cam_line_mapped.camline, + GFT_CAM_LINE_MAPPED_VALID, 1); + + /* filters are per PF!! 
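For readers new to the qed HSI helpers, SET_FIELD() is roughly a mask-and-shift field update; a sketch of the idea (the exact macro lives in the qed/common HSI headers, so treat this expansion as an assumption):

	/* Roughly: value &= ~(name##_MASK << name##_SHIFT);
	 *          value |= (val & name##_MASK) << name##_SHIFT;
	 * e.g. SET_FIELD(camline, GFT_CAM_LINE_MAPPED_PF_ID, 3)
	 * with MASK 0xF and SHIFT 11 ORs in 3 << 11 = 0x1800.
	 */
	u32 camline = 0;

	camline |= (3 & 0xF) << 11;	/* GFT_CAM_LINE_MAPPED_PF_ID = 3 */
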
*/ + SET_FIELD(camline.cam_line_mapped.camline, + GFT_CAM_LINE_MAPPED_PF_ID_MASK, 1); + SET_FIELD(camline.cam_line_mapped.camline, + GFT_CAM_LINE_MAPPED_PF_ID, pf_id); + if (!(tcp && udp)) { + SET_FIELD(camline.cam_line_mapped.camline, + GFT_CAM_LINE_MAPPED_UPPER_PROTOCOL_TYPE_MASK, 1); + if (tcp) + SET_FIELD(camline.cam_line_mapped.camline, + GFT_CAM_LINE_MAPPED_UPPER_PROTOCOL_TYPE, + GFT_PROFILE_TCP_PROTOCOL); + else + SET_FIELD(camline.cam_line_mapped.camline, + GFT_CAM_LINE_MAPPED_UPPER_PROTOCOL_TYPE, + GFT_PROFILE_UDP_PROTOCOL); + } + + if (!(ipv4 && ipv6)) { + SET_FIELD(camline.cam_line_mapped.camline, + GFT_CAM_LINE_MAPPED_IP_VERSION_MASK, 1); + if (ipv4) + SET_FIELD(camline.cam_line_mapped.camline, + GFT_CAM_LINE_MAPPED_IP_VERSION, + GFT_PROFILE_IPV4); + else + SET_FIELD(camline.cam_line_mapped.camline, + GFT_CAM_LINE_MAPPED_IP_VERSION, + GFT_PROFILE_IPV6); + } + + /* write characteristics to cam */ + qed_wr(p_hwfn, p_ptt, PRS_REG_GFT_CAM + CAM_LINE_SIZE * pf_id, + camline.cam_line_mapped.camline); + camline.cam_line_mapped.camline = qed_rd(p_hwfn, p_ptt, + PRS_REG_GFT_CAM + + CAM_LINE_SIZE * pf_id); + + /* write line to RAM - compare to filter 4 tuple */ + ramline.low32bits = 0; + ramline.high32bits = 0; + SET_FIELD(ramline.high32bits, GFT_RAM_LINE_DST_IP, 1); + SET_FIELD(ramline.high32bits, GFT_RAM_LINE_SRC_IP, 1); + SET_FIELD(ramline.low32bits, GFT_RAM_LINE_SRC_PORT, 1); + SET_FIELD(ramline.low32bits, GFT_RAM_LINE_DST_PORT, 1); + + /* each iteration write to reg */ + for (i = 0; i < RAM_LINE_SIZE / REG_SIZE; i++) + qed_wr(p_hwfn, p_ptt, + PRS_REG_GFT_PROFILE_MASK_RAM + RAM_LINE_SIZE * pf_id + + i * REG_SIZE, *(p_ramline + i)); + + /* set default profile so that no filter match will happen */ + ramline.low32bits = 0xffff; + ramline.high32bits = 0xffff; + + for (i = 0; i < RAM_LINE_SIZE / REG_SIZE; i++) + qed_wr(p_hwfn, p_ptt, + PRS_REG_GFT_PROFILE_MASK_RAM + RAM_LINE_SIZE * + PRS_GFT_CAM_LINES_NO_MATCH + i * REG_SIZE, + *(p_ramline + i)); +} diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c index d56441da87c5..eb5e280eb104 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_l2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c @@ -1799,6 +1799,84 @@ void qed_reset_vport_stats(struct qed_dev *cdev) _qed_get_vport_stats(cdev, cdev->reset_stats); } +static void +qed_arfs_mode_configure(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + struct qed_arfs_config_params *p_cfg_params) +{ + if (p_cfg_params->arfs_enable) { + qed_set_rfs_mode_enable(p_hwfn, p_ptt, p_hwfn->rel_pf_id, + p_cfg_params->tcp, p_cfg_params->udp, + p_cfg_params->ipv4, p_cfg_params->ipv6); + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "tcp = %s, udp = %s, ipv4 = %s, ipv6 =%s\n", + p_cfg_params->tcp ? "Enable" : "Disable", + p_cfg_params->udp ? "Enable" : "Disable", + p_cfg_params->ipv4 ? "Enable" : "Disable", + p_cfg_params->ipv6 ? "Enable" : "Disable"); + } else { + qed_set_rfs_mode_disable(p_hwfn, p_ptt, p_hwfn->rel_pf_id); + } + + DP_VERBOSE(p_hwfn, QED_MSG_SP, "Configured ARFS mode : %s\n", + p_cfg_params->arfs_enable ? 
"Enable" : "Disable"); +} + +static int +qed_configure_rfs_ntuple_filter(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + struct qed_spq_comp_cb *p_cb, + dma_addr_t p_addr, u16 length, u16 qid, + u8 vport_id, bool b_is_add) +{ + struct rx_update_gft_filter_data *p_ramrod = NULL; + struct qed_spq_entry *p_ent = NULL; + struct qed_sp_init_data init_data; + u16 abs_rx_q_id = 0; + u8 abs_vport_id = 0; + int rc = -EINVAL; + + rc = qed_fw_vport(p_hwfn, vport_id, &abs_vport_id); + if (rc) + return rc; + + rc = qed_fw_l2_queue(p_hwfn, qid, &abs_rx_q_id); + if (rc) + return rc; + + /* Get SPQ entry */ + memset(&init_data, 0, sizeof(init_data)); + init_data.cid = qed_spq_get_cid(p_hwfn); + + init_data.opaque_fid = p_hwfn->hw_info.opaque_fid; + + if (p_cb) { + init_data.comp_mode = QED_SPQ_MODE_CB; + init_data.p_comp_data = p_cb; + } else { + init_data.comp_mode = QED_SPQ_MODE_EBLOCK; + } + + rc = qed_sp_init_request(p_hwfn, &p_ent, + ETH_RAMROD_GFT_UPDATE_FILTER, + PROTOCOLID_ETH, &init_data); + if (rc) + return rc; + + p_ramrod = &p_ent->ramrod.rx_update_gft; + DMA_REGPAIR_LE(p_ramrod->pkt_hdr_addr, p_addr); + p_ramrod->pkt_hdr_length = cpu_to_le16(length); + p_ramrod->rx_qid_or_action_icid = cpu_to_le16(abs_rx_q_id); + p_ramrod->vport_id = abs_vport_id; + p_ramrod->filter_type = RFS_FILTER_TYPE; + p_ramrod->filter_action = b_is_add ? GFT_ADD_FILTER : GFT_DELETE_FILTER; + + DP_VERBOSE(p_hwfn, QED_MSG_SP, + "V[%0x], Q[%04x] - %s filter from 0x%llx [length %04xb]\n", + abs_vport_id, abs_rx_q_id, + b_is_add ? "Adding" : "Removing", (u64)p_addr, length); + + return qed_spq_post(p_hwfn, p_ent, NULL); +} + static int qed_fill_eth_dev_info(struct qed_dev *cdev, struct qed_dev_eth_info *info) { @@ -2356,6 +2434,59 @@ static int qed_configure_filter(struct qed_dev *cdev, } } +static int qed_configure_arfs_searcher(struct qed_dev *cdev, bool en_searcher) +{ + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + struct qed_arfs_config_params arfs_config_params; + + memset(&arfs_config_params, 0, sizeof(arfs_config_params)); + arfs_config_params.tcp = true; + arfs_config_params.udp = true; + arfs_config_params.ipv4 = true; + arfs_config_params.ipv6 = true; + arfs_config_params.arfs_enable = en_searcher; + + qed_arfs_mode_configure(p_hwfn, p_hwfn->p_arfs_ptt, + &arfs_config_params); + return 0; +} + +static void +qed_arfs_sp_response_handler(struct qed_hwfn *p_hwfn, + void *cookie, union event_ring_data *data, + u8 fw_return_code) +{ + struct qed_common_cb_ops *op = p_hwfn->cdev->protocol_ops.common; + void *dev = p_hwfn->cdev->ops_cookie; + + op->arfs_filter_op(dev, cookie, fw_return_code); +} + +static int qed_ntuple_arfs_filter_config(struct qed_dev *cdev, void *cookie, + dma_addr_t mapping, u16 length, + u16 vport_id, u16 rx_queue_id, + bool add_filter) +{ + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + struct qed_spq_comp_cb cb; + int rc = -EINVAL; + + cb.function = qed_arfs_sp_response_handler; + cb.cookie = cookie; + + rc = qed_configure_rfs_ntuple_filter(p_hwfn, p_hwfn->p_arfs_ptt, + &cb, mapping, length, rx_queue_id, + vport_id, add_filter); + if (rc) + DP_NOTICE(p_hwfn, + "Failed to issue a-RFS filter configuration\n"); + else + DP_VERBOSE(p_hwfn, NETIF_MSG_DRV, + "Successfully issued a-RFS filter configuration\n"); + + return rc; +} + static int qed_fp_cqe_completion(struct qed_dev *dev, u8 rss_id, struct eth_slow_path_rx_cqe *cqe) { @@ -2397,6 +2528,8 @@ static const struct qed_eth_ops qed_eth_ops_pass = { .eth_cqe_completion = &qed_fp_cqe_completion, .get_vport_stats = &qed_get_vport_stats, 
.tunn_config = &qed_tunn_configure, + .ntuple_filter_config = &qed_ntuple_arfs_filter_config, + .configure_arfs_searcher = &qed_configure_arfs_searcher, }; const struct qed_eth_ops *qed_get_eth_ops(void) diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.h b/drivers/net/ethernet/qlogic/qed/qed_l2.h index e763abd334f6..6f44229899eb 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_l2.h +++ b/drivers/net/ethernet/qlogic/qed/qed_l2.h @@ -185,6 +185,14 @@ struct qed_filter_accept_flags { #define QED_ACCEPT_BCAST 0x20 }; +struct qed_arfs_config_params { + bool tcp; + bool udp; + bool ipv4; + bool ipv6; + bool arfs_enable; +}; + struct qed_sp_vport_update_params { u16 opaque_fid; u8 vport_id; diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 029f431e89ec..da562cf8a965 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -883,6 +883,9 @@ static void qed_update_pf_params(struct qed_dev *cdev, params->rdma_pf_params.gl_pi = QED_ROCE_PROTOCOL_INDEX; } + if (cdev->num_hwfns > 1 || IS_VF(cdev)) + params->eth_pf_params.num_arfs_filters = 0; + /* In case we might support RDMA, don't allow qede to be greedy * with the L2 contexts. Allow for 64 queues [rx, tx, xdp] per hwfn. */ @@ -926,6 +929,18 @@ static int qed_slowpath_start(struct qed_dev *cdev, goto err; } +#ifdef CONFIG_RFS_ACCEL + if (cdev->num_hwfns == 1) { + p_ptt = qed_ptt_acquire(QED_LEADING_HWFN(cdev)); + if (p_ptt) { + QED_LEADING_HWFN(cdev)->p_arfs_ptt = p_ptt; + } else { + DP_NOTICE(cdev, + "Failed to acquire PTT for aRFS\n"); + goto err; + } + } +#endif p_ptt = qed_ptt_acquire(QED_LEADING_HWFN(cdev)); if (p_ptt) { QED_LEADING_HWFN(cdev)->p_ptp_ptt = p_ptt; @@ -1032,6 +1047,12 @@ err: if (IS_PF(cdev)) release_firmware(cdev->firmware); +#ifdef CONFIG_RFS_ACCEL + if (IS_PF(cdev) && (cdev->num_hwfns == 1) && + QED_LEADING_HWFN(cdev)->p_arfs_ptt) + qed_ptt_release(QED_LEADING_HWFN(cdev), + QED_LEADING_HWFN(cdev)->p_arfs_ptt); +#endif if (IS_PF(cdev) && QED_LEADING_HWFN(cdev)->p_ptp_ptt) qed_ptt_release(QED_LEADING_HWFN(cdev), QED_LEADING_HWFN(cdev)->p_ptp_ptt); @@ -1049,6 +1070,11 @@ static int qed_slowpath_stop(struct qed_dev *cdev) qed_ll2_dealloc_if(cdev); if (IS_PF(cdev)) { +#ifdef CONFIG_RFS_ACCEL + if (cdev->num_hwfns == 1) + qed_ptt_release(QED_LEADING_HWFN(cdev), + QED_LEADING_HWFN(cdev)->p_arfs_ptt); +#endif qed_ptt_release(QED_LEADING_HWFN(cdev), QED_LEADING_HWFN(cdev)->p_ptp_ptt); qed_free_stream_mem(cdev); diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h index e65397360ab4..1ae73b2d6d1e 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h +++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h @@ -1560,4 +1560,12 @@ #define NIG_REG_TSGEN_FREECNT_UPDATE_K2 0x509008UL #define CNIG_REG_NIG_PORT0_CONF_K2 0x218200UL +#define PRS_REG_SEARCH_GFT 0x1f11bcUL +#define PRS_REG_CM_HDR_GFT 0x1f11c8UL +#define PRS_REG_GFT_CAM 0x1f1100UL +#define PRS_REG_GFT_PROFILE_MASK_RAM 0x1f1000UL +#define PRS_REG_CM_HDR_GFT_EVENT_ID_SHIFT 0 +#define PRS_REG_CM_HDR_GFT_CM_HDR_SHIFT 8 +#define PRS_REG_LOAD_L2_FILTER 0x1f0198UL + #endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h index 30393ffaa8e5..583c8d38c8d7 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sp.h +++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h @@ -84,6 +84,7 @@ union ramrod_data { struct tx_queue_stop_ramrod_data tx_queue_stop; struct vport_start_ramrod_data 
vport_start; struct vport_stop_ramrod_data vport_stop; + struct rx_update_gft_filter_data rx_update_gft; struct vport_update_ramrod_data vport_update; struct core_rx_start_ramrod_data core_rx_queue_start; struct core_rx_stop_ramrod_data core_rx_queue_stop; diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h index 4cd1f0ccfa36..1eba803cb7f1 100644 --- a/include/linux/qed/qed_eth_if.h +++ b/include/linux/qed/qed_eth_if.h @@ -301,6 +301,14 @@ struct qed_eth_ops { int (*tunn_config)(struct qed_dev *cdev, struct qed_tunn_params *params); + + int (*ntuple_filter_config)(struct qed_dev *cdev, void *cookie, + dma_addr_t mapping, u16 length, + u16 vport_id, u16 rx_queue_id, + bool add_filter); + + int (*configure_arfs_searcher)(struct qed_dev *cdev, + bool en_searcher); }; const struct qed_eth_ops *qed_get_eth_ops(void); diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 625f80f08f91..d44933a058ee 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -178,6 +178,12 @@ struct qed_eth_pf_params { * to update_pf_params routine invoked before slowpath start */ u16 num_cons; + + /* To enable arfs, prior to HW-init a positive number needs to be + * set [as filters require allocated searcher ILT memory]. + * This will set the maximal number of configured steering-filters. + */ + u32 num_arfs_filters; }; struct qed_fcoe_pf_params { @@ -427,6 +433,7 @@ struct qed_int_info { }; struct qed_common_cb_ops { + void (*arfs_filter_op)(void *dev, void *fltr, u8 fw_rc); void (*link_update)(void *dev, struct qed_link_output *link); void (*dcbx_aen)(void *dev, struct qed_dcbx_get *get, u32 mib_type); -- cgit v1.2.3 From d27158c0cf080c85753f34e7a20a91c3ba20a0b9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 30 Mar 2017 19:26:35 -0500 Subject: signal: Remove unused definition of sig_user_defined Signed-off-by: "Eric W. Biederman" --- include/linux/signal.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/signal.h b/include/linux/signal.h index 94ad6eea9550..1f5a16620693 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -390,10 +390,6 @@ int unhandled_signal(struct task_struct *tsk, int sig); #define sig_kernel_ignore(sig) siginmask(sig, SIG_KERNEL_IGNORE_MASK) #define sig_kernel_stop(sig) siginmask(sig, SIG_KERNEL_STOP_MASK) -#define sig_user_defined(t, signr) \ (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN)) - -#define sig_fatal(t, signr) \ (!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL) -- cgit v1.2.3 From f51b17c8d90f85456579c3192ab59ee031835634 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 17 Apr 2017 21:34:56 +0800 Subject: boot/param: Move next_arg() function to lib/cmdline.c for later reuse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit next_arg() will be used to parse boot parameters in the x86/boot/compressed code, so move it to lib/cmdline.c for better code reuse. No change in functionality.
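As a quick illustration of what next_arg() does (this example is not part of the patch; the sample buffer and variable names are made up), a caller walks a command-line-style string like so:

	char buf[] = "console=ttyS0,115200 quiet root=\"/dev/sda1\"";
	char *args = buf, *param, *val;

	while (*args) {
		args = next_arg(args, &param, &val);
		/* 1st pass: param = "console", val = "ttyS0,115200" */
		/* 2nd pass: param = "quiet",   val = NULL           */
		/* 3rd pass: param = "root",    val = "/dev/sda1"    */
	}

Each call consumes one token, NUL-terminates it in place, strips the quotes around a quoted value, and returns a pointer past any trailing spaces.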
Signed-off-by: Baoquan He Cc: Andrew Morton Cc: Gustavo Padovan Cc: Jens Axboe Cc: Jessica Yu Cc: Johannes Berg Cc: Josh Triplett Cc: Larry Finger Cc: Linus Torvalds Cc: Niklas Söderlund Cc: Peter Zijlstra Cc: Petr Mladek Cc: Rasmus Villemoes Cc: Thomas Gleixner Cc: dan.j.williams@intel.com Cc: dave.jiang@intel.com Cc: dyoung@redhat.com Cc: keescook@chromium.org Cc: zijun_hu Link: http://lkml.kernel.org/r/1492436099-4017-2-git-send-email-bhe@redhat.com Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 1 + kernel/params.c | 52 --------------------------------------------- lib/cmdline.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 4c26dc3a8295..7ae256717a32 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -438,6 +438,7 @@ extern int get_option(char **str, int *pint); extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(const char *ptr, char **retptr); extern bool parse_option_str(const char *str, const char *option); +extern char *next_arg(char *args, char **param, char **val); extern int core_kernel_text(unsigned long addr); extern int core_kernel_data(unsigned long addr); diff --git a/kernel/params.c b/kernel/params.c index a6d6149c0fe6..60b2d8101355 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -160,58 +160,6 @@ static int parse_one(char *param, return -ENOENT; } -/* You can use " around spaces, but can't escape ". */ -/* Hyphens and underscores equivalent in parameter names. */ -static char *next_arg(char *args, char **param, char **val) -{ - unsigned int i, equals = 0; - int in_quote = 0, quoted = 0; - char *next; - - if (*args == '"') { - args++; - in_quote = 1; - quoted = 1; - } - - for (i = 0; args[i]; i++) { - if (isspace(args[i]) && !in_quote) - break; - if (equals == 0) { - if (args[i] == '=') - equals = i; - } - if (args[i] == '"') - in_quote = !in_quote; - } - - *param = args; - if (!equals) - *val = NULL; - else { - args[equals] = '\0'; - *val = args + equals + 1; - - /* Don't include quotes in value. */ - if (**val == '"') { - (*val)++; - if (args[i-1] == '"') - args[i-1] = '\0'; - } - } - if (quoted && args[i-1] == '"') - args[i-1] = '\0'; - - if (args[i]) { - args[i] = '\0'; - next = args + i + 1; - } else - next = args + i; - - /* Chew up trailing spaces. */ - return skip_spaces(next); -} - /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ char *parse_args(const char *doing, char *args, diff --git a/lib/cmdline.c b/lib/cmdline.c index 8f13cf73c2ec..3c6432df7e63 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -15,6 +15,7 @@ #include #include #include +#include /* * If a hyphen was found in get_option, this will handle the @@ -189,3 +190,59 @@ bool parse_option_str(const char *str, const char *option) return false; } + +/* + * Parse a string to get a param value pair. + * You can use " around spaces, but can't escape ". + * Hyphens and underscores equivalent in parameter names. 
+ */ +char *next_arg(char *args, char **param, char **val) +{ + unsigned int i, equals = 0; + int in_quote = 0, quoted = 0; + char *next; + + if (*args == '"') { + args++; + in_quote = 1; + quoted = 1; + } + + for (i = 0; args[i]; i++) { + if (isspace(args[i]) && !in_quote) + break; + if (equals == 0) { + if (args[i] == '=') + equals = i; + } + if (args[i] == '"') + in_quote = !in_quote; + } + + *param = args; + if (!equals) + *val = NULL; + else { + args[equals] = '\0'; + *val = args + equals + 1; + + /* Don't include quotes in value. */ + if (**val == '"') { + (*val)++; + if (args[i-1] == '"') + args[i-1] = '\0'; + } + } + if (quoted && args[i-1] == '"') + args[i-1] = '\0'; + + if (args[i]) { + args[i] = '\0'; + next = args + i + 1; + } else + next = args + i; + + /* Chew up trailing spaces. */ + return skip_spaces(next); +} -- cgit v1.2.3 From 21470e32ca7f976bf131aa3c7b54019d07f7d821 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sun, 16 Apr 2017 21:51:07 -0300 Subject: usb: fix some references for /proc/bus/usb Since we got rid of usbfs, /proc/bus/usb now lives elsewhere. Fix references to it. Signed-off-by: Mauro Carvalho Chehab Acked-by: Serge Hallyn Signed-off-by: Greg Kroah-Hartman --- drivers/media/usb/pwc/philips.txt | 2 +- drivers/usb/class/Kconfig | 2 +- drivers/usb/class/usblp.c | 2 +- drivers/usb/core/devices.c | 4 ++-- drivers/usb/storage/unusual_devs.h | 2 +- include/linux/usb.h | 2 +- include/uapi/linux/capability.h | 2 +- include/uapi/linux/usb/ch9.h | 3 ++- tools/usb/usbip/README | 2 +- 9 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/media/usb/pwc/philips.txt b/drivers/media/usb/pwc/philips.txt index d38dd791511e..be8c80eff374 100644 --- a/drivers/media/usb/pwc/philips.txt +++ b/drivers/media/usb/pwc/philips.txt @@ -140,7 +140,7 @@ dev_hint A camera is specified by its type (the number from the camera model, like PCA645, PCVC750VC, etc) and optionally the serial number (visible - in /proc/bus/usb/devices). A hint consists of a string with the following + in /sys/kernel/debug/usb/devices). A hint consists of a string with the following format: [type[.serialnumber]:]node diff --git a/drivers/usb/class/Kconfig b/drivers/usb/class/Kconfig index bb8b73682a70..971385fe9abc 100644 --- a/drivers/usb/class/Kconfig +++ b/drivers/usb/class/Kconfig @@ -12,7 +12,7 @@ config USB_ACM Please read for details. If your modem only reports "Cls=ff(vend.)" in the descriptors in - /proc/bus/usb/devices, then your modem will not work with this + /sys/kernel/debug/usb/devices, then your modem will not work with this driver. To compile this driver as a module, choose M here: the diff --git a/drivers/usb/class/usblp.c b/drivers/usb/class/usblp.c index 73bd9a2ac530..fb87c17ed6fa 100644 --- a/drivers/usb/class/usblp.c +++ b/drivers/usb/class/usblp.c @@ -294,7 +294,7 @@ static int usblp_ctrl_msg(struct usblp *usblp, int request, int type, int dir, i /* * See the description for usblp_select_alts() below for the usage - * explanation. Look into your /proc/bus/usb/devices and dmesg in + * explanation. Look into your /sys/kernel/debug/usb/devices and dmesg in * case of any trouble. */ static int proto_bias = -1; diff --git a/drivers/usb/core/devices.c b/drivers/usb/core/devices.c index f2987ddb1cde..55dea2e7828f 100644 --- a/drivers/usb/core/devices.c +++ b/drivers/usb/core/devices.c @@ -24,7 +24,7 @@ * /devices contains USB topology, device, config, class, * interface, & endpoint data.
* - * I considered using /proc/bus/usb/devices/device# for each device + * I considered using /dev/bus/usb/device# for each device * as it is attached or detached, but I didn't like this for some * reason -- maybe it's just too deep of a directory structure. * I also don't like looking in multiple places to gather and view @@ -40,7 +40,7 @@ * Converted the whole proc stuff to real * read methods. Now not the whole device list needs to fit * into one page, only the device list for one bus. - * Added a poll method to /proc/bus/usb/devices, to wake + * Added a poll method to /sys/kernel/debug/usb/devices, to wake * up an eventual usbd * 2000-01-04: Thomas Sailer * Turned into its own filesystem diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index 9129f6cb8230..a2b748975457 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -42,7 +42,7 @@ * - a patch that adds the entry for your device, including your * email address right above the entry (plus maybe a brief * explanation of the reason for the entry), - * - a copy of /proc/bus/usb/devices with your device plugged in + * - a copy of /sys/kernel/debug/usb/devices with your device plugged in * running with this patch. * Send your submission to either Phil Dibowitz or * Alan Stern , and don't forget to CC: the diff --git a/include/linux/usb.h b/include/linux/usb.h index 226557362d36..cb9fbd54386e 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -318,7 +318,7 @@ void usb_put_intf(struct usb_interface *intf); * struct usb_interface (which persists only as long as its configuration * is installed). The altsetting arrays can be accessed through these * structures at any time, permitting comparison of configurations and - * providing support for the /proc/bus/usb/devices pseudo-file. + * providing support for the /sys/kernel/debug/usb/devices pseudo-file. */ struct usb_interface_cache { unsigned num_altsetting; /* number of alternate settings */ diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 49bc06295398..6fe14d001f68 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -205,7 +205,7 @@ struct vfs_cap_data { #define CAP_SYS_MODULE 16 /* Allow ioperm/iopl access */ -/* Allow sending USB messages to any device via /proc/bus/usb */ +/* Allow sending USB messages to any device via /dev/bus/usb */ #define CAP_SYS_RAWIO 17 diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index 2c5d7c4a69e3..ce1169af39d7 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -224,7 +224,8 @@ struct usb_ctrlrequest { * through the Linux-USB APIs, they are not converted to cpu byte * order; it is the responsibility of the client code to do this. * The single exception is when device and configuration descriptors (but - * not other descriptors) are read from usbfs (i.e. /proc/bus/usb/BBB/DDD); + * not other descriptors) are read from character devices + * (i.e. /dev/bus/usb/BBB/DDD); * in this case the fields are converted to host endianness by the kernel. */ diff --git a/tools/usb/usbip/README b/tools/usb/usbip/README index 5eb2b6c7722b..7844490fc603 100644 --- a/tools/usb/usbip/README +++ b/tools/usb/usbip/README @@ -244,7 +244,7 @@ Detach the imported device: - See 'Debug Tips' on the project wiki. - http://usbip.wiki.sourceforge.net/how-to-debug-usbip - usbip-host.ko must be bound to the target device. - - See /proc/bus/usb/devices and find "Driver=..." 
lines of the device. + - See /sys/kernel/debug/usb/devices and find "Driver=..." lines of the device. - Target USB gadget must be bound to vudc (using USB gadget susbsys, not usbip bind command) - Shutdown firewall. -- cgit v1.2.3 From 0c688614dcce84dfdbb305fd1c399c06cecea745 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 12 Apr 2017 18:37:14 -0400 Subject: console: move console_init() out of tty_io.c All the console driver handling code lives in printk.c. Move console_init() there as well so console support can still be used when the TTY code is configured out. No logical changes from this patch. Signed-off-by: Nicolas Pitre Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_io.c | 24 ------------------------ include/linux/console.h | 2 ++ include/linux/tty.h | 7 ++++--- init/main.c | 2 +- kernel/printk/printk.c | 24 ++++++++++++++++++++++++ 5 files changed, 31 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index dd0c1aa60402..8244b6199784 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -3584,30 +3584,6 @@ void tty_default_fops(struct file_operations *fops) *fops = tty_fops; } -/* - * Initialize the console device. This is called *early*, so - * we can't necessarily depend on lots of kernel help here. - * Just do some early initializations, and do the complex setup - * later. - */ -void __init console_init(void) -{ - initcall_t *call; - - /* Setup the default TTY line discipline. */ - n_tty_init(); - - /* - * set up the console device so that later boot sequences can - * inform about problems etc.. - */ - call = __con_initcall_start; - while (call < __con_initcall_end) { - (*call)(); - call++; - } -} - static char *tty_devnode(struct device *dev, umode_t *mode) { if (!mode) diff --git a/include/linux/console.h b/include/linux/console.h index 5949d1855589..b8920a031a3e 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -212,4 +212,6 @@ extern bool vgacon_text_force(void); static inline bool vgacon_text_force(void) { return false; } #endif +extern void console_init(void); + #endif /* _LINUX_CONSOLE_H */ diff --git a/include/linux/tty.h b/include/linux/tty.h index 1017e904c0a3..f1106d7c73b6 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -390,7 +390,6 @@ static inline bool tty_throttled(struct tty_struct *tty) } #ifdef CONFIG_TTY -extern void console_init(void); extern void tty_kref_put(struct tty_struct *tty); extern struct pid *tty_get_pgrp(struct tty_struct *tty); extern void tty_vhangup_self(void); @@ -402,8 +401,6 @@ extern struct tty_struct *get_current_tty(void); extern int __init tty_init(void); extern const char *tty_name(const struct tty_struct *tty); #else -static inline void console_init(void) -{ } static inline void tty_kref_put(struct tty_struct *tty) { } static inline struct pid *tty_get_pgrp(struct tty_struct *tty) @@ -669,7 +666,11 @@ extern int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p, /* n_tty.c */ extern void n_tty_inherit_ops(struct tty_ldisc_ops *ops); +#ifdef CONFIG_TTY extern void __init n_tty_init(void); +#else +static inline void n_tty_init(void) { } +#endif /* tty_audit.c */ #ifdef CONFIG_AUDIT diff --git a/init/main.c b/init/main.c index b0c11cbf5ddf..cfb7e6821b85 100644 --- a/init/main.c +++ b/init/main.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 2984fb0f0257..3a0940652660 100644 --- 
a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2610,6 +2610,30 @@ int unregister_console(struct console *console) } EXPORT_SYMBOL(unregister_console); +/* + * Initialize the console device. This is called *early*, so + * we can't necessarily depend on lots of kernel help here. + * Just do some early initializations, and do the complex setup + * later. + */ +void __init console_init(void) +{ + initcall_t *call; + + /* Setup the default TTY line discipline. */ + n_tty_init(); + + /* + * set up the console device so that later boot sequences can + * inform about problems etc.. + */ + call = __con_initcall_start; + while (call < __con_initcall_end) { + (*call)(); + call++; + } +} + /* * Some boot consoles access data that is in the init section and which will * be discarded after the initcalls have been run. To make sure that no code -- cgit v1.2.3 From a1235b3eb10086b8420f37bbb6c29436f55940ba Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 12 Apr 2017 18:37:16 -0400 Subject: tty: split job control support into a file of its own This makes it easier for job control to become optional and/or usable independently from tty_io.c, as well as providing a nice purpose separation. No logical changes from this patch. Signed-off-by: Nicolas Pitre Signed-off-by: Greg Kroah-Hartman --- drivers/tty/Makefile | 2 +- drivers/tty/tty_io.c | 547 +-------------------------------------------- drivers/tty/tty_jobctrl.c | 554 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/tty.h | 6 + 4 files changed, 572 insertions(+), 537 deletions(-) create mode 100644 drivers/tty/tty_jobctrl.c (limited to 'include/linux') diff --git a/drivers/tty/Makefile b/drivers/tty/Makefile index 6983b5a49ec3..f02becdb3e33 100644 --- a/drivers/tty/Makefile +++ b/drivers/tty/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_TTY) += tty_io.o n_tty.o tty_ioctl.o tty_ldisc.o \ tty_buffer.o tty_port.o tty_mutex.o \ - tty_ldsem.o tty_baudrate.o + tty_ldsem.o tty_baudrate.o tty_jobctrl.o obj-$(CONFIG_LEGACY_PTYS) += pty.o obj-$(CONFIG_UNIX98_PTYS) += pty.o obj-$(CONFIG_AUDIT) += tty_audit.o diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 8244b6199784..0c150b5a9dd6 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -377,65 +377,6 @@ struct tty_driver *tty_find_polling_driver(char *name, int *line) EXPORT_SYMBOL_GPL(tty_find_polling_driver); #endif -static int is_ignored(int sig) -{ - return (sigismember(¤t->blocked, sig) || - current->sighand->action[sig-1].sa.sa_handler == SIG_IGN); -} - -/** - * tty_check_change - check for POSIX terminal changes - * @tty: tty to check - * - * If we try to write to, or set the state of, a terminal and we're - * not in the foreground, send a SIGTTOU. If the signal is blocked or - * ignored, go ahead and perform the operation. 
(POSIX 7.2) - * - * Locking: ctrl_lock - */ - -int __tty_check_change(struct tty_struct *tty, int sig) -{ - unsigned long flags; - struct pid *pgrp, *tty_pgrp; - int ret = 0; - - if (current->signal->tty != tty) - return 0; - - rcu_read_lock(); - pgrp = task_pgrp(current); - - spin_lock_irqsave(&tty->ctrl_lock, flags); - tty_pgrp = tty->pgrp; - spin_unlock_irqrestore(&tty->ctrl_lock, flags); - - if (tty_pgrp && pgrp != tty->pgrp) { - if (is_ignored(sig)) { - if (sig == SIGTTIN) - ret = -EIO; - } else if (is_current_pgrp_orphaned()) - ret = -EIO; - else { - kill_pgrp(pgrp, sig, 1); - set_thread_flag(TIF_SIGPENDING); - ret = -ERESTARTSYS; - } - } - rcu_read_unlock(); - - if (!tty_pgrp) - tty_warn(tty, "sig=%d, tty->pgrp == NULL!\n", sig); - - return ret; -} - -int tty_check_change(struct tty_struct *tty) -{ - return __tty_check_change(tty, SIGTTOU); -} -EXPORT_SYMBOL(tty_check_change); - static ssize_t hung_up_tty_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -509,79 +450,6 @@ static const struct file_operations hung_up_tty_fops = { static DEFINE_SPINLOCK(redirect_lock); static struct file *redirect; - -void proc_clear_tty(struct task_struct *p) -{ - unsigned long flags; - struct tty_struct *tty; - spin_lock_irqsave(&p->sighand->siglock, flags); - tty = p->signal->tty; - p->signal->tty = NULL; - spin_unlock_irqrestore(&p->sighand->siglock, flags); - tty_kref_put(tty); -} - -/** - * proc_set_tty - set the controlling terminal - * - * Only callable by the session leader and only if it does not already have - * a controlling terminal. - * - * Caller must hold: tty_lock() - * a readlock on tasklist_lock - * sighand lock - */ -static void __proc_set_tty(struct tty_struct *tty) -{ - unsigned long flags; - - spin_lock_irqsave(&tty->ctrl_lock, flags); - /* - * The session and fg pgrp references will be non-NULL if - * tiocsctty() is stealing the controlling tty - */ - put_pid(tty->session); - put_pid(tty->pgrp); - tty->pgrp = get_pid(task_pgrp(current)); - spin_unlock_irqrestore(&tty->ctrl_lock, flags); - tty->session = get_pid(task_session(current)); - if (current->signal->tty) { - tty_debug(tty, "current tty %s not NULL!!\n", - current->signal->tty->name); - tty_kref_put(current->signal->tty); - } - put_pid(current->signal->tty_old_pgrp); - current->signal->tty = tty_kref_get(tty); - current->signal->tty_old_pgrp = NULL; -} - -static void proc_set_tty(struct tty_struct *tty) -{ - spin_lock_irq(¤t->sighand->siglock); - __proc_set_tty(tty); - spin_unlock_irq(¤t->sighand->siglock); -} - -struct tty_struct *get_current_tty(void) -{ - struct tty_struct *tty; - unsigned long flags; - - spin_lock_irqsave(¤t->sighand->siglock, flags); - tty = tty_kref_get(current->signal->tty); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); - return tty; -} -EXPORT_SYMBOL_GPL(get_current_tty); - -static void session_clear_tty(struct pid *session) -{ - struct task_struct *p; - do_each_pid_task(session, PIDTYPE_SID, p) { - proc_clear_tty(p); - } while_each_pid_task(session, PIDTYPE_SID, p); -} - /** * tty_wakeup - request more data * @tty: terminal @@ -608,60 +476,6 @@ void tty_wakeup(struct tty_struct *tty) EXPORT_SYMBOL_GPL(tty_wakeup); -/** - * tty_signal_session_leader - sends SIGHUP to session leader - * @tty controlling tty - * @exit_session if non-zero, signal all foreground group processes - * - * Send SIGHUP and SIGCONT to the session leader and its process group. - * Optionally, signal all processes in the foreground process group. 
- * - * Returns the number of processes in the session with this tty - * as their controlling terminal. This value is used to drop - * tty references for those processes. - */ -static int tty_signal_session_leader(struct tty_struct *tty, int exit_session) -{ - struct task_struct *p; - int refs = 0; - struct pid *tty_pgrp = NULL; - - read_lock(&tasklist_lock); - if (tty->session) { - do_each_pid_task(tty->session, PIDTYPE_SID, p) { - spin_lock_irq(&p->sighand->siglock); - if (p->signal->tty == tty) { - p->signal->tty = NULL; - /* We defer the dereferences outside fo - the tasklist lock */ - refs++; - } - if (!p->signal->leader) { - spin_unlock_irq(&p->sighand->siglock); - continue; - } - __group_send_sig_info(SIGHUP, SEND_SIG_PRIV, p); - __group_send_sig_info(SIGCONT, SEND_SIG_PRIV, p); - put_pid(p->signal->tty_old_pgrp); /* A noop */ - spin_lock(&tty->ctrl_lock); - tty_pgrp = get_pid(tty->pgrp); - if (tty->pgrp) - p->signal->tty_old_pgrp = get_pid(tty->pgrp); - spin_unlock(&tty->ctrl_lock); - spin_unlock_irq(&p->sighand->siglock); - } while_each_pid_task(tty->session, PIDTYPE_SID, p); - } - read_unlock(&tasklist_lock); - - if (tty_pgrp) { - if (exit_session) - kill_pgrp(tty_pgrp, SIGHUP, exit_session); - put_pid(tty_pgrp); - } - - return refs; -} - /** * __tty_hangup - actual handler for hangup events * @work: tty device @@ -840,7 +654,7 @@ void tty_vhangup_self(void) * is complete. That guarantee is necessary for security reasons. */ -static void tty_vhangup_session(struct tty_struct *tty) +void tty_vhangup_session(struct tty_struct *tty) { tty_debug_hangup(tty, "session hangup\n"); __tty_hangup(tty, 1); @@ -861,106 +675,6 @@ int tty_hung_up_p(struct file *filp) EXPORT_SYMBOL(tty_hung_up_p); -/** - * disassociate_ctty - disconnect controlling tty - * @on_exit: true if exiting so need to "hang up" the session - * - * This function is typically called only by the session leader, when - * it wants to disassociate itself from its controlling tty. - * - * It performs the following functions: - * (1) Sends a SIGHUP and SIGCONT to the foreground process group - * (2) Clears the tty from being controlling the session - * (3) Clears the controlling tty for all processes in the - * session group. - * - * The argument on_exit is set to 1 if called when a process is - * exiting; it is 0 if called by the ioctl TIOCNOTTY. - * - * Locking: - * BTM is taken for hysterical raisins, and held when - * called from no_tty(). 
- * tty_mutex is taken to protect tty - * ->siglock is taken to protect ->signal/->sighand - * tasklist_lock is taken to walk process list for sessions - * ->siglock is taken to protect ->signal/->sighand - */ - -void disassociate_ctty(int on_exit) -{ - struct tty_struct *tty; - - if (!current->signal->leader) - return; - - tty = get_current_tty(); - if (tty) { - if (on_exit && tty->driver->type != TTY_DRIVER_TYPE_PTY) { - tty_vhangup_session(tty); - } else { - struct pid *tty_pgrp = tty_get_pgrp(tty); - if (tty_pgrp) { - kill_pgrp(tty_pgrp, SIGHUP, on_exit); - if (!on_exit) - kill_pgrp(tty_pgrp, SIGCONT, on_exit); - put_pid(tty_pgrp); - } - } - tty_kref_put(tty); - - } else if (on_exit) { - struct pid *old_pgrp; - spin_lock_irq(¤t->sighand->siglock); - old_pgrp = current->signal->tty_old_pgrp; - current->signal->tty_old_pgrp = NULL; - spin_unlock_irq(¤t->sighand->siglock); - if (old_pgrp) { - kill_pgrp(old_pgrp, SIGHUP, on_exit); - kill_pgrp(old_pgrp, SIGCONT, on_exit); - put_pid(old_pgrp); - } - return; - } - - spin_lock_irq(¤t->sighand->siglock); - put_pid(current->signal->tty_old_pgrp); - current->signal->tty_old_pgrp = NULL; - - tty = tty_kref_get(current->signal->tty); - if (tty) { - unsigned long flags; - spin_lock_irqsave(&tty->ctrl_lock, flags); - put_pid(tty->session); - put_pid(tty->pgrp); - tty->session = NULL; - tty->pgrp = NULL; - spin_unlock_irqrestore(&tty->ctrl_lock, flags); - tty_kref_put(tty); - } else - tty_debug_hangup(tty, "no current tty\n"); - - spin_unlock_irq(¤t->sighand->siglock); - /* Now clear signal->tty under the lock */ - read_lock(&tasklist_lock); - session_clear_tty(task_session(current)); - read_unlock(&tasklist_lock); -} - -/** - * - * no_tty - Ensure the current process does not have a controlling tty - */ -void no_tty(void) -{ - /* FIXME: Review locking here. The tty_lock never covered any race - between a new association and proc_clear_tty but possible we need - to protect against this anyway */ - struct task_struct *tsk = current; - disassociate_ctty(0); - proc_clear_tty(tsk); -} - - /** * stop_tty - propagate flow control * @tty: tty to stop @@ -2163,38 +1877,13 @@ retry_open: } clear_bit(TTY_HUPPED, &tty->flags); - - read_lock(&tasklist_lock); - spin_lock_irq(¤t->sighand->siglock); noctty = (filp->f_flags & O_NOCTTY) || - (IS_ENABLED(CONFIG_VT) && device == MKDEV(TTY_MAJOR, 0)) || - device == MKDEV(TTYAUX_MAJOR, 1) || - (tty->driver->type == TTY_DRIVER_TYPE_PTY && - tty->driver->subtype == PTY_TYPE_MASTER); - - if (!noctty && - current->signal->leader && - !current->signal->tty && - tty->session == NULL) { - /* - * Don't let a process that only has write access to the tty - * obtain the privileges associated with having a tty as - * controlling terminal (being able to reopen it with full - * access through /dev/tty, being able to perform pushback). - * Many distributions set the group of all ttys to "tty" and - * grant write-only access to all terminals for setgid tty - * binaries, which should not imply full privileges on all ttys. - * - * This could theoretically break old code that performs open() - * on a write-only file descriptor. In that case, it might be - * necessary to also permit this if - * inode_permission(inode, MAY_READ) == 0. 
- */ - if (filp->f_mode & FMODE_READ) - __proc_set_tty(tty); - } - spin_unlock_irq(¤t->sighand->siglock); - read_unlock(&tasklist_lock); + (IS_ENABLED(CONFIG_VT) && device == MKDEV(TTY_MAJOR, 0)) || + device == MKDEV(TTYAUX_MAJOR, 1) || + (tty->driver->type == TTY_DRIVER_TYPE_PTY && + tty->driver->subtype == PTY_TYPE_MASTER); + if (!noctty) + tty_open_proc_set_tty(filp, tty); tty_unlock(tty); return 0; } @@ -2456,211 +2145,6 @@ static int fionbio(struct file *file, int __user *p) return 0; } -/** - * tiocsctty - set controlling tty - * @tty: tty structure - * @arg: user argument - * - * This ioctl is used to manage job control. It permits a session - * leader to set this tty as the controlling tty for the session. - * - * Locking: - * Takes tty_lock() to serialize proc_set_tty() for this tty - * Takes tasklist_lock internally to walk sessions - * Takes ->siglock() when updating signal->tty - */ - -static int tiocsctty(struct tty_struct *tty, struct file *file, int arg) -{ - int ret = 0; - - tty_lock(tty); - read_lock(&tasklist_lock); - - if (current->signal->leader && (task_session(current) == tty->session)) - goto unlock; - - /* - * The process must be a session leader and - * not have a controlling tty already. - */ - if (!current->signal->leader || current->signal->tty) { - ret = -EPERM; - goto unlock; - } - - if (tty->session) { - /* - * This tty is already the controlling - * tty for another session group! - */ - if (arg == 1 && capable(CAP_SYS_ADMIN)) { - /* - * Steal it away - */ - session_clear_tty(tty->session); - } else { - ret = -EPERM; - goto unlock; - } - } - - /* See the comment in tty_open(). */ - if ((file->f_mode & FMODE_READ) == 0 && !capable(CAP_SYS_ADMIN)) { - ret = -EPERM; - goto unlock; - } - - proc_set_tty(tty); -unlock: - read_unlock(&tasklist_lock); - tty_unlock(tty); - return ret; -} - -/** - * tty_get_pgrp - return a ref counted pgrp pid - * @tty: tty to read - * - * Returns a refcounted instance of the pid struct for the process - * group controlling the tty. - */ - -struct pid *tty_get_pgrp(struct tty_struct *tty) -{ - unsigned long flags; - struct pid *pgrp; - - spin_lock_irqsave(&tty->ctrl_lock, flags); - pgrp = get_pid(tty->pgrp); - spin_unlock_irqrestore(&tty->ctrl_lock, flags); - - return pgrp; -} -EXPORT_SYMBOL_GPL(tty_get_pgrp); - -/* - * This checks not only the pgrp, but falls back on the pid if no - * satisfactory pgrp is found. I dunno - gdb doesn't work correctly - * without this... - * - * The caller must hold rcu lock or the tasklist lock. - */ -static struct pid *session_of_pgrp(struct pid *pgrp) -{ - struct task_struct *p; - struct pid *sid = NULL; - - p = pid_task(pgrp, PIDTYPE_PGID); - if (p == NULL) - p = pid_task(pgrp, PIDTYPE_PID); - if (p != NULL) - sid = task_session(p); - - return sid; -} - -/** - * tiocgpgrp - get process group - * @tty: tty passed by user - * @real_tty: tty side of the tty passed by the user if a pty else the tty - * @p: returned pid - * - * Obtain the process group of the tty. If there is no process group - * return an error. - * - * Locking: none. Reference to current->signal->tty is safe. - */ - -static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) -{ - struct pid *pid; - int ret; - /* - * (tty == real_tty) is a cheap way of - * testing if the tty is NOT a master pty. 
- */ - if (tty == real_tty && current->signal->tty != real_tty) - return -ENOTTY; - pid = tty_get_pgrp(real_tty); - ret = put_user(pid_vnr(pid), p); - put_pid(pid); - return ret; -} - -/** - * tiocspgrp - attempt to set process group - * @tty: tty passed by user - * @real_tty: tty side device matching tty passed by user - * @p: pid pointer - * - * Set the process group of the tty to the session passed. Only - * permitted where the tty session is our session. - * - * Locking: RCU, ctrl lock - */ - -static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) -{ - struct pid *pgrp; - pid_t pgrp_nr; - int retval = tty_check_change(real_tty); - - if (retval == -EIO) - return -ENOTTY; - if (retval) - return retval; - if (!current->signal->tty || - (current->signal->tty != real_tty) || - (real_tty->session != task_session(current))) - return -ENOTTY; - if (get_user(pgrp_nr, p)) - return -EFAULT; - if (pgrp_nr < 0) - return -EINVAL; - rcu_read_lock(); - pgrp = find_vpid(pgrp_nr); - retval = -ESRCH; - if (!pgrp) - goto out_unlock; - retval = -EPERM; - if (session_of_pgrp(pgrp) != task_session(current)) - goto out_unlock; - retval = 0; - spin_lock_irq(&tty->ctrl_lock); - put_pid(real_tty->pgrp); - real_tty->pgrp = get_pid(pgrp); - spin_unlock_irq(&tty->ctrl_lock); -out_unlock: - rcu_read_unlock(); - return retval; -} - -/** - * tiocgsid - get session id - * @tty: tty passed by user - * @real_tty: tty side of the tty passed by the user if a pty else the tty - * @p: pointer to returned session id - * - * Obtain the session id of the tty. If there is no session - * return an error. - * - * Locking: none. Reference to current->signal->tty is safe. - */ - -static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) -{ - /* - * (tty == real_tty) is a cheap way of - * testing if the tty is NOT a master pty. 
- */ - if (tty == real_tty && current->signal->tty != real_tty) - return -ENOTTY; - if (!real_tty->session) - return -ENOTTY; - return put_user(pid_vnr(real_tty->session), p); -} - /** * tiocsetd - set line discipline * @tty: tty device @@ -2920,19 +2404,6 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) int excl = test_bit(TTY_EXCLUSIVE, &tty->flags); return put_user(excl, (int __user *)p); } - case TIOCNOTTY: - if (current->signal->tty != tty) - return -ENOTTY; - no_tty(); - return 0; - case TIOCSCTTY: - return tiocsctty(real_tty, file, arg); - case TIOCGPGRP: - return tiocgpgrp(tty, real_tty, p); - case TIOCSPGRP: - return tiocspgrp(tty, real_tty, p); - case TIOCGSID: - return tiocgsid(tty, real_tty, p); case TIOCGETD: return tiocgetd(tty, p); case TIOCSETD: @@ -2993,6 +2464,10 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case TIOCSSERIAL: tty_warn_deprecated_flags(p); break; + default: + retval = tty_jobctrl_ioctl(tty, real_tty, file, cmd, arg); + if (retval != -ENOIOCTLCMD) + return retval; } if (tty->ops->ioctl) { retval = tty->ops->ioctl(tty, cmd, arg); diff --git a/drivers/tty/tty_jobctrl.c b/drivers/tty/tty_jobctrl.c new file mode 100644 index 000000000000..e7032309ee87 --- /dev/null +++ b/drivers/tty/tty_jobctrl.c @@ -0,0 +1,554 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static int is_ignored(int sig) +{ + return (sigismember(¤t->blocked, sig) || + current->sighand->action[sig-1].sa.sa_handler == SIG_IGN); +} + +/** + * tty_check_change - check for POSIX terminal changes + * @tty: tty to check + * + * If we try to write to, or set the state of, a terminal and we're + * not in the foreground, send a SIGTTOU. If the signal is blocked or + * ignored, go ahead and perform the operation. (POSIX 7.2) + * + * Locking: ctrl_lock + */ +int __tty_check_change(struct tty_struct *tty, int sig) +{ + unsigned long flags; + struct pid *pgrp, *tty_pgrp; + int ret = 0; + + if (current->signal->tty != tty) + return 0; + + rcu_read_lock(); + pgrp = task_pgrp(current); + + spin_lock_irqsave(&tty->ctrl_lock, flags); + tty_pgrp = tty->pgrp; + spin_unlock_irqrestore(&tty->ctrl_lock, flags); + + if (tty_pgrp && pgrp != tty->pgrp) { + if (is_ignored(sig)) { + if (sig == SIGTTIN) + ret = -EIO; + } else if (is_current_pgrp_orphaned()) + ret = -EIO; + else { + kill_pgrp(pgrp, sig, 1); + set_thread_flag(TIF_SIGPENDING); + ret = -ERESTARTSYS; + } + } + rcu_read_unlock(); + + if (!tty_pgrp) + tty_warn(tty, "sig=%d, tty->pgrp == NULL!\n", sig); + + return ret; +} + +int tty_check_change(struct tty_struct *tty) +{ + return __tty_check_change(tty, SIGTTOU); +} +EXPORT_SYMBOL(tty_check_change); + +void proc_clear_tty(struct task_struct *p) +{ + unsigned long flags; + struct tty_struct *tty; + spin_lock_irqsave(&p->sighand->siglock, flags); + tty = p->signal->tty; + p->signal->tty = NULL; + spin_unlock_irqrestore(&p->sighand->siglock, flags); + tty_kref_put(tty); +} + +/** + * proc_set_tty - set the controlling terminal + * + * Only callable by the session leader and only if it does not already have + * a controlling terminal. 
+ * + * Caller must hold: tty_lock() + * a readlock on tasklist_lock + * sighand lock + */ +static void __proc_set_tty(struct tty_struct *tty) +{ + unsigned long flags; + + spin_lock_irqsave(&tty->ctrl_lock, flags); + /* + * The session and fg pgrp references will be non-NULL if + * tiocsctty() is stealing the controlling tty + */ + put_pid(tty->session); + put_pid(tty->pgrp); + tty->pgrp = get_pid(task_pgrp(current)); + spin_unlock_irqrestore(&tty->ctrl_lock, flags); + tty->session = get_pid(task_session(current)); + if (current->signal->tty) { + tty_debug(tty, "current tty %s not NULL!!\n", + current->signal->tty->name); + tty_kref_put(current->signal->tty); + } + put_pid(current->signal->tty_old_pgrp); + current->signal->tty = tty_kref_get(tty); + current->signal->tty_old_pgrp = NULL; +} + +static void proc_set_tty(struct tty_struct *tty) +{ + spin_lock_irq(¤t->sighand->siglock); + __proc_set_tty(tty); + spin_unlock_irq(¤t->sighand->siglock); +} + +/* + * Called by tty_open() to set the controlling tty if applicable. + */ +void tty_open_proc_set_tty(struct file *filp, struct tty_struct *tty) +{ + read_lock(&tasklist_lock); + spin_lock_irq(¤t->sighand->siglock); + if (current->signal->leader && + !current->signal->tty && + tty->session == NULL) { + /* + * Don't let a process that only has write access to the tty + * obtain the privileges associated with having a tty as + * controlling terminal (being able to reopen it with full + * access through /dev/tty, being able to perform pushback). + * Many distributions set the group of all ttys to "tty" and + * grant write-only access to all terminals for setgid tty + * binaries, which should not imply full privileges on all ttys. + * + * This could theoretically break old code that performs open() + * on a write-only file descriptor. In that case, it might be + * necessary to also permit this if + * inode_permission(inode, MAY_READ) == 0. + */ + if (filp->f_mode & FMODE_READ) + __proc_set_tty(tty); + } + spin_unlock_irq(¤t->sighand->siglock); + read_unlock(&tasklist_lock); +} + +struct tty_struct *get_current_tty(void) +{ + struct tty_struct *tty; + unsigned long flags; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + tty = tty_kref_get(current->signal->tty); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + return tty; +} +EXPORT_SYMBOL_GPL(get_current_tty); + +/* + * Called from tty_release(). + */ +void session_clear_tty(struct pid *session) +{ + struct task_struct *p; + do_each_pid_task(session, PIDTYPE_SID, p) { + proc_clear_tty(p); + } while_each_pid_task(session, PIDTYPE_SID, p); +} + +/** + * tty_signal_session_leader - sends SIGHUP to session leader + * @tty controlling tty + * @exit_session if non-zero, signal all foreground group processes + * + * Send SIGHUP and SIGCONT to the session leader and its process group. + * Optionally, signal all processes in the foreground process group. + * + * Returns the number of processes in the session with this tty + * as their controlling terminal. This value is used to drop + * tty references for those processes. 
+ */ +int tty_signal_session_leader(struct tty_struct *tty, int exit_session) +{ + struct task_struct *p; + int refs = 0; + struct pid *tty_pgrp = NULL; + + read_lock(&tasklist_lock); + if (tty->session) { + do_each_pid_task(tty->session, PIDTYPE_SID, p) { + spin_lock_irq(&p->sighand->siglock); + if (p->signal->tty == tty) { + p->signal->tty = NULL; + /* We defer the dereferences outside of + the tasklist lock */ + refs++; + } + if (!p->signal->leader) { + spin_unlock_irq(&p->sighand->siglock); + continue; + } + __group_send_sig_info(SIGHUP, SEND_SIG_PRIV, p); + __group_send_sig_info(SIGCONT, SEND_SIG_PRIV, p); + put_pid(p->signal->tty_old_pgrp); /* A noop */ + spin_lock(&tty->ctrl_lock); + tty_pgrp = get_pid(tty->pgrp); + if (tty->pgrp) + p->signal->tty_old_pgrp = get_pid(tty->pgrp); + spin_unlock(&tty->ctrl_lock); + spin_unlock_irq(&p->sighand->siglock); + } while_each_pid_task(tty->session, PIDTYPE_SID, p); + } + read_unlock(&tasklist_lock); + + if (tty_pgrp) { + if (exit_session) + kill_pgrp(tty_pgrp, SIGHUP, exit_session); + put_pid(tty_pgrp); + } + + return refs; +} + +/** + * disassociate_ctty - disconnect controlling tty + * @on_exit: true if exiting so need to "hang up" the session + * + * This function is typically called only by the session leader, when + * it wants to disassociate itself from its controlling tty. + * + * It performs the following functions: + * (1) Sends a SIGHUP and SIGCONT to the foreground process group + * (2) Clears the tty from being controlling the session + * (3) Clears the controlling tty for all processes in the + * session group. + * + * The argument on_exit is set to 1 if called when a process is + * exiting; it is 0 if called by the ioctl TIOCNOTTY. + * + * Locking: + * BTM is taken for hysterical raisins, and held when + * called from no_tty().
+ * tty_mutex is taken to protect tty + * ->siglock is taken to protect ->signal/->sighand + * tasklist_lock is taken to walk process list for sessions + * ->siglock is taken to protect ->signal/->sighand + */ +void disassociate_ctty(int on_exit) +{ + struct tty_struct *tty; + + if (!current->signal->leader) + return; + + tty = get_current_tty(); + if (tty) { + if (on_exit && tty->driver->type != TTY_DRIVER_TYPE_PTY) { + tty_vhangup_session(tty); + } else { + struct pid *tty_pgrp = tty_get_pgrp(tty); + if (tty_pgrp) { + kill_pgrp(tty_pgrp, SIGHUP, on_exit); + if (!on_exit) + kill_pgrp(tty_pgrp, SIGCONT, on_exit); + put_pid(tty_pgrp); + } + } + tty_kref_put(tty); + + } else if (on_exit) { + struct pid *old_pgrp; + spin_lock_irq(¤t->sighand->siglock); + old_pgrp = current->signal->tty_old_pgrp; + current->signal->tty_old_pgrp = NULL; + spin_unlock_irq(¤t->sighand->siglock); + if (old_pgrp) { + kill_pgrp(old_pgrp, SIGHUP, on_exit); + kill_pgrp(old_pgrp, SIGCONT, on_exit); + put_pid(old_pgrp); + } + return; + } + + spin_lock_irq(¤t->sighand->siglock); + put_pid(current->signal->tty_old_pgrp); + current->signal->tty_old_pgrp = NULL; + + tty = tty_kref_get(current->signal->tty); + if (tty) { + unsigned long flags; + spin_lock_irqsave(&tty->ctrl_lock, flags); + put_pid(tty->session); + put_pid(tty->pgrp); + tty->session = NULL; + tty->pgrp = NULL; + spin_unlock_irqrestore(&tty->ctrl_lock, flags); + tty_kref_put(tty); + } + + spin_unlock_irq(¤t->sighand->siglock); + /* Now clear signal->tty under the lock */ + read_lock(&tasklist_lock); + session_clear_tty(task_session(current)); + read_unlock(&tasklist_lock); +} + +/** + * + * no_tty - Ensure the current process does not have a controlling tty + */ +void no_tty(void) +{ + /* FIXME: Review locking here. The tty_lock never covered any race + between a new association and proc_clear_tty but possible we need + to protect against this anyway */ + struct task_struct *tsk = current; + disassociate_ctty(0); + proc_clear_tty(tsk); +} + +/** + * tiocsctty - set controlling tty + * @tty: tty structure + * @arg: user argument + * + * This ioctl is used to manage job control. It permits a session + * leader to set this tty as the controlling tty for the session. + * + * Locking: + * Takes tty_lock() to serialize proc_set_tty() for this tty + * Takes tasklist_lock internally to walk sessions + * Takes ->siglock() when updating signal->tty + */ +static int tiocsctty(struct tty_struct *tty, struct file *file, int arg) +{ + int ret = 0; + + tty_lock(tty); + read_lock(&tasklist_lock); + + if (current->signal->leader && (task_session(current) == tty->session)) + goto unlock; + + /* + * The process must be a session leader and + * not have a controlling tty already. + */ + if (!current->signal->leader || current->signal->tty) { + ret = -EPERM; + goto unlock; + } + + if (tty->session) { + /* + * This tty is already the controlling + * tty for another session group! + */ + if (arg == 1 && capable(CAP_SYS_ADMIN)) { + /* + * Steal it away + */ + session_clear_tty(tty->session); + } else { + ret = -EPERM; + goto unlock; + } + } + + /* See the comment in tty_open_proc_set_tty(). */ + if ((file->f_mode & FMODE_READ) == 0 && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto unlock; + } + + proc_set_tty(tty); +unlock: + read_unlock(&tasklist_lock); + tty_unlock(tty); + return ret; +} + +/** + * tty_get_pgrp - return a ref counted pgrp pid + * @tty: tty to read + * + * Returns a refcounted instance of the pid struct for the process + * group controlling the tty. 
+ */ +struct pid *tty_get_pgrp(struct tty_struct *tty) +{ + unsigned long flags; + struct pid *pgrp; + + spin_lock_irqsave(&tty->ctrl_lock, flags); + pgrp = get_pid(tty->pgrp); + spin_unlock_irqrestore(&tty->ctrl_lock, flags); + + return pgrp; +} +EXPORT_SYMBOL_GPL(tty_get_pgrp); + +/* + * This checks not only the pgrp, but falls back on the pid if no + * satisfactory pgrp is found. I dunno - gdb doesn't work correctly + * without this... + * + * The caller must hold rcu lock or the tasklist lock. + */ +static struct pid *session_of_pgrp(struct pid *pgrp) +{ + struct task_struct *p; + struct pid *sid = NULL; + + p = pid_task(pgrp, PIDTYPE_PGID); + if (p == NULL) + p = pid_task(pgrp, PIDTYPE_PID); + if (p != NULL) + sid = task_session(p); + + return sid; +} + +/** + * tiocgpgrp - get process group + * @tty: tty passed by user + * @real_tty: tty side of the tty passed by the user if a pty else the tty + * @p: returned pid + * + * Obtain the process group of the tty. If there is no process group + * return an error. + * + * Locking: none. Reference to current->signal->tty is safe. + */ +static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) +{ + struct pid *pid; + int ret; + /* + * (tty == real_tty) is a cheap way of + * testing if the tty is NOT a master pty. + */ + if (tty == real_tty && current->signal->tty != real_tty) + return -ENOTTY; + pid = tty_get_pgrp(real_tty); + ret = put_user(pid_vnr(pid), p); + put_pid(pid); + return ret; +} + +/** + * tiocspgrp - attempt to set process group + * @tty: tty passed by user + * @real_tty: tty side device matching tty passed by user + * @p: pid pointer + * + * Set the process group of the tty to the session passed. Only + * permitted where the tty session is our session. + * + * Locking: RCU, ctrl lock + */ +static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) +{ + struct pid *pgrp; + pid_t pgrp_nr; + int retval = tty_check_change(real_tty); + + if (retval == -EIO) + return -ENOTTY; + if (retval) + return retval; + if (!current->signal->tty || + (current->signal->tty != real_tty) || + (real_tty->session != task_session(current))) + return -ENOTTY; + if (get_user(pgrp_nr, p)) + return -EFAULT; + if (pgrp_nr < 0) + return -EINVAL; + rcu_read_lock(); + pgrp = find_vpid(pgrp_nr); + retval = -ESRCH; + if (!pgrp) + goto out_unlock; + retval = -EPERM; + if (session_of_pgrp(pgrp) != task_session(current)) + goto out_unlock; + retval = 0; + spin_lock_irq(&tty->ctrl_lock); + put_pid(real_tty->pgrp); + real_tty->pgrp = get_pid(pgrp); + spin_unlock_irq(&tty->ctrl_lock); +out_unlock: + rcu_read_unlock(); + return retval; +} + +/** + * tiocgsid - get session id + * @tty: tty passed by user + * @real_tty: tty side of the tty passed by the user if a pty else the tty + * @p: pointer to returned session id + * + * Obtain the session id of the tty. If there is no session + * return an error. + * + * Locking: none. Reference to current->signal->tty is safe. + */ +static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) +{ + /* + * (tty == real_tty) is a cheap way of + * testing if the tty is NOT a master pty. + */ + if (tty == real_tty && current->signal->tty != real_tty) + return -ENOTTY; + if (!real_tty->session) + return -ENOTTY; + return put_user(pid_vnr(real_tty->session), p); +} + +/* + * Called from tty_ioctl(). If tty is a pty then real_tty is the slave side, + * if not then tty == real_tty. 
+ */ +long tty_jobctrl_ioctl(struct tty_struct *tty, struct tty_struct *real_tty, + struct file *file, unsigned int cmd, unsigned long arg) +{ + void __user *p = (void __user *)arg; + + switch (cmd) { + case TIOCNOTTY: + if (current->signal->tty != tty) + return -ENOTTY; + no_tty(); + return 0; + case TIOCSCTTY: + return tiocsctty(real_tty, file, arg); + case TIOCGPGRP: + return tiocgpgrp(tty, real_tty, p); + case TIOCSPGRP: + return tiocspgrp(tty, real_tty, p); + case TIOCGSID: + return tiocgsid(tty, real_tty, p); + } + return -ENOIOCTLCMD; +} diff --git a/include/linux/tty.h b/include/linux/tty.h index f1106d7c73b6..d07cd2105a6c 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -475,9 +475,13 @@ extern int tty_do_resize(struct tty_struct *tty, struct winsize *ws); extern int is_current_pgrp_orphaned(void); extern void tty_hangup(struct tty_struct *tty); extern void tty_vhangup(struct tty_struct *tty); +extern void tty_vhangup_session(struct tty_struct *tty); extern int tty_hung_up_p(struct file *filp); extern void do_SAK(struct tty_struct *tty); extern void __do_SAK(struct tty_struct *tty); +extern void tty_open_proc_set_tty(struct file *filp, struct tty_struct *tty); +extern int tty_signal_session_leader(struct tty_struct *tty, int exit_session); +extern void session_clear_tty(struct pid *session); extern void no_tty(void); extern void tty_buffer_free_all(struct tty_port *port); extern void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld); @@ -525,6 +529,8 @@ extern void tty_ldisc_flush(struct tty_struct *tty); extern long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg); extern int tty_mode_ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg); +extern long tty_jobctrl_ioctl(struct tty_struct *tty, struct tty_struct *real_tty, + struct file *file, unsigned int cmd, unsigned long arg); extern int tty_perform_flush(struct tty_struct *tty, unsigned long arg); extern void tty_default_fops(struct file_operations *fops); extern struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx); -- cgit v1.2.3 From 5ef1ecf060f28ecef313b5723f1fd39bf5a35f56 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 29 Mar 2017 20:54:37 +0200 Subject: mmc: sdio: fix alignment issue in struct sdio_func Certain 64-bit systems (e.g. Amlogic Meson GX) require buffers to be used for DMA to be 8-byte-aligned. struct sdio_func has an embedded small DMA buffer not meeting this requirement. When testing switching to descriptor chain mode in meson-gx driver SDIO is broken therefore. Fix this by allocating the small DMA buffer separately as kmalloc ensures that the returned memory area is properly aligned for every basic data type. Signed-off-by: Heiner Kallweit Tested-by: Helmut Klein Signed-off-by: Ulf Hansson --- drivers/mmc/core/sdio_bus.c | 12 +++++++++++- include/linux/mmc/sdio_func.h | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c index e992a7f8a16f..2b32b88949ba 100644 --- a/drivers/mmc/core/sdio_bus.c +++ b/drivers/mmc/core/sdio_bus.c @@ -267,7 +267,7 @@ static void sdio_release_func(struct device *dev) sdio_free_func_cis(func); kfree(func->info); - + kfree(func->tmpbuf); kfree(func); } @@ -282,6 +282,16 @@ struct sdio_func *sdio_alloc_func(struct mmc_card *card) if (!func) return ERR_PTR(-ENOMEM); + /* + * allocate buffer separately to make sure it's properly aligned for + * DMA usage (incl. 
64 bit DMA) + */ + func->tmpbuf = kmalloc(4, GFP_KERNEL); + if (!func->tmpbuf) { + kfree(func); + return ERR_PTR(-ENOMEM); + } + func->card = card; device_initialize(&func->dev); diff --git a/include/linux/mmc/sdio_func.h b/include/linux/mmc/sdio_func.h index aab032a6ae61..97ca105347a6 100644 --- a/include/linux/mmc/sdio_func.h +++ b/include/linux/mmc/sdio_func.h @@ -53,7 +53,7 @@ struct sdio_func { unsigned int state; /* function state */ #define SDIO_STATE_PRESENT (1<<0) /* present in sysfs */ - u8 tmpbuf[4]; /* DMA:able scratch buffer */ + u8 *tmpbuf; /* DMA:able scratch buffer */ unsigned num_info; /* number of info strings */ const char **info; /* info strings */ -- cgit v1.2.3 From 5f8ddeab10ce45d3d3de8ae7ea8811512845c497 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 16 Apr 2017 02:55:09 +0200 Subject: rhashtable: remove insecure_elasticity commit 83e7e4ce9e93c3 ("mac80211: Use rhltable instead of rhashtable") removed the last user that made use of 'insecure_elasticity' parameter, i.e. the default of 16 is used everywhere. Replace it with a constant. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 21 ++++++++++++++++----- lib/rhashtable.c | 17 +---------------- 2 files changed, 17 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index e507290cd2c7..ae87dcdf52d2 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -49,6 +49,21 @@ /* Base bits plus 1 bit for nulls marker */ #define RHT_HASH_RESERVED_SPACE (RHT_BASE_BITS + 1) +/* Maximum chain length before rehash + * + * The maximum (not average) chain length grows with the size of the hash + * table, at a rate of (log N)/(log log N). + * + * The value of 16 is selected so that even if the hash table grew to + * 2^32 you would not expect the maximum chain length to exceed it + * unless we are under attack (or extremely unlucky). + * + * As this limit is only to detect attacks, we don't need to set it to a + * lower value as you'd need the chain length to vastly exceed 16 to have + * any real effect on the system. 
+ */ +#define RHT_ELASTICITY 16u + struct rhash_head { struct rhash_head __rcu *next; }; @@ -114,7 +129,6 @@ struct rhashtable; * @max_size: Maximum size while expanding * @min_size: Minimum size while shrinking * @nulls_base: Base value to generate nulls marker - * @insecure_elasticity: Set to true to disable chain length checks * @automatic_shrinking: Enable automatic shrinking of tables * @locks_mul: Number of bucket locks to allocate per cpu (default: 128) * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) @@ -130,7 +144,6 @@ struct rhashtable_params { unsigned int max_size; unsigned int min_size; u32 nulls_base; - bool insecure_elasticity; bool automatic_shrinking; size_t locks_mul; rht_hashfn_t hashfn; @@ -143,7 +156,6 @@ struct rhashtable_params { * @tbl: Bucket table * @nelems: Number of elements in table * @key_len: Key length for hashfn - * @elasticity: Maximum chain length before rehash * @p: Configuration parameters * @rhlist: True if this is an rhltable * @run_work: Deferred worker to expand/shrink asynchronously @@ -154,7 +166,6 @@ struct rhashtable { struct bucket_table __rcu *tbl; atomic_t nelems; unsigned int key_len; - unsigned int elasticity; struct rhashtable_params p; bool rhlist; struct work_struct run_work; @@ -726,7 +737,7 @@ slow_path: return rhashtable_insert_slow(ht, key, obj); } - elasticity = ht->elasticity; + elasticity = RHT_ELASTICITY; pprev = rht_bucket_insert(ht, tbl, hash); data = ERR_PTR(-ENOMEM); if (!pprev) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index f8635fd57442..d22a5ef109fb 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -535,7 +535,7 @@ static void *rhashtable_lookup_one(struct rhashtable *ht, struct rhash_head *head; int elasticity; - elasticity = ht->elasticity; + elasticity = RHT_ELASTICITY; pprev = rht_bucket_var(tbl, hash); rht_for_each_continue(head, *pprev, tbl, hash) { struct rhlist_head *list; @@ -972,21 +972,6 @@ int rhashtable_init(struct rhashtable *ht, if (params->nelem_hint) size = rounded_hashtable_size(&ht->p); - /* The maximum (not average) chain length grows with the - * size of the hash table, at a rate of (log N)/(log log N). - * The value of 16 is selected so that even if the hash - * table grew to 2^32 you would not expect the maximum - * chain length to exceed it unless we are under attack - * (or extremely unlucky). - * - * As this limit is only to detect attacks, we don't need - * to set it to a lower value as you'd need the chain - * length to vastly exceed 16 to have any real effect - * on the system. - */ - if (!params->insecure_elasticity) - ht->elasticity = 16; - if (params->locks_mul) ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); else -- cgit v1.2.3 From ec19b85913486993d7d6f747beed1a711afd47d8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 31 Mar 2017 19:01:14 -0400 Subject: ftrace: Move the probe function into the tracing directory As nothing outside the tracing directory uses the function probes mechanism, I'm moving the prototypes out of the include/linux/ftrace.h and into the local kernel/trace/trace.h header. I plan on making them hook to the trace_array structure which is local to kernel/trace, and I do not want to expose it to the rest of the kernel. This requires that the probe functions must also be local to tracing. But luckily nothing else uses them. 
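For readers coming to this from outside the tracing code, here is a minimal sketch of how the prototypes being moved fit together. The probe function, ops instance, counter, and "vfs_*" glob below are invented for illustration; only the prototypes carried by this patch are assumed:

	static void my_probe_func(unsigned long ip, unsigned long parent_ip,
				  void **data)
	{
		unsigned long *hits = *data;	/* pointer handed in at registration */

		(*hits)++;	/* count every hit on a matched function */
	}

	static struct ftrace_probe_ops my_probe_ops = {
		.func	= my_probe_func,	/* .init/.free/.print are optional */
	};

	static unsigned long my_hits;

	static int __init my_probe_init(void)
	{
		/* attach the probe to every traced function matching "vfs_*" */
		int ret = register_ftrace_function_probe("vfs_*", &my_probe_ops,
							 &my_hits);

		return ret < 0 ? ret : 0;	/* >= 0: number of functions hooked */
	}

	static void __exit my_probe_exit(void)
	{
		unregister_ftrace_function_probe("vfs_*", &my_probe_ops, &my_hits);
	}

After this patch the same calls remain available, but only to code living under kernel/trace/.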
Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 24 ------------------------ kernel/trace/trace.h | 25 +++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 06b2990a35e4..3e790ff1c501 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -360,30 +360,6 @@ void ftrace_bug(int err, struct dyn_ftrace *rec); struct seq_file; -struct ftrace_probe_ops { - void (*func)(unsigned long ip, - unsigned long parent_ip, - void **data); - int (*init)(struct ftrace_probe_ops *ops, - unsigned long ip, void **data); - void (*free)(struct ftrace_probe_ops *ops, - unsigned long ip, void **data); - int (*print)(struct seq_file *m, - unsigned long ip, - struct ftrace_probe_ops *ops, - void *data); -}; - -extern int -register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data); -extern void -unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data); -extern void -unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); -extern void unregister_ftrace_function_probe_all(char *glob); - extern int ftrace_text_reserved(const void *start, const void *end); extern int ftrace_nr_registered_ops(void); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 31a4997b67c6..2ff6d49fa5ca 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -923,6 +923,31 @@ static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { #endif /* CONFIG_FUNCTION_TRACER */ #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) + +struct ftrace_probe_ops { + void (*func)(unsigned long ip, + unsigned long parent_ip, + void **data); + int (*init)(struct ftrace_probe_ops *ops, + unsigned long ip, void **data); + void (*free)(struct ftrace_probe_ops *ops, + unsigned long ip, void **data); + int (*print)(struct seq_file *m, + unsigned long ip, + struct ftrace_probe_ops *ops, + void *data); +}; + +extern int +register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, + void *data); +extern void +unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, + void *data); +extern void +unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); +extern void unregister_ftrace_function_probe_all(char *glob); + void ftrace_create_filter_files(struct ftrace_ops *ops, struct dentry *parent); void ftrace_destroy_filter_files(struct ftrace_ops *ops); -- cgit v1.2.3 From ae749c7ab475de2c9c427058db19921c91846e89 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 12 Apr 2017 13:25:54 +0100 Subject: PCI: Add arch_can_pci_mmap_wc() macro Most of the almost-identical versions of pci_mmap_page_range() silently ignore the 'write_combine' argument and give uncached mappings. Yet we allow the PCIIOC_WRITE_COMBINE ioctl in /proc/bus/pci, expose the 'resourceX_wc' file in sysfs, and allow an attempted mapping to apparently succeed. To fix this, introduce a macro arch_can_pci_mmap_wc() which indicates whether the platform can do a write-combining mapping. On x86 this ends up being pat_enabled(), while the few other platforms that support it can just set it to a literal '1'. 
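The resulting per-architecture pattern, summarized from the hunks below (nothing beyond this patch is assumed):

	/* ia64, powerpc: write-combining always works */
	#define HAVE_PCI_MMAP
	#define arch_can_pci_mmap_wc()	1

	/* x86: write-combining is only safe when PAT is enabled at runtime */
	#define arch_can_pci_mmap_wc()	pat_enabled()

	/* generic fallback in <linux/pci.h> for everyone else */
	#ifndef arch_can_pci_mmap_wc
	#define arch_can_pci_mmap_wc()	0
	#endif

Callers such as pci_create_resource_files() and the PCIIOC_WRITE_COMBINE ioctl handler then check arch_can_pci_mmap_wc() before offering a write-combined mapping, instead of silently handing back an uncached one.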
Signed-off-by: David Woodhouse Signed-off-by: Bjorn Helgaas --- Documentation/filesystems/sysfs-pci.txt | 4 ++++ arch/ia64/include/asm/pci.h | 2 ++ arch/powerpc/include/asm/pci.h | 5 +++-- arch/x86/include/asm/pci.h | 2 ++ drivers/pci/pci-sysfs.c | 4 ++-- drivers/pci/proc.c | 15 ++++++++------- include/linux/pci.h | 4 ++++ 7 files changed, 25 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt index 6ea1ceda6f52..25b7f1c1b868 100644 --- a/Documentation/filesystems/sysfs-pci.txt +++ b/Documentation/filesystems/sysfs-pci.txt @@ -117,6 +117,10 @@ code must define HAVE_PCI_MMAP and provide a pci_mmap_page_range function. Platforms are free to only support subsets of the mmap functionality, but useful return codes should be provided. +Platforms which support write-combining maps of PCI resources must define +arch_can_pci_mmap_wc() which shall evaluate to non-zero at runtime when +write-combining is permitted. + Legacy resources are protected by the HAVE_PCI_LEGACY define. Platforms wishing to support legacy functionality should define it and provide pci_legacy_read, pci_legacy_write and pci_mmap_legacy_page_range functions. diff --git a/arch/ia64/include/asm/pci.h b/arch/ia64/include/asm/pci.h index c0835b0dc722..6283758474ad 100644 --- a/arch/ia64/include/asm/pci.h +++ b/arch/ia64/include/asm/pci.h @@ -51,6 +51,8 @@ extern unsigned long ia64_max_iommu_merge_mask; #define PCI_DMA_BUS_IS_PHYS (ia64_max_iommu_merge_mask == ~0UL) #define HAVE_PCI_MMAP +#define arch_can_pci_mmap_wc() 1 + extern int pci_mmap_page_range (struct pci_dev *dev, struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine); #define HAVE_PCI_LEGACY diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 93eded8d3843..b5b68c6a10b1 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -81,8 +81,9 @@ struct vm_area_struct; int pci_mmap_page_range(struct pci_dev *pdev, struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine); -/* Tell drivers/pci/proc.c that we have pci_mmap_page_range() */ -#define HAVE_PCI_MMAP 1 +/* Tell drivers/pci/proc.c that we have pci_mmap_page_range() and it does WC */ +#define HAVE_PCI_MMAP 1 +#define arch_can_pci_mmap_wc() 1 extern int pci_legacy_read(struct pci_bus *bus, loff_t port, u32 *val, size_t count); diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 1411dbed5e5e..f6e22c271efa 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #ifdef __KERNEL__ @@ -102,6 +103,7 @@ int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); #define HAVE_PCI_MMAP +#define arch_can_pci_mmap_wc() pat_enabled() extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine); diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 7ac258fd3c5c..7d494bd66a97 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1211,9 +1211,9 @@ static int pci_create_resource_files(struct pci_dev *pdev) retval = pci_create_attr(pdev, i, 0); /* for prefetchable resources, create a WC mappable file */ - if (!retval && pdev->resource[i].flags & IORESOURCE_PREFETCH) + if (!retval && arch_can_pci_mmap_wc() && + pdev->resource[i].flags & IORESOURCE_PREFETCH) retval = pci_create_attr(pdev, i, 1); - if (retval) { 
pci_remove_resource_files(pdev); return retval; diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c index dc8912e2d4a1..a2aa58a8fb96 100644 --- a/drivers/pci/proc.c +++ b/drivers/pci/proc.c @@ -210,14 +210,15 @@ static long proc_bus_pci_ioctl(struct file *file, unsigned int cmd, break; case PCIIOC_WRITE_COMBINE: - if (arg) - fpriv->write_combine = 1; - else - fpriv->write_combine = 0; - break; - + if (arch_can_pci_mmap_wc()) { + if (arg) + fpriv->write_combine = 1; + else + fpriv->write_combine = 0; + break; + } + /* If arch decided it can't, fall through... */ #endif /* HAVE_PCI_MMAP */ - default: ret = -EINVAL; break; diff --git a/include/linux/pci.h b/include/linux/pci.h index eb3da1a04e6c..e614fb42d8bb 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1626,6 +1626,10 @@ static inline int pci_get_new_domain_nr(void) { return -ENOSYS; } #include +#ifndef arch_can_pci_mmap_wc +#define arch_can_pci_mmap_wc() 0 +#endif + #ifndef pci_root_bus_fwnode #define pci_root_bus_fwnode(bus) NULL #endif -- cgit v1.2.3 From 11df19546fe4a6135cdae62e96a1e25b3fabf6ea Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 12 Apr 2017 13:25:55 +0100 Subject: PCI: Move multiple declarations of pci_mmap_page_range() to We can declare it even on platforms where it isn't going to be defined. There's no need to have it littered through the various files. Signed-off-by: David Woodhouse Signed-off-by: Bjorn Helgaas --- arch/arm/include/asm/pci.h | 2 -- arch/cris/include/asm/pci.h | 3 --- arch/ia64/include/asm/pci.h | 2 -- arch/microblaze/include/asm/pci.h | 3 --- arch/mips/include/asm/pci.h | 3 --- arch/mn10300/include/asm/pci.h | 3 --- arch/parisc/include/asm/pci.h | 3 --- arch/powerpc/include/asm/pci.h | 3 --- arch/sh/include/asm/pci.h | 3 +-- arch/sparc/include/asm/pci_64.h | 4 ---- arch/unicore32/include/asm/pci.h | 2 -- arch/x86/include/asm/pci.h | 4 ---- arch/xtensa/include/asm/pci.h | 4 ---- include/linux/pci.h | 7 +++++++ 14 files changed, 8 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/include/asm/pci.h b/arch/arm/include/asm/pci.h index 057d381f4e57..51118a0cabbc 100644 --- a/arch/arm/include/asm/pci.h +++ b/arch/arm/include/asm/pci.h @@ -29,8 +29,6 @@ static inline int pci_proc_domain(struct pci_bus *bus) #define PCI_DMA_BUS_IS_PHYS (1) #define HAVE_PCI_MMAP -extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine); static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) { diff --git a/arch/cris/include/asm/pci.h b/arch/cris/include/asm/pci.h index b1b289df04c7..65198cb46c85 100644 --- a/arch/cris/include/asm/pci.h +++ b/arch/cris/include/asm/pci.h @@ -42,9 +42,6 @@ struct pci_dev; #define PCI_DMA_BUS_IS_PHYS (1) #define HAVE_PCI_MMAP -extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine); - #endif /* __KERNEL__ */ diff --git a/arch/ia64/include/asm/pci.h b/arch/ia64/include/asm/pci.h index 6283758474ad..fc60b3d139f4 100644 --- a/arch/ia64/include/asm/pci.h +++ b/arch/ia64/include/asm/pci.h @@ -53,8 +53,6 @@ extern unsigned long ia64_max_iommu_merge_mask; #define HAVE_PCI_MMAP #define arch_can_pci_mmap_wc() 1 -extern int pci_mmap_page_range (struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine); #define HAVE_PCI_LEGACY extern int pci_mmap_legacy_page_range(struct pci_bus *bus, struct vm_area_struct *vma, diff --git 
a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h index 2a120bb70e54..ab381d283fb9 100644 --- a/arch/microblaze/include/asm/pci.h +++ b/arch/microblaze/include/asm/pci.h @@ -46,9 +46,6 @@ extern int pci_domain_nr(struct pci_bus *bus); extern int pci_proc_domain(struct pci_bus *bus); struct vm_area_struct; -/* Map a range of PCI memory or I/O space for a device into user space */ -int pci_mmap_page_range(struct pci_dev *pdev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine); /* Tell drivers/pci/proc.c that we have pci_mmap_page_range() */ #define HAVE_PCI_MMAP 1 diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h index 30d1129d8624..3141e2a8113a 100644 --- a/arch/mips/include/asm/pci.h +++ b/arch/mips/include/asm/pci.h @@ -111,9 +111,6 @@ extern void pcibios_set_master(struct pci_dev *dev); #define HAVE_PCI_MMAP -extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine); - #define HAVE_ARCH_PCI_RESOURCE_TO_USER /* diff --git a/arch/mn10300/include/asm/pci.h b/arch/mn10300/include/asm/pci.h index 51159fff025a..082b6de90936 100644 --- a/arch/mn10300/include/asm/pci.h +++ b/arch/mn10300/include/asm/pci.h @@ -74,9 +74,6 @@ static inline int pci_controller_num(struct pci_dev *dev) } #define HAVE_PCI_MMAP -extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, - int write_combine); #endif /* __KERNEL__ */ diff --git a/arch/parisc/include/asm/pci.h b/arch/parisc/include/asm/pci.h index defebd956585..bb9ea9003e08 100644 --- a/arch/parisc/include/asm/pci.h +++ b/arch/parisc/include/asm/pci.h @@ -201,7 +201,4 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) #define HAVE_PCI_MMAP -extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine); - #endif /* __ASM_PARISC_PCI_H */ diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index b5b68c6a10b1..55887d171205 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -77,9 +77,6 @@ extern int pci_domain_nr(struct pci_bus *bus); extern int pci_proc_domain(struct pci_bus *bus); struct vm_area_struct; -/* Map a range of PCI memory or I/O space for a device into user space */ -int pci_mmap_page_range(struct pci_dev *pdev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine); /* Tell drivers/pci/proc.c that we have pci_mmap_page_range() and it does WC */ #define HAVE_PCI_MMAP 1 diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index 644314f2b1ef..46abbc9983c9 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -66,8 +66,7 @@ extern unsigned long PCIBIOS_MIN_IO, PCIBIOS_MIN_MEM; struct pci_dev; #define HAVE_PCI_MMAP -extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine); + extern void pcibios_set_master(struct pci_dev *dev); /* Dynamic DMA mapping stuff. 
diff --git a/arch/sparc/include/asm/pci_64.h b/arch/sparc/include/asm/pci_64.h index 2303635158f5..516fda70a1bb 100644 --- a/arch/sparc/include/asm/pci_64.h +++ b/arch/sparc/include/asm/pci_64.h @@ -45,10 +45,6 @@ static inline int pci_proc_domain(struct pci_bus *bus) #define HAVE_ARCH_PCI_GET_UNMAPPED_AREA #define get_pci_unmapped_area get_fb_unmapped_area -int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, - int write_combine); - static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) { return PCI_IRQ_NONE; diff --git a/arch/unicore32/include/asm/pci.h b/arch/unicore32/include/asm/pci.h index 37e55d018de5..a51290862acd 100644 --- a/arch/unicore32/include/asm/pci.h +++ b/arch/unicore32/include/asm/pci.h @@ -17,8 +17,6 @@ #include /* for PCIBIOS_MIN_* */ #define HAVE_PCI_MMAP -extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine); #endif /* __KERNEL__ */ #endif diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index f6e22c271efa..734cc9411b3a 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -104,10 +104,6 @@ int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); #define HAVE_PCI_MMAP #define arch_can_pci_mmap_wc() pat_enabled() -extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, - int write_combine); - #ifdef CONFIG_PCI extern void early_quirks(void); diff --git a/arch/xtensa/include/asm/pci.h b/arch/xtensa/include/asm/pci.h index 5d6bd932ba4e..bb5510b28754 100644 --- a/arch/xtensa/include/asm/pci.h +++ b/arch/xtensa/include/asm/pci.h @@ -46,10 +46,6 @@ struct pci_dev; #define PCI_DMA_BUS_IS_PHYS (1) -/* Map a range of PCI memory or I/O space for a device into user space */ -int pci_mmap_page_range(struct pci_dev *pdev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine); - /* Tell drivers/pci/proc.c that we have pci_mmap_page_range() */ #define HAVE_PCI_MMAP 1 diff --git a/include/linux/pci.h b/include/linux/pci.h index e614fb42d8bb..e7bb4b62cc97 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1626,6 +1626,13 @@ static inline int pci_get_new_domain_nr(void) { return -ENOSYS; } #include +/* Map a range of PCI memory or I/O space for a device into user space. + * Architectures provide this function if they set HAVE_PCI_MMAP, and + * it accepts the 'write_combine' argument when arch_can_pci_mmap_wc() + * evaluates to nonzero. */ +int pci_mmap_page_range(struct pci_dev *pdev, struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, int write_combine); + #ifndef arch_can_pci_mmap_wc #define arch_can_pci_mmap_wc() 0 #endif -- cgit v1.2.3 From e854d8b2a82ef76521ad2bed68211fde0511d417 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 12 Apr 2017 13:25:56 +0100 Subject: PCI: Add arch_can_pci_mmap_io() on architectures which can mmap() I/O space This is relatively esoteric, and knowing that we don't have it makes life easier in some cases rather than just an eventual -EINVAL from pci_mmap_page_range(). 
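As with arch_can_pci_mmap_wc(), the new macro defaults to 0 in <linux/pci.h>, and the architectures that can mmap() I/O port space (microblaze, powerpc, sparc64 and xtensa here) define it to 1. The caller-side effect, mirroring the proc.c hunk below, is that the failure is reported up front:

	case PCIIOC_MMAP_IS_IO:
		if (!arch_can_pci_mmap_io())
			return -EINVAL;
		fpriv->mmap_state = pci_mmap_io;
		break;

so userspace finds out at ioctl time, not via a later failed mmap().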
Signed-off-by: David Woodhouse Signed-off-by: Bjorn Helgaas --- Documentation/filesystems/sysfs-pci.txt | 3 ++- arch/microblaze/include/asm/pci.h | 3 ++- arch/powerpc/include/asm/pci.h | 1 + arch/sparc/include/asm/pci_64.h | 1 + arch/xtensa/include/asm/pci.h | 3 ++- drivers/pci/pci-sysfs.c | 13 ++++++++----- drivers/pci/proc.c | 11 +++++++---- include/linux/pci.h | 3 +++ 8 files changed, 26 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt index 25b7f1c1b868..46b95d82c4fd 100644 --- a/Documentation/filesystems/sysfs-pci.txt +++ b/Documentation/filesystems/sysfs-pci.txt @@ -119,7 +119,8 @@ useful return codes should be provided. Platforms which support write-combining maps of PCI resources must define arch_can_pci_mmap_wc() which shall evaluate to non-zero at runtime when -write-combining is permitted. +write-combining is permitted. Platforms which support maps of I/O resources +define arch_can_pci_mmap_io() similarly. Legacy resources are protected by the HAVE_PCI_LEGACY define. Platforms wishing to support legacy functionality should define it and provide diff --git a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h index ab381d283fb9..efd4983cb697 100644 --- a/arch/microblaze/include/asm/pci.h +++ b/arch/microblaze/include/asm/pci.h @@ -48,7 +48,8 @@ extern int pci_proc_domain(struct pci_bus *bus); struct vm_area_struct; /* Tell drivers/pci/proc.c that we have pci_mmap_page_range() */ -#define HAVE_PCI_MMAP 1 +#define HAVE_PCI_MMAP 1 +#define arch_can_pci_mmap_io() 1 extern int pci_legacy_read(struct pci_bus *bus, loff_t port, u32 *val, size_t count); diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 55887d171205..c8975dac535f 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -80,6 +80,7 @@ struct vm_area_struct; /* Tell drivers/pci/proc.c that we have pci_mmap_page_range() and it does WC */ #define HAVE_PCI_MMAP 1 +#define arch_can_pci_mmap_io() 1 #define arch_can_pci_mmap_wc() 1 extern int pci_legacy_read(struct pci_bus *bus, loff_t port, u32 *val, diff --git a/arch/sparc/include/asm/pci_64.h b/arch/sparc/include/asm/pci_64.h index 516fda70a1bb..b957ca5527a3 100644 --- a/arch/sparc/include/asm/pci_64.h +++ b/arch/sparc/include/asm/pci_64.h @@ -42,6 +42,7 @@ static inline int pci_proc_domain(struct pci_bus *bus) /* Platform support for /proc/bus/pci/X/Y mmap()s. 
*/ #define HAVE_PCI_MMAP +#define arch_can_pci_mmap_io() 1 #define HAVE_ARCH_PCI_GET_UNMAPPED_AREA #define get_pci_unmapped_area get_fb_unmapped_area diff --git a/arch/xtensa/include/asm/pci.h b/arch/xtensa/include/asm/pci.h index bb5510b28754..e4f366a488d3 100644 --- a/arch/xtensa/include/asm/pci.h +++ b/arch/xtensa/include/asm/pci.h @@ -47,7 +47,8 @@ struct pci_dev; #define PCI_DMA_BUS_IS_PHYS (1) /* Tell drivers/pci/proc.c that we have pci_mmap_page_range() */ -#define HAVE_PCI_MMAP 1 +#define HAVE_PCI_MMAP 1 +#define arch_can_pci_mmap_io() 1 #endif /* __KERNEL__ */ diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 7d494bd66a97..cb04bc29943f 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1174,11 +1174,14 @@ static int pci_create_attr(struct pci_dev *pdev, int num, int write_combine) } else { pdev->res_attr[num] = res_attr; sprintf(res_attr_name, "resource%d", num); - res_attr->mmap = pci_mmap_resource_uc; - } - if (pci_resource_flags(pdev, num) & IORESOURCE_IO) { - res_attr->read = pci_read_resource_io; - res_attr->write = pci_write_resource_io; + if (pci_resource_flags(pdev, num) & IORESOURCE_IO) { + res_attr->read = pci_read_resource_io; + res_attr->write = pci_write_resource_io; + if (arch_can_pci_mmap_io()) + res_attr->mmap = pci_mmap_resource_uc; + } else { + res_attr->mmap = pci_mmap_resource_uc; + } } res_attr->attr.name = res_attr_name; res_attr->attr.mode = S_IRUSR | S_IWUSR; diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c index a2aa58a8fb96..45e5cf7e9193 100644 --- a/drivers/pci/proc.c +++ b/drivers/pci/proc.c @@ -202,6 +202,8 @@ static long proc_bus_pci_ioctl(struct file *file, unsigned int cmd, #ifdef HAVE_PCI_MMAP case PCIIOC_MMAP_IS_IO: + if (!arch_can_pci_mmap_io()) + return -EINVAL; fpriv->mmap_state = pci_mmap_io; break; @@ -232,15 +234,16 @@ static int proc_bus_pci_mmap(struct file *file, struct vm_area_struct *vma) { struct pci_dev *dev = PDE_DATA(file_inode(file)); struct pci_filp_private *fpriv = file->private_data; - int i, ret, write_combine = 0, res_bit; + int i, ret, write_combine = 0, res_bit = IORESOURCE_MEM; if (!capable(CAP_SYS_RAWIO)) return -EPERM; - if (fpriv->mmap_state == pci_mmap_io) + if (fpriv->mmap_state == pci_mmap_io) { + if (!arch_can_pci_mmap_io()) + return -EINVAL; res_bit = IORESOURCE_IO; - else - res_bit = IORESOURCE_MEM; + } /* Make sure the caller is mapping a real resource for this device */ for (i = 0; i < PCI_ROM_RESOURCE; i++) { diff --git a/include/linux/pci.h b/include/linux/pci.h index e7bb4b62cc97..590cfcf6acf5 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1636,6 +1636,9 @@ int pci_mmap_page_range(struct pci_dev *pdev, struct vm_area_struct *vma, #ifndef arch_can_pci_mmap_wc #define arch_can_pci_mmap_wc() 0 #endif +#ifndef arch_can_pci_mmap_io +#define arch_can_pci_mmap_io() 0 +#endif #ifndef pci_root_bus_fwnode #define pci_root_bus_fwnode(bus) NULL -- cgit v1.2.3 From b8c17e6664c461e4aed545a943304c3b32dd309c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 Nov 2016 14:25:21 -0800 Subject: rcu: Maintain special bits at bottom of ->dynticks counter Currently, IPIs are used to force other CPUs to invalidate their TLBs in response to a kernel virtual-memory mapping change. This works, but degrades both battery lifetime (for idle CPUs) and real-time response (for nohz_full CPUs), and in addition results in unnecessary IPIs due to the fact that CPUs executing in usermode are unaffected by stale kernel mappings. 
It would be better to cause a CPU executing in usermode to wait until it is entering kernel mode to do the flush, first to avoid interrupting usermode tasks and second to handle multiple flush requests with a single flush in the case of a long-running user task. This commit therefore reserves a bit at the bottom of the ->dynticks counter, which is checked upon exit from extended quiescent states. If it is set, it is cleared and then a new rcu_eqs_special_exit() macro is invoked, which, if not supplied, is an empty single-pass do-while loop. If this bottom bit is set on -entry- to an extended quiescent state, then a WARN_ON_ONCE() triggers. This bottom bit may be set using a new rcu_eqs_special_set() function, which returns true if the bit was set, or false if the CPU turned out to not be in an extended quiescent state. Please note that this function refuses to set the bit for a non-nohz_full CPU when that CPU is executing in usermode because usermode execution is tracked by RCU as a dyntick-idle extended quiescent state only for nohz_full CPUs. Reported-by: Andy Lutomirski Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcutiny.h | 5 ++++ kernel/rcu/tree.c | 77 +++++++++++++++++++++++++++++++++++++++---------- kernel/rcu/tree.h | 1 + 3 files changed, 67 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index b452953e21c8..6c9d941e3962 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -33,6 +33,11 @@ static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp) return 0; } +static inline bool rcu_eqs_special_set(int cpu) +{ + return false; /* Never flag non-existent other CPUs! */ +} + static inline unsigned long get_state_synchronize_rcu(void) { return 0; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 50fee7689e71..0efad311ded4 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -274,9 +274,19 @@ void rcu_bh_qs(void) static DEFINE_PER_CPU(int, rcu_sched_qs_mask); +/* + * Steal a bit from the bottom of ->dynticks for idle entry/exit + * control. Initially this is for TLB flushing. + */ +#define RCU_DYNTICK_CTRL_MASK 0x1 +#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) +#ifndef rcu_eqs_special_exit +#define rcu_eqs_special_exit() do { } while (0) +#endif + static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, - .dynticks = ATOMIC_INIT(1), + .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), #ifdef CONFIG_NO_HZ_FULL_SYSIDLE .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, .dynticks_idle = ATOMIC_INIT(1), @@ -290,15 +300,20 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { static void rcu_dynticks_eqs_enter(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int special; + int seq; /* - * CPUs seeing atomic_inc_return() must see prior RCU read-side + * CPUs seeing atomic_add_return() must see prior RCU read-side * critical sections, and we also must force ordering with the * next idle sojourn. */ - special = atomic_inc_return(&rdtp->dynticks); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1); + seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + /* Better be in an extended quiescent state! */ + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + (seq & RCU_DYNTICK_CTRL_CTR)); + /* Better not have special action (TLB flush) pending!
*/ + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + (seq & RCU_DYNTICK_CTRL_MASK)); } /* @@ -308,15 +323,22 @@ static void rcu_dynticks_eqs_enter(void) static void rcu_dynticks_eqs_exit(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int special; + int seq; /* - * CPUs seeing atomic_inc_return() must see prior idle sojourns, + * CPUs seeing atomic_add_return() must see prior idle sojourns, * and we also must force ordering with the next RCU read-side * critical section. */ - special = atomic_inc_return(&rdtp->dynticks); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1)); + seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + !(seq & RCU_DYNTICK_CTRL_CTR)); + if (seq & RCU_DYNTICK_CTRL_MASK) { + atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks); + smp_mb__after_atomic(); /* _exit after clearing mask. */ + /* Prefer duplicate flushes to losing a flush. */ + rcu_eqs_special_exit(); + } } /* @@ -333,9 +355,9 @@ static void rcu_dynticks_eqs_online(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - if (atomic_read(&rdtp->dynticks) & 0x1) + if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR) return; - atomic_add(0x1, &rdtp->dynticks); + atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); } /* @@ -347,7 +369,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - return !(atomic_read(&rdtp->dynticks) & 0x1); + return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR); } /* @@ -358,7 +380,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp) { int snap = atomic_add_return(0, &rdtp->dynticks); - return snap; + return snap & ~RCU_DYNTICK_CTRL_MASK; } /* @@ -367,7 +389,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp) */ static bool rcu_dynticks_in_eqs(int snap) { - return !(snap & 0x1); + return !(snap & RCU_DYNTICK_CTRL_CTR); } /* @@ -387,10 +409,33 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) static void rcu_dynticks_momentary_idle(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int special = atomic_add_return(2, &rdtp->dynticks); + int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, + &rdtp->dynticks); /* It is illegal to call this from idle state. */ - WARN_ON_ONCE(!(special & 0x1)); + WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); +} + +/* + * Set the special (bottom) bit of the specified CPU so that it + * will take special action (such as flushing its TLB) on the + * next exit from an extended quiescent state. Returns true if + * the bit was successfully set, or false if the CPU was not in + * an extended quiescent state. 
+ */ +bool rcu_eqs_special_set(int cpu) +{ + int old; + int new; + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + do { + old = atomic_read(&rdtp->dynticks); + if (old & RCU_DYNTICK_CTRL_CTR) + return false; + new = old | RCU_DYNTICK_CTRL_MASK; + } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old); + return true; } DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ec62a05bfdb3..7468b4de7e0c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -596,6 +596,7 @@ extern struct rcu_state rcu_preempt_state; #endif /* #ifdef CONFIG_PREEMPT_RCU */ int rcu_dynticks_snap(struct rcu_dynticks *rdtp); +bool rcu_eqs_special_set(int cpu); #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -- cgit v1.2.3 From 77e5849688670280b173bb9e0544e9da7b2acc36 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 14 Jan 2017 13:32:50 -0800 Subject: rcu: Make arch select smp_mb__after_unlock_lock() strength The definition of smp_mb__after_unlock_lock() is currently smp_mb() for CONFIG_PPC and a no-op otherwise. It would be better to instead provide an architecture-selectable Kconfig option, and select the strength of smp_mb__after_unlock_lock() based on that option. This commit therefore creates ARCH_WEAK_RELEASE_ACQUIRE, has PPC select it, and bases the definition of smp_mb__after_unlock_lock() on this new ARCH_WEAK_RELEASE_ACQUIRE Kconfig option. Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Will Deacon Cc: Boqun Feng Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Acked-by: Michael Ellerman Cc: Reviewed-by: Josh Triplett --- arch/Kconfig | 3 +++ arch/powerpc/Kconfig | 1 + include/linux/rcupdate.h | 6 +++--- 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index cd211a14a88f..adefaf344239 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -320,6 +320,9 @@ config HAVE_CMPXCHG_LOCAL config HAVE_CMPXCHG_DOUBLE bool +config ARCH_WEAK_RELEASE_ACQUIRE + bool + config ARCH_WANT_IPC_PARSE_VERSION bool diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 97a8bc8a095c..7a5c9b764cd2 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -99,6 +99,7 @@ config PPC select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_WANT_IPC_PARSE_VERSION + select ARCH_WEAK_RELEASE_ACQUIRE select BINFMT_ELF select BUILDTIME_EXTABLE_SORT select CLONE_BACKWARDS diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index de88b33c0974..e6146d0074f8 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1127,11 +1127,11 @@ do { \ * if the UNLOCK and LOCK are executed by the same CPU or if the * UNLOCK and LOCK operate on the same lock variable. */ -#ifdef CONFIG_PPC +#ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE #define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ -#else /* #ifdef CONFIG_PPC */ +#else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #define smp_mb__after_unlock_lock() do { } while (0) -#endif /* #else #ifdef CONFIG_PPC */ +#endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #endif /* __LINUX_RCUPDATE_H */ -- cgit v1.2.3 From 900b1028ec388e50c98200641ae4274794c807cf Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 10 Feb 2017 14:32:54 -0800 Subject: srcu: Allow SRCU to access rcu_scheduler_active This is primarily a code-movement commit in preparation for allowing SRCU to handle early-boot SRCU grace periods. Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 6 +++--- kernel/rcu/tiny_plugin.h | 9 +++++---- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_exp.h | 12 ----------- kernel/rcu/update.c | 52 +++++++++++++++++++++++++++++++----------------- 5 files changed, 43 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 6c9d941e3962..5219be250f00 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -217,14 +217,14 @@ static inline void exit_rcu(void) { } -#ifdef CONFIG_DEBUG_LOCK_ALLOC +#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) extern int rcu_scheduler_active __read_mostly; void rcu_scheduler_starting(void); -#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#else /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ static inline void rcu_scheduler_starting(void) { } -#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#endif /* #else #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index df3a60e19f07..371034e77f87 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -52,7 +52,7 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { RCU_TRACE(.name = "rcu_bh") }; -#ifdef CONFIG_DEBUG_LOCK_ALLOC +#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) #include int rcu_scheduler_active __read_mostly; @@ -65,15 +65,16 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. * The reason for this is that Tiny RCU does not need kthreads, so does * not have to care about the fact that the scheduler is half-initialized - * at a certain phase of the boot process. + * at a certain phase of the boot process. Unless SRCU is in the mix. */ void __init rcu_scheduler_starting(void) { WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = RCU_SCHEDULER_RUNNING; + rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU) + ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING; } -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ #ifdef CONFIG_RCU_TRACE diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8cc9d40b41ea..2380d1e3dfb8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3974,7 +3974,7 @@ early_initcall(rcu_spawn_gp_kthread); * task is booting the system, and such primitives are no-ops). After this * function is called, any synchronous grace-period primitives are run as * expedited, with the requesting task driving the grace period forward. - * A later core_initcall() rcu_exp_runtime_mode() will switch to full + * A later core_initcall() rcu_set_runtime_mode() will switch to full * runtime RCU functionality. */ void rcu_scheduler_starting(void) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index a1f52bbe9db6..51ca287828a2 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -737,15 +737,3 @@ void synchronize_rcu_expedited(void) EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ - -/* - * Switch to run-time mode once Tree RCU has fully initialized. 
- */ -static int __init rcu_exp_runtime_mode(void) -{ - rcu_test_sync_prims(); - rcu_scheduler_active = RCU_SCHEDULER_RUNNING; - rcu_test_sync_prims(); - return 0; -} -core_initcall(rcu_exp_runtime_mode); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 55c8530316c7..c5df0d756900 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -124,7 +124,7 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); * non-expedited counterparts? Intended for use within RCU. Note * that if the user specifies both rcu_expedited and rcu_normal, then * rcu_normal wins. (Except during the time period during boot from - * when the first task is spawned until the rcu_exp_runtime_mode() + * when the first task is spawned until the rcu_set_runtime_mode() * core_initcall() is invoked, at which point everything is expedited.) */ bool rcu_gp_is_normal(void) @@ -190,6 +190,39 @@ void rcu_end_inkernel_boot(void) #endif /* #ifndef CONFIG_TINY_RCU */ +/* + * Test each non-SRCU synchronous grace-period wait API. This is + * useful just after a change in mode for these primitives, and + * during early boot. + */ +void rcu_test_sync_prims(void) +{ + if (!IS_ENABLED(CONFIG_PROVE_RCU)) + return; + synchronize_rcu(); + synchronize_rcu_bh(); + synchronize_sched(); + synchronize_rcu_expedited(); + synchronize_rcu_bh_expedited(); + synchronize_sched_expedited(); +} + +#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) + +/* + * Switch to run-time mode once RCU has fully initialized. + */ +static int __init rcu_set_runtime_mode(void) +{ + rcu_test_sync_prims(); + rcu_scheduler_active = RCU_SCHEDULER_RUNNING; + rcu_test_sync_prims(); + return 0; +} +core_initcall(rcu_set_runtime_mode); + +#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */ + #ifdef CONFIG_PREEMPT_RCU /* @@ -817,23 +850,6 @@ static void rcu_spawn_tasks_kthread(void) #endif /* #ifdef CONFIG_TASKS_RCU */ -/* - * Test each non-SRCU synchronous grace-period wait API. This is - * useful just after a change in mode for these primitives, and - * during early boot. - */ -void rcu_test_sync_prims(void) -{ - if (!IS_ENABLED(CONFIG_PROVE_RCU)) - return; - synchronize_rcu(); - synchronize_rcu_bh(); - synchronize_sched(); - synchronize_rcu_expedited(); - synchronize_rcu_bh_expedited(); - synchronize_sched_expedited(); -} - #ifdef CONFIG_PROVE_RCU /* -- cgit v1.2.3 From c2a8ec0778b2ca0d360ba9b5cac7fcd5ddfe798f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Mar 2017 15:31:55 -0800 Subject: srcu: Move to state-based grace-period sequencing The current SRCU grace-period processing might never reach the last portion of srcu_advance_batches(). This is OK given the current implementation, as the first portion, up to the try_check_zero() following the srcu_flip() is sufficient to drive grace periods forward. However, it has the unfortunate side-effect of making it impossible to determine when a given grace period has ended, and it will be necessary to efficiently trace ends of grace periods in order to efficiently handle per-CPU SRCU callback lists. This commit therefore adds states to the SRCU grace-period processing, so that the end of a given SRCU grace period is marked by the transition to the SRCU_STATE_DONE state. Signed-off-by: Paul E. 
McKenney --- include/linux/srcu.h | 10 ++++- kernel/rcu/srcu.c | 111 ++++++++++++++++++++++++++++----------------------- 2 files changed, 69 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index a598cf3ac70c..f149a685896c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -48,7 +48,7 @@ struct srcu_struct { unsigned long completed; struct srcu_array __percpu *per_cpu_ref; spinlock_t queue_lock; /* protect ->batch_queue, ->running */ - bool running; + int srcu_state; /* callbacks just queued */ struct rcu_batch batch_queue; /* callbacks try to do the first check_zero */ @@ -62,6 +62,12 @@ struct srcu_struct { #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ }; +/* Values for -> state variable. */ +#define SRCU_STATE_IDLE 0 +#define SRCU_STATE_SCAN1 1 +#define SRCU_STATE_SCAN2 2 +#define SRCU_STATE_DONE 3 + #ifdef CONFIG_DEBUG_LOCK_ALLOC int __init_srcu_struct(struct srcu_struct *sp, const char *name, @@ -89,7 +95,7 @@ void process_srcu(struct work_struct *work); .completed = -300, \ .per_cpu_ref = &name##_srcu_array, \ .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ - .running = false, \ + .srcu_state = SRCU_STATE_IDLE, \ .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 821ecda873f2..70286f68f3a1 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -111,7 +111,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) { sp->completed = 0; spin_lock_init(&sp->queue_lock); - sp->running = false; + sp->srcu_state = SRCU_STATE_IDLE; rcu_batch_init(&sp->batch_queue); rcu_batch_init(&sp->batch_check0); rcu_batch_init(&sp->batch_check1); @@ -270,7 +270,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp) if (WARN_ON(!rcu_all_batches_empty(sp))) return; /* Leakage unless caller handles error. */ flush_delayed_work(&sp->work); - if (WARN_ON(sp->running)) + if (WARN_ON(READ_ONCE(sp->srcu_state) != SRCU_STATE_IDLE)) return; /* Caller forgot to stop doing call_srcu()? */ free_percpu(sp->per_cpu_ref); sp->per_cpu_ref = NULL; @@ -391,8 +391,8 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, spin_lock_irqsave(&sp->queue_lock, flags); smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ rcu_batch_queue(&sp->batch_queue, head); - if (!sp->running) { - sp->running = true; + if (READ_ONCE(sp->srcu_state) == SRCU_STATE_IDLE) { + WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN1); queue_delayed_work(system_power_efficient_wq, &sp->work, 0); } spin_unlock_irqrestore(&sp->queue_lock, flags); @@ -424,9 +424,9 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) head->func = wakeme_after_rcu; spin_lock_irq(&sp->queue_lock); smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. 
*/ - if (!sp->running) { + if (READ_ONCE(sp->srcu_state) == SRCU_STATE_IDLE) { /* steal the processing owner */ - sp->running = true; + WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN1); rcu_batch_queue(&sp->batch_check0, head); spin_unlock_irq(&sp->queue_lock); /* give the processing owner to work_struct */ @@ -548,7 +548,9 @@ static void srcu_collect_new(struct srcu_struct *sp) */ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) { - int idx = 1 ^ (sp->completed & 1); + int idx; + + WARN_ON_ONCE(sp->srcu_state == SRCU_STATE_IDLE); /* * Because readers might be delayed for an extended period after @@ -558,48 +560,56 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) * invoking a callback. */ - if (rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_check1)) - return; /* no callbacks need to be advanced */ - - if (!try_check_zero(sp, idx, trycount)) - return; /* failed to advance, will try after SRCU_INTERVAL */ - - /* - * The callbacks in ->batch_check1 have already done with their - * first zero check and flip back when they were enqueued on - * ->batch_check0 in a previous invocation of srcu_advance_batches(). - * (Presumably try_check_zero() returned false during that - * invocation, leaving the callbacks stranded on ->batch_check1.) - * They are therefore ready to invoke, so move them to ->batch_done. - */ - rcu_batch_move(&sp->batch_done, &sp->batch_check1); - - if (rcu_batch_empty(&sp->batch_check0)) - return; /* no callbacks need to be advanced */ - srcu_flip(sp); - - /* - * The callbacks in ->batch_check0 just finished their - * first check zero and flip, so move them to ->batch_check1 - * for future checking on the other idx. - */ - rcu_batch_move(&sp->batch_check1, &sp->batch_check0); - - /* - * SRCU read-side critical sections are normally short, so check - * at least twice in quick succession after a flip. - */ - trycount = trycount < 2 ? 2 : trycount; - if (!try_check_zero(sp, idx^1, trycount)) - return; /* failed to advance, will try after SRCU_INTERVAL */ + if (sp->srcu_state == SRCU_STATE_DONE) + WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN1); + + if (sp->srcu_state == SRCU_STATE_SCAN1) { + idx = 1 ^ (sp->completed & 1); + if (!try_check_zero(sp, idx, trycount)) + return; /* readers present, retry after SRCU_INTERVAL */ + + /* + * The callbacks in ->batch_check1 have already done + * with their first zero check and flip back when they were + * enqueued on ->batch_check0 in a previous invocation of + * srcu_advance_batches(). (Presumably try_check_zero() + * returned false during that invocation, leaving the + * callbacks stranded on ->batch_check1.) They are therefore + * ready to invoke, so move them to ->batch_done. + */ + rcu_batch_move(&sp->batch_done, &sp->batch_check1); + srcu_flip(sp); + + /* + * The callbacks in ->batch_check0 just finished their + * first check zero and flip, so move them to ->batch_check1 + * for future checking on the other idx. + */ + rcu_batch_move(&sp->batch_check1, &sp->batch_check0); + + WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN2); + } - /* - * The callbacks in ->batch_check1 have now waited for all - * pre-existing readers using both idx values. They are therefore - * ready to invoke, so move them to ->batch_done. - */ - rcu_batch_move(&sp->batch_done, &sp->batch_check1); + if (sp->srcu_state == SRCU_STATE_SCAN2) { + + /* + * SRCU read-side critical sections are normally short, + * so check at least twice in quick succession after a flip. 
+ */ + idx = 1 ^ (sp->completed & 1); + trycount = trycount < 2 ? 2 : trycount; + if (!try_check_zero(sp, idx, trycount)) + return; /* readers present, retry after SRCU_INTERVAL */ + + /* + * The callbacks in ->batch_check1 have now waited for + * all pre-existing readers using both idx values. They are + * therefore ready to invoke, so move them to ->batch_done. + */ + rcu_batch_move(&sp->batch_done, &sp->batch_check1); + + WRITE_ONCE(sp->srcu_state, SRCU_STATE_DONE); + } } /* @@ -633,8 +643,9 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) if (rcu_all_batches_empty(sp)) { spin_lock_irq(&sp->queue_lock); - if (rcu_all_batches_empty(sp)) { - sp->running = false; + if (rcu_all_batches_empty(sp) && + READ_ONCE(sp->srcu_state) == SRCU_STATE_DONE) { + WRITE_ONCE(sp->srcu_state, SRCU_STATE_IDLE); pending = false; } spin_unlock_irq(&sp->queue_lock); -- cgit v1.2.3 From ac367c1c621b75689f6d5cd8301d364ba2c9f292 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 11 Mar 2017 07:14:06 -0800 Subject: srcu: Add grace-period sequence numbers This commit adds grace-period sequence numbers, which will be used to handle mid-boot grace periods and per-CPU callback lists. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 1 + kernel/rcu/srcu.c | 27 +++++++++++++++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index f149a685896c..047ac8c28a4e 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -46,6 +46,7 @@ struct rcu_batch { struct srcu_struct { unsigned long completed; + unsigned long srcu_gp_seq; struct srcu_array __percpu *per_cpu_ref; spinlock_t queue_lock; /* protect ->batch_queue, ->running */ int srcu_state; diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 70286f68f3a1..d464986d82b6 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -110,6 +110,7 @@ static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) static int init_srcu_struct_fields(struct srcu_struct *sp) { sp->completed = 0; + sp->srcu_gp_seq = 0; spin_lock_init(&sp->queue_lock); sp->srcu_state = SRCU_STATE_IDLE; rcu_batch_init(&sp->batch_queue); @@ -318,6 +319,15 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); #define SYNCHRONIZE_SRCU_TRYCOUNT 2 #define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 +/* + * Start an SRCU grace period. + */ +static void srcu_gp_start(struct srcu_struct *sp) +{ + WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN1); + rcu_seq_start(&sp->srcu_gp_seq); +} + /* * @@@ Wait until all pre-existing readers complete. Such readers * will have used the index specified by "idx". @@ -354,6 +364,15 @@ static void srcu_flip(struct srcu_struct *sp) smp_mb(); /* D */ /* Pairs with C. */ } +/* + * End an SRCU grace period. + */ +static void srcu_gp_end(struct srcu_struct *sp) +{ + rcu_seq_end(&sp->srcu_gp_seq); + WRITE_ONCE(sp->srcu_state, SRCU_STATE_DONE); +} + /* * Enqueue an SRCU callback on the specified srcu_struct structure, * initiating grace-period processing if it is not already running. @@ -392,7 +411,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. 
*/ rcu_batch_queue(&sp->batch_queue, head); if (READ_ONCE(sp->srcu_state) == SRCU_STATE_IDLE) { - WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN1); + srcu_gp_start(sp); queue_delayed_work(system_power_efficient_wq, &sp->work, 0); } spin_unlock_irqrestore(&sp->queue_lock, flags); @@ -426,7 +445,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ if (READ_ONCE(sp->srcu_state) == SRCU_STATE_IDLE) { /* steal the processing owner */ - WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN1); + srcu_gp_start(sp); rcu_batch_queue(&sp->batch_check0, head); spin_unlock_irq(&sp->queue_lock); /* give the processing owner to work_struct */ @@ -561,7 +580,7 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) */ if (sp->srcu_state == SRCU_STATE_DONE) - WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN1); + srcu_gp_start(sp); if (sp->srcu_state == SRCU_STATE_SCAN1) { idx = 1 ^ (sp->completed & 1); @@ -608,7 +627,7 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) */ rcu_batch_move(&sp->batch_done, &sp->batch_check1); - WRITE_ONCE(sp->srcu_state, SRCU_STATE_DONE); + srcu_gp_end(sp); } } -- cgit v1.2.3 From 8660b7d8a545227fd9ee80508aa82528ea9947d7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Mar 2017 16:48:18 -0700 Subject: srcu: Use rcu_segcblist to track SRCU callbacks This commit switches SRCU from custom-built callback queues to the new rcu_segcblist structure. This change associates grace-period sequence numbers with groups of callbacks, which will be needed for efficient processing of per-CPU callbacks. Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 678 ++++++++++++++++++++++++++++++++++++++++++ include/linux/srcu.h | 24 +- kernel/rcu/rcu.h | 6 + kernel/rcu/rcu_segcblist.h | 670 ----------------------------------------- kernel/rcu/srcu.c | 159 ++-------- kernel/rcu/tree.h | 2 +- 6 files changed, 721 insertions(+), 818 deletions(-) create mode 100644 include/linux/rcu_segcblist.h delete mode 100644 kernel/rcu/rcu_segcblist.h (limited to 'include/linux') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h new file mode 100644 index 000000000000..74b1e7243955 --- /dev/null +++ b/include/linux/rcu_segcblist.h @@ -0,0 +1,678 @@ +/* + * RCU segmented callback lists + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2017 + * + * Authors: Paul E. McKenney + */ + +#ifndef __KERNEL_RCU_SEGCBLIST_H +#define __KERNEL_RCU_SEGCBLIST_H + +/* Simple unsegmented callback lists. */ +struct rcu_cblist { + struct rcu_head *head; + struct rcu_head **tail; + long len; + long len_lazy; +}; + +#define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail = &n.head } + +/* Initialize simple callback list. 
*/ +static inline void rcu_cblist_init(struct rcu_cblist *rclp) +{ + rclp->head = NULL; + rclp->tail = &rclp->head; + rclp->len = 0; + rclp->len_lazy = 0; +} + +/* Is simple callback list empty? */ +static inline bool rcu_cblist_empty(struct rcu_cblist *rclp) +{ + return !rclp->head; +} + +/* Return number of callbacks in simple callback list. */ +static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) +{ + return rclp->len; +} + +/* Return number of lazy callbacks in simple callback list. */ +static inline long rcu_cblist_n_lazy_cbs(struct rcu_cblist *rclp) +{ + return rclp->len_lazy; +} + +/* + * Debug function to actually count the number of callbacks. + * If the number exceeds the limit specified, return -1. + */ +static inline long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) +{ + int cnt = 0; + struct rcu_head **rhpp = &rclp->head; + + for (;;) { + if (!*rhpp) + return cnt; + if (++cnt > lim) + return -1; + rhpp = &(*rhpp)->next; + } +} + +/* + * Dequeue the oldest rcu_head structure from the specified callback + * list. This function assumes that the callback is non-lazy, but + * the caller can later invoke rcu_cblist_dequeued_lazy() if it + * finds otherwise (and if it cares about laziness). This allows + * different users to have different ways of determining laziness. + */ +static inline struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) +{ + struct rcu_head *rhp; + + rhp = rclp->head; + if (!rhp) + return NULL; + rclp->len--; + rclp->head = rhp->next; + if (!rclp->head) + rclp->tail = &rclp->head; + return rhp; +} + +/* + * Account for the fact that a previously dequeued callback turned out + * to be marked as lazy. + */ +static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) +{ + rclp->len_lazy--; +} + +/* + * Interim function to return rcu_cblist head pointer. Longer term, the + * rcu_cblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) +{ + return rclp->head; +} + +/* + * Interim function to return rcu_cblist head pointer. Longer term, the + * rcu_cblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) +{ + WARN_ON_ONCE(rcu_cblist_empty(rclp)); + return rclp->tail; +} + +/* Complicated segmented callback lists. ;-) */ + +/* + * Index values for segments in rcu_segcblist structure. + * + * The segments are as follows: + * + * [head, *tails[RCU_DONE_TAIL]): + * Callbacks whose grace period has elapsed, and thus can be invoked. + * [*tails[RCU_DONE_TAIL], *tails[RCU_WAIT_TAIL]): + * Callbacks waiting for the current GP from the current CPU's viewpoint. + * [*tails[RCU_WAIT_TAIL], *tails[RCU_NEXT_READY_TAIL]): + * Callbacks that arrived before the next GP started, again from + * the current CPU's viewpoint. These can be handled by the next GP. + * [*tails[RCU_NEXT_READY_TAIL], *tails[RCU_NEXT_TAIL]): + * Callbacks that might have arrived after the next GP started. + * There is some uncertainty as to when a given GP starts and + * ends, but a CPU knows the exact times if it is the one starting + * or ending the GP. Other CPUs know that the previous GP ends + * before the next one starts. + * + * Note that RCU_WAIT_TAIL cannot be empty unless RCU_NEXT_READY_TAIL is also + * empty. + * + * The ->gp_seq[] array contains the grace-period number at which the + * corresponding segment of callbacks will be ready to invoke. 
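/*
 * A minimal exercise of the simple-list helpers above. The enqueue
 * helper is assumed for illustration only; this patch itself fills
 * rcu_cblist structures via rcu_segcblist extraction instead:
 */
static void demo_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp)
{
	rhp->next = NULL;
	*rclp->tail = rhp;		/* Link onto the end of the list... */
	rclp->tail = &rhp->next;	/* ...and advance the tail pointer. */
	rclp->len++;
}

static void demo_cblist_drain(struct rcu_cblist *rclp)
{
	struct rcu_head *rhp;

	while ((rhp = rcu_cblist_dequeue(rclp)) != NULL)
		rhp->func(rhp);		/* Invoke callbacks in posting order. */
}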
A given + * element of this array is meaningful only when the corresponding segment + * is non-empty, and it is never valid for RCU_DONE_TAIL (whose callbacks + * are already ready to invoke) or for RCU_NEXT_TAIL (whose callbacks have + * not yet been assigned a grace-period number). + */ +#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ +#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ +#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ +#define RCU_NEXT_TAIL 3 +#define RCU_CBLIST_NSEGS 4 + +struct rcu_segcblist { + struct rcu_head *head; + struct rcu_head **tails[RCU_CBLIST_NSEGS]; + unsigned long gp_seq[RCU_CBLIST_NSEGS]; + long len; + long len_lazy; +}; + +#define RCU_SEGCBLIST_INITIALIZER(n) \ +{ \ + .head = NULL, \ + .tails[RCU_DONE_TAIL] = &n.head, \ + .tails[RCU_WAIT_TAIL] = &n.head, \ + .tails[RCU_NEXT_READY_TAIL] = &n.head, \ + .tails[RCU_NEXT_TAIL] = &n.head, \ +} + +/* + * Initialize an rcu_segcblist structure. + */ +static inline void rcu_segcblist_init(struct rcu_segcblist *rsclp) +{ + int i; + + BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); + BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); + rsclp->head = NULL; + for (i = 0; i < RCU_CBLIST_NSEGS; i++) + rsclp->tails[i] = &rsclp->head; + rsclp->len = 0; + rsclp->len_lazy = 0; +} + +/* + * Is the specified rcu_segcblist structure empty? + * + * But careful! The fact that the ->head field is NULL does not + * necessarily imply that there are no callbacks associated with + * this structure. When callbacks are being invoked, they are + * removed as a group. If callback invocation must be preempted, + * the remaining callbacks will be added back to the list. Either + * way, the counts are updated later. + * + * So it is often the case that rcu_segcblist_n_cbs() should be used + * instead. + */ +static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) +{ + return !rsclp->head; +} + +/* Return number of callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) +{ + return READ_ONCE(rsclp->len); +} + +/* Return number of lazy callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) +{ + return rsclp->len_lazy; +} + +/* Return number of lazy callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) +{ + return rsclp->len - rsclp->len_lazy; +} + +/* + * Is the specified rcu_segcblist enabled, for example, not corresponding + * to an offline or callback-offloaded CPU? + */ +static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) +{ + return !!rsclp->tails[RCU_NEXT_TAIL]; +} + +/* + * Disable the specified rcu_segcblist structure, so that callbacks can + * no longer be posted to it. This structure must be empty. + */ +static inline void rcu_segcblist_disable(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); + rsclp->tails[RCU_NEXT_TAIL] = NULL; +} + +/* + * Is the specified segment of the specified rcu_segcblist structure + * empty of callbacks? 
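/*
 * A small sketch of the invariant that rcu_segcblist_init() leaves
 * behind and that the emptiness tests below rely on: with no callbacks
 * queued, every ->tails[] element points at ->head, so every segment
 * is empty. Hypothetical helper, for illustration only:
 */
static void demo_segcblist_empty_invariant(void)
{
	struct rcu_segcblist rsclp;
	int i;

	rcu_segcblist_init(&rsclp);
	for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
		WARN_ON_ONCE(rsclp.tails[i] != &rsclp.head);
	WARN_ON_ONCE(!rcu_segcblist_empty(&rsclp));
}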
+ */ +static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) +{ + if (seg == RCU_DONE_TAIL) + return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; + return rsclp->tails[seg - 1] == rsclp->tails[seg]; +} + +/* + * Are all segments following the specified segment of the specified + * rcu_segcblist structure empty of callbacks? (The specified + * segment might well contain callbacks.) + */ +static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) +{ + return !*rsclp->tails[seg]; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * are ready to be invoked? + */ +static inline bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * are still pending, that is, not yet ready to be invoked? + */ +static inline bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); +} + +/* + * Dequeue and return the first ready-to-invoke callback. If there + * are no ready-to-invoke callbacks, return NULL. Disables interrupts + * to avoid interference. Does not protect from interference from other + * CPUs or tasks. + */ +static inline struct rcu_head * +rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) +{ + unsigned long flags; + int i; + struct rcu_head *rhp; + + local_irq_save(flags); + if (!rcu_segcblist_ready_cbs(rsclp)) { + local_irq_restore(flags); + return NULL; + } + rhp = rsclp->head; + BUG_ON(!rhp); + rsclp->head = rhp->next; + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { + if (rsclp->tails[i] != &rhp->next) + break; + rsclp->tails[i] = &rsclp->head; + } + smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ + WRITE_ONCE(rsclp->len, rsclp->len - 1); + local_irq_restore(flags); + return rhp; +} + +/* + * Account for the fact that a previously dequeued callback turned out + * to be marked as lazy. + */ +static inline void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) +{ + unsigned long flags; + + local_irq_save(flags); + rsclp->len_lazy--; + local_irq_restore(flags); +} + +/* + * Return a pointer to the first callback in the specified rcu_segcblist + * structure. This is useful for diagnostics. + */ +static inline struct rcu_head * +rcu_segcblist_first_cb(struct rcu_segcblist *rsclp) +{ + if (rcu_segcblist_is_enabled(rsclp)) + return rsclp->head; + return NULL; +} + +/* + * Return a pointer to the first pending callback in the specified + * rcu_segcblist structure. This is useful just after posting a given + * callback -- if that callback is the first pending callback, then + * you cannot rely on someone else having already started up the required + * grace period. + */ +static inline struct rcu_head * +rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) +{ + if (rcu_segcblist_is_enabled(rsclp)) + return *rsclp->tails[RCU_DONE_TAIL]; + return NULL; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * have not yet been processed beyond having been posted, that is, + * does it contain callbacks in its last segment? 
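/*
 * One plausible way to drain the ready segment using the dequeue
 * primitive above. A hedged sketch, not this patch's invocation path:
 * SRCU's srcu_invoke_callbacks() later in this series extracts the
 * whole done segment under ->queue_lock rather than dequeuing one
 * callback at a time:
 */
static void demo_invoke_ready(struct rcu_segcblist *rsclp)
{
	struct rcu_head *rhp;

	while ((rhp = rcu_segcblist_dequeue(rsclp)) != NULL)
		rhp->func(rhp);		/* Each callback finishes its object. */
}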
+ */ +static inline bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); +} + +/* + * Enqueue the specified callback onto the specified rcu_segcblist + * structure, updating accounting as needed. Note that the ->len + * field may be accessed locklessly, hence the WRITE_ONCE(). + * The ->len field is used by rcu_barrier() and friends to determine + * if it must post a callback on this structure, and it is OK + * for rcu_barrier() to sometimes post callbacks needlessly, but + * absolutely not OK for it to ever miss posting a callback. + */ +static inline void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy) +{ + WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ + if (lazy) + rsclp->len_lazy++; + smp_mb(); /* Ensure counts are updated before callback is enqueued. */ + rhp->next = NULL; + *rsclp->tails[RCU_NEXT_TAIL] = rhp; + rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; +} + +/* + * Extract only the counts from the specified rcu_segcblist structure, + * and place them in the specified rcu_cblist structure. This function + * supports both callback orphaning and invocation, hence the separation + * of counts and callbacks. (Callbacks ready for invocation must be + * orphaned and adopted separately from pending callbacks, but counts + * apply to all callbacks. Locking must be used to make sure that + * both orphaned-callbacks lists are consistent.) + */ +static inline void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rclp->len_lazy += rsclp->len_lazy; + rclp->len += rsclp->len; + rsclp->len_lazy = 0; + WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ +} + +/* + * Extract only those callbacks ready to be invoked from the specified + * rcu_segcblist structure and place them in the specified rcu_cblist + * structure. + */ +static inline void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rcu_segcblist_ready_cbs(rsclp)) + return; /* Nothing to do. */ + *rclp->tail = rsclp->head; + rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; + *rsclp->tails[RCU_DONE_TAIL] = NULL; + rclp->tail = rsclp->tails[RCU_DONE_TAIL]; + for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) + if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) + rsclp->tails[i] = &rsclp->head; +} + +/* + * Extract only those callbacks still pending (not yet ready to be + * invoked) from the specified rcu_segcblist structure and place them in + * the specified rcu_cblist structure. Note that this loses information + * about any callbacks that might have been partway done waiting for + * their grace period. Too bad! They will have to start over. + */ +static inline void +rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rcu_segcblist_pend_cbs(rsclp)) + return; /* Nothing to do. */ + *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; + rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; + *rsclp->tails[RCU_DONE_TAIL] = NULL; + for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) + rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; +} + +/* + * Move the entire contents of the specified rcu_segcblist structure, + * counts, callbacks, and all, to the specified rcu_cblist structure. + * @@@ Why do we need this??? Moving early-boot CBs to NOCB lists? + * @@@ Memory barrier needed? (Not if only used at boot time...) 
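/*
 * The extract/insert pairs above exist to support a lock-then-invoke
 * pattern: pull the done callbacks into a private rcu_cblist under the
 * caller's lock, invoke them with the lock dropped, then fold the (now
 * negative) count back in to debit the source list. A sketch modeled
 * on the srcu_invoke_callbacks() rewrite later in this series, with a
 * caller-supplied spinlock assumed:
 */
static void demo_extract_and_invoke(struct rcu_segcblist *rsclp,
				    spinlock_t *lock)
{
	struct rcu_cblist ready;
	struct rcu_head *rhp;

	rcu_cblist_init(&ready);	/* ->len starts at zero. */
	spin_lock_irq(lock);
	rcu_segcblist_extract_done_cbs(rsclp, &ready);
	spin_unlock_irq(lock);
	while ((rhp = rcu_cblist_dequeue(&ready)) != NULL)
		rhp->func(rhp);		/* Each dequeue drops ->len by one... */
	spin_lock_irq(lock);
	rcu_segcblist_insert_count(rsclp, &ready); /* ...debiting rsclp here. */
	spin_unlock_irq(lock);
}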
+ */ +static inline void rcu_segcblist_extract_all(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rcu_segcblist_extract_done_cbs(rsclp, rclp); + rcu_segcblist_extract_pend_cbs(rsclp, rclp); + rcu_segcblist_extract_count(rsclp, rclp); +} + +/* + * Insert counts from the specified rcu_cblist structure in the + * specified rcu_segcblist structure. + */ +static inline void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rsclp->len_lazy += rclp->len_lazy; + /* ->len sampled locklessly. */ + WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); + rclp->len_lazy = 0; + rclp->len = 0; +} + +/* + * Move callbacks from the specified rcu_cblist to the beginning of the + * done-callbacks segment of the specified rcu_segcblist. + */ +static inline void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rclp->head) + return; /* No callbacks to move. */ + *rclp->tail = rsclp->head; + rsclp->head = rclp->head; + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) + if (&rsclp->head == rsclp->tails[i]) + rsclp->tails[i] = rclp->tail; + else + break; + rclp->head = NULL; + rclp->tail = &rclp->head; +} + +/* + * Move callbacks from the specified rcu_cblist to the end of the + * new-callbacks segment of the specified rcu_segcblist. + */ +static inline void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + if (!rclp->head) + return; /* Nothing to do. */ + *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; + rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; + rclp->head = NULL; + rclp->tail = &rclp->head; +} + +/* + * Advance the callbacks in the specified rcu_segcblist structure based + * on the current value passed in for the grace-period counter. + */ +static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i, j; + + WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); + WARN_ON_ONCE(rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)); + + /* + * Find all callbacks whose ->gp_seq numbers indicate that they + * are ready to invoke, and put them into the RCU_DONE_TAIL segment. + */ + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { + if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) + break; + rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; + } + + /* If no callbacks moved, nothing more need be done. */ + if (i == RCU_WAIT_TAIL) + return; + + /* Clean up tail pointers that might have been misordered above. */ + for (j = RCU_WAIT_TAIL; j < i; j++) + rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; + + /* + * Callbacks moved, so clean up the misordered ->tails[] pointers + * that now point into the middle of the list of ready-to-invoke + * callbacks. The overall effect is to copy down the later pointers + * into the gap that was created by the now-ready segments. + */ + for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { + if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) + break; /* No more callbacks. */ + rsclp->tails[j] = rsclp->tails[i]; + rsclp->gp_seq[j] = rsclp->gp_seq[i]; + } +} + +/* + * "Accelerate" callbacks based on more-accurate grace-period information. + * The reason for this is that RCU does not synchronize the beginnings and + * ends of grace periods, and that callbacks are posted locally. This in + * turn means that the callbacks must be labelled conservatively early + * on, as getting exact information would degrade both performance and + * scalability. 
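/*
 * A worked example of rcu_segcblist_advance() above, with invented
 * numbers: suppose the WAIT segment was assigned ->gp_seq = 8, the
 * NEXT_READY segment ->gp_seq = 12, and the counter passed in is
 * seq = 8. The first loop stops at NEXT_READY (ULONG_CMP_LT(8, 12)),
 * so only the WAIT callbacks slide into DONE, after which the later
 * ->tails[] and ->gp_seq[] entries are copied down over the gap.
 * A caller should check for pending callbacks first, since advance()
 * warns when there is nothing to do:
 */
static void demo_advance(struct rcu_segcblist *rsclp, unsigned long seq)
{
	if (rcu_segcblist_pend_cbs(rsclp))
		rcu_segcblist_advance(rsclp, seq);
}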
When more accurate grace-period information becomes + * available, previously posted callbacks can be "accelerated", marking + * them to complete at the end of the earlier grace period. + * + * This function operates on an rcu_segcblist structure, and also the + * grace-period sequence number at which new callbacks would become + * ready to invoke. + */ +static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i; + + WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); + WARN_ON_ONCE(rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)); + + /* + * Find the segment preceding the oldest segment of callbacks + * whose ->gp_seq[] completion is at or after that passed in via + * "seq", skipping any empty segments. This oldest segment, along + * with any later segments, can be merged in with any newly arrived + * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq" + * as their ->gp_seq[] grace-period completion sequence number. + */ + for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) + if (rsclp->tails[i] != rsclp->tails[i - 1] && + ULONG_CMP_LT(rsclp->gp_seq[i], seq)) + break; + + /* + * If all the segments contain callbacks that correspond to + * earlier grace-period sequence numbers than "seq", leave. + * Assuming that the rcu_segcblist structure has enough + * segments in its arrays, this can only happen if some of + * the non-done segments contain callbacks that really are + * ready to invoke. This situation will get straightened + * out by the next call to rcu_segcblist_advance(). + * + * Also advance to the oldest segment of callbacks whose + * ->gp_seq[] completion is at or after that passed in via "seq", + * skipping any empty segments. + */ + if (++i >= RCU_NEXT_TAIL) + return false; + + /* + * Merge all later callbacks, including newly arrived callbacks, + * into the segment located by the for-loop above. Assign "seq" + * as the ->gp_seq[] value in order to correctly handle the case + * where there were no pending callbacks in the rcu_segcblist + * structure other than in the RCU_NEXT_TAIL segment. + */ + for (; i < RCU_NEXT_TAIL; i++) { + rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; + rsclp->gp_seq[i] = seq; + } + return true; +} + +/* + * Scan the specified rcu_segcblist structure for callbacks that need + * a grace period later than the one specified by "seq". We don't look + * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't + * have a grace-period sequence number. + */ +static inline bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i; + + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) + if (rsclp->tails[i - 1] != rsclp->tails[i] && + ULONG_CMP_LT(seq, rsclp->gp_seq[i])) + return true; + return false; +} + +/* + * Interim function to return rcu_segcblist head pointer. Longer term, the + * rcu_segcblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) +{ + return rsclp->head; +} + +/* + * Interim function to return rcu_segcblist head pointer. Longer term, the + * rcu_segcblist will be used more pervasively, removing the need for this + * function. 
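/*
 * How the primitives above combine at grace-period boundaries, modeled
 * on the srcu_gp_start()/srcu_gp_end() changes later in this series
 * (the SRCU state updates and the ->queue_lock that srcu_gp_end()
 * takes around the advance are elided from this sketch):
 */
static void demo_gp_start(struct rcu_segcblist *rsclp, unsigned long *gp_seq)
{
	/* Tag newly arrived callbacks with the GP that will satisfy them... */
	rcu_segcblist_accelerate(rsclp, rcu_seq_snap(gp_seq));
	rcu_seq_start(gp_seq);		/* ...then mark that GP as started. */
}

static void demo_gp_end(struct rcu_segcblist *rsclp, unsigned long *gp_seq)
{
	rcu_seq_end(gp_seq);		/* Mark the GP complete... */
	/* ...and promote the callbacks it satisfied to ready-to-invoke. */
	rcu_segcblist_advance(rsclp, rcu_seq_current(gp_seq));
}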
+ */ +static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); + return rsclp->tails[RCU_NEXT_TAIL]; +} + +#endif /* __KERNEL_RCU_SEGCBLIST_H */ diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 047ac8c28a4e..ad154a7bc114 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -22,7 +22,7 @@ * Lai Jiangshan * * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt + * Documentation/RCU/ *.txt * */ @@ -32,31 +32,20 @@ #include #include #include +#include struct srcu_array { unsigned long lock_count[2]; unsigned long unlock_count[2]; }; -struct rcu_batch { - struct rcu_head *head, **tail; -}; - -#define RCU_BATCH_INIT(name) { NULL, &(name.head) } - struct srcu_struct { unsigned long completed; unsigned long srcu_gp_seq; struct srcu_array __percpu *per_cpu_ref; - spinlock_t queue_lock; /* protect ->batch_queue, ->running */ + spinlock_t queue_lock; /* protect ->srcu_cblist, ->srcu_state */ int srcu_state; - /* callbacks just queued */ - struct rcu_batch batch_queue; - /* callbacks try to do the first check_zero */ - struct rcu_batch batch_check0; - /* callbacks done with the first check_zero and the flip */ - struct rcu_batch batch_check1; - struct rcu_batch batch_done; + struct rcu_segcblist srcu_cblist; struct delayed_work work; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -97,10 +86,7 @@ void process_srcu(struct work_struct *work); .per_cpu_ref = &name##_srcu_array, \ .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ .srcu_state = SRCU_STATE_IDLE, \ - .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ - .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ - .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ - .batch_done = RCU_BATCH_INIT(name.batch_done), \ + .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),\ .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ __SRCU_DEP_MAP_INIT(name) \ } diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 0bc1313c49e2..a943b42a9cf7 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -87,6 +87,12 @@ static inline unsigned long rcu_seq_snap(unsigned long *sp) return s; } +/* Return the current value the update side's sequence number, no ordering. */ +static inline unsigned long rcu_seq_current(unsigned long *sp) +{ + return READ_ONCE(*sp); +} + /* * Given a snapshot from rcu_seq_snap(), determine whether or not a * full update-side operation has occurred. diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h deleted file mode 100644 index 982e3e05b22a..000000000000 --- a/kernel/rcu/rcu_segcblist.h +++ /dev/null @@ -1,670 +0,0 @@ -/* - * RCU segmented callback lists - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright IBM Corporation, 2017 - * - * Authors: Paul E. 
McKenney - */ - -#ifndef __KERNEL_RCU_SEGCBLIST_H -#define __KERNEL_RCU_SEGCBLIST_H - -/* Simple unsegmented callback lists. */ -struct rcu_cblist { - struct rcu_head *head; - struct rcu_head **tail; - long len; - long len_lazy; -}; - -#define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail = &n.head } - -/* Initialize simple callback list. */ -static inline void rcu_cblist_init(struct rcu_cblist *rclp) -{ - rclp->head = NULL; - rclp->tail = &rclp->head; - rclp->len = 0; - rclp->len_lazy = 0; -} - -/* Is simple callback list empty? */ -static inline bool rcu_cblist_empty(struct rcu_cblist *rclp) -{ - return !rclp->head; -} - -/* Return number of callbacks in simple callback list. */ -static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) -{ - return rclp->len; -} - -/* Return number of lazy callbacks in simple callback list. */ -static inline long rcu_cblist_n_lazy_cbs(struct rcu_cblist *rclp) -{ - return rclp->len_lazy; -} - -/* - * Debug function to actually count the number of callbacks. - * If the number exceeds the limit specified, return -1. - */ -static inline long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) -{ - int cnt = 0; - struct rcu_head **rhpp = &rclp->head; - - for (;;) { - if (!*rhpp) - return cnt; - if (++cnt > lim) - return -1; - rhpp = &(*rhpp)->next; - } -} - -/* - * Dequeue the oldest rcu_head structure from the specified callback - * list. This function assumes that the callback is non-lazy, but - * the caller can later invoke rcu_cblist_dequeued_lazy() if it - * finds otherwise (and if it cares about laziness). This allows - * different users to have different ways of determining laziness. - */ -static inline struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) -{ - struct rcu_head *rhp; - - rhp = rclp->head; - if (!rhp) - return NULL; - prefetch(rhp); - rclp->len--; - rclp->head = rhp->next; - if (!rclp->head) - rclp->tail = &rclp->head; - return rhp; -} - -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. - */ -static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) -{ - rclp->len_lazy--; -} - -/* - * Interim function to return rcu_cblist head pointer. Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) -{ - return rclp->head; -} - -/* - * Interim function to return rcu_cblist head pointer. Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) -{ - WARN_ON_ONCE(rcu_cblist_empty(rclp)); - return rclp->tail; -} - -/* Complicated segmented callback lists. ;-) */ - -/* - * Index values for segments in rcu_segcblist structure. - * - * The segments are as follows: - * - * [head, *tails[RCU_DONE_TAIL]): - * Callbacks whose grace period has elapsed, and thus can be invoked. - * [*tails[RCU_DONE_TAIL], *tails[RCU_WAIT_TAIL]): - * Callbacks waiting for the current GP from the current CPU's viewpoint. - * [*tails[RCU_WAIT_TAIL], *tails[RCU_NEXT_READY_TAIL]): - * Callbacks that arrived before the next GP started, again from - * the current CPU's viewpoint. These can be handled by the next GP. - * [*tails[RCU_NEXT_READY_TAIL], *tails[RCU_NEXT_TAIL]): - * Callbacks that might have arrived after the next GP started. 
- * There is some uncertainty as to when a given GP starts and - * ends, but a CPU knows the exact times if it is the one starting - * or ending the GP. Other CPUs know that the previous GP ends - * before the next one starts. - * - * Note that RCU_WAIT_TAIL cannot be empty unless RCU_NEXT_READY_TAIL is also - * empty. - * - * The ->gp_seq[] array contains the grace-period number at which the - * corresponding segment of callbacks will be ready to invoke. A given - * element of this array is meaningful only when the corresponding segment - * is non-empty, and it is never valid for RCU_DONE_TAIL (whose callbacks - * are already ready to invoke) or for RCU_NEXT_TAIL (whose callbacks have - * not yet been assigned a grace-period number). - */ -#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ -#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ -#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ -#define RCU_NEXT_TAIL 3 -#define RCU_CBLIST_NSEGS 4 - -struct rcu_segcblist { - struct rcu_head *head; - struct rcu_head **tails[RCU_CBLIST_NSEGS]; - unsigned long gp_seq[RCU_CBLIST_NSEGS]; - long len; - long len_lazy; -}; - -/* - * Initialize an rcu_segcblist structure. - */ -static inline void rcu_segcblist_init(struct rcu_segcblist *rsclp) -{ - int i; - - BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); - BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); - rsclp->head = NULL; - for (i = 0; i < RCU_CBLIST_NSEGS; i++) - rsclp->tails[i] = &rsclp->head; - rsclp->len = 0; - rsclp->len_lazy = 0; -} - -/* - * Is the specified rcu_segcblist structure empty? - * - * But careful! The fact that the ->head field is NULL does not - * necessarily imply that there are no callbacks associated with - * this structure. When callbacks are being invoked, they are - * removed as a group. If callback invocation must be preempted, - * the remaining callbacks will be added back to the list. Either - * way, the counts are updated later. - * - * So it is often the case that rcu_segcblist_n_cbs() should be used - * instead. - */ -static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) -{ - return !rsclp->head; -} - -/* Return number of callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) -{ - return READ_ONCE(rsclp->len); -} - -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) -{ - return rsclp->len_lazy; -} - -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) -{ - return rsclp->len - rsclp->len_lazy; -} - -/* - * Is the specified rcu_segcblist enabled, for example, not corresponding - * to an offline or callback-offloaded CPU? - */ -static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) -{ - return !!rsclp->tails[RCU_NEXT_TAIL]; -} - -/* - * Disable the specified rcu_segcblist structure, so that callbacks can - * no longer be posted to it. This structure must be empty. - */ -static inline void rcu_segcblist_disable(struct rcu_segcblist *rsclp) -{ - WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); - WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); - WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); - rsclp->tails[RCU_NEXT_TAIL] = NULL; -} - -/* - * Is the specified segment of the specified rcu_segcblist structure - * empty of callbacks? 
- */ -static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) -{ - if (seg == RCU_DONE_TAIL) - return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; - return rsclp->tails[seg - 1] == rsclp->tails[seg]; -} - -/* - * Are all segments following the specified segment of the specified - * rcu_segcblist structure empty of callbacks? (The specified - * segment might well contain callbacks.) - */ -static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) -{ - return !*rsclp->tails[seg]; -} - -/* - * Does the specified rcu_segcblist structure contain callbacks that - * are ready to be invoked? - */ -static inline bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_is_enabled(rsclp) && - &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; -} - -/* - * Does the specified rcu_segcblist structure contain callbacks that - * are still pending, that is, not yet ready to be invoked? - */ -static inline bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_is_enabled(rsclp) && - !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); -} - -/* - * Dequeue and return the first ready-to-invoke callback. If there - * are no ready-to-invoke callbacks, return NULL. Disables interrupts - * to avoid interference. Does not protect from interference from other - * CPUs or tasks. - */ -static inline struct rcu_head * -rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) -{ - unsigned long flags; - int i; - struct rcu_head *rhp; - - local_irq_save(flags); - if (!rcu_segcblist_ready_cbs(rsclp)) { - local_irq_restore(flags); - return NULL; - } - rhp = rsclp->head; - BUG_ON(!rhp); - rsclp->head = rhp->next; - for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { - if (rsclp->tails[i] != &rhp->next) - break; - rsclp->tails[i] = &rsclp->head; - } - smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ - WRITE_ONCE(rsclp->len, rsclp->len - 1); - local_irq_restore(flags); - return rhp; -} - -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. - */ -static inline void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) -{ - unsigned long flags; - - local_irq_save(flags); - rsclp->len_lazy--; - local_irq_restore(flags); -} - -/* - * Return a pointer to the first callback in the specified rcu_segcblist - * structure. This is useful for diagnostics. - */ -static inline struct rcu_head * -rcu_segcblist_first_cb(struct rcu_segcblist *rsclp) -{ - if (rcu_segcblist_is_enabled(rsclp)) - return rsclp->head; - return NULL; -} - -/* - * Return a pointer to the first pending callback in the specified - * rcu_segcblist structure. This is useful just after posting a given - * callback -- if that callback is the first pending callback, then - * you cannot rely on someone else having already started up the required - * grace period. - */ -static inline struct rcu_head * -rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) -{ - if (rcu_segcblist_is_enabled(rsclp)) - return *rsclp->tails[RCU_DONE_TAIL]; - return NULL; -} - -/* - * Does the specified rcu_segcblist structure contain callbacks that - * have not yet been processed beyond having been posted, that is, - * does it contain callbacks in its last segment? 
- */ -static inline bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_is_enabled(rsclp) && - !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); -} - -/* - * Enqueue the specified callback onto the specified rcu_segcblist - * structure, updating accounting as needed. Note that the ->len - * field may be accessed locklessly, hence the WRITE_ONCE(). - * The ->len field is used by rcu_barrier() and friends to determine - * if it must post a callback on this structure, and it is OK - * for rcu_barrier() to sometimes post callbacks needlessly, but - * absolutely not OK for it to ever miss posting a callback. - */ -static inline void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy) -{ - WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ - if (lazy) - rsclp->len_lazy++; - smp_mb(); /* Ensure counts are updated before callback is enqueued. */ - rhp->next = NULL; - *rsclp->tails[RCU_NEXT_TAIL] = rhp; - rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; -} - -/* - * Extract only the counts from the specified rcu_segcblist structure, - * and place them in the specified rcu_cblist structure. This function - * supports both callback orphaning and invocation, hence the separation - * of counts and callbacks. (Callbacks ready for invocation must be - * orphaned and adopted separately from pending callbacks, but counts - * apply to all callbacks. Locking must be used to make sure that - * both orphaned-callbacks lists are consistent.) - */ -static inline void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - rclp->len_lazy += rsclp->len_lazy; - rclp->len += rsclp->len; - rsclp->len_lazy = 0; - WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ -} - -/* - * Extract only those callbacks ready to be invoked from the specified - * rcu_segcblist structure and place them in the specified rcu_cblist - * structure. - */ -static inline void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - int i; - - if (!rcu_segcblist_ready_cbs(rsclp)) - return; /* Nothing to do. */ - *rclp->tail = rsclp->head; - rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; - *rsclp->tails[RCU_DONE_TAIL] = NULL; - rclp->tail = rsclp->tails[RCU_DONE_TAIL]; - for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) - if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) - rsclp->tails[i] = &rsclp->head; -} - -/* - * Extract only those callbacks still pending (not yet ready to be - * invoked) from the specified rcu_segcblist structure and place them in - * the specified rcu_cblist structure. Note that this loses information - * about any callbacks that might have been partway done waiting for - * their grace period. Too bad! They will have to start over. - */ -static inline void -rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - int i; - - if (!rcu_segcblist_pend_cbs(rsclp)) - return; /* Nothing to do. */ - *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; - rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; - *rsclp->tails[RCU_DONE_TAIL] = NULL; - for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) - rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; -} - -/* - * Move the entire contents of the specified rcu_segcblist structure, - * counts, callbacks, and all, to the specified rcu_cblist structure. - * @@@ Why do we need this??? Moving early-boot CBs to NOCB lists? - * @@@ Memory barrier needed? (Not if only used at boot time...) 
- */ -static inline void rcu_segcblist_extract_all(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - rcu_segcblist_extract_done_cbs(rsclp, rclp); - rcu_segcblist_extract_pend_cbs(rsclp, rclp); - rcu_segcblist_extract_count(rsclp, rclp); -} - -/* - * Insert counts from the specified rcu_cblist structure in the - * specified rcu_segcblist structure. - */ -static inline void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - rsclp->len_lazy += rclp->len_lazy; - /* ->len sampled locklessly. */ - WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); - rclp->len_lazy = 0; - rclp->len = 0; -} - -/* - * Move callbacks from the specified rcu_cblist to the beginning of the - * done-callbacks segment of the specified rcu_segcblist. - */ -static inline void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - int i; - - if (!rclp->head) - return; /* No callbacks to move. */ - *rclp->tail = rsclp->head; - rsclp->head = rclp->head; - for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) - if (&rsclp->head == rsclp->tails[i]) - rsclp->tails[i] = rclp->tail; - else - break; - rclp->head = NULL; - rclp->tail = &rclp->head; -} - -/* - * Move callbacks from the specified rcu_cblist to the end of the - * new-callbacks segment of the specified rcu_segcblist. - */ -static inline void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - if (!rclp->head) - return; /* Nothing to do. */ - *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; - rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; - rclp->head = NULL; - rclp->tail = &rclp->head; -} - -/* - * Advance the callbacks in the specified rcu_segcblist structure based - * on the current value passed in for the grace-period counter. - */ -static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i, j; - - WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); - WARN_ON_ONCE(rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)); - - /* - * Find all callbacks whose ->gp_seq numbers indicate that they - * are ready to invoke, and put them into the RCU_DONE_TAIL segment. - */ - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { - if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) - break; - rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; - } - - /* If no callbacks moved, nothing more need be done. */ - if (i == RCU_WAIT_TAIL) - return; - - /* Clean up tail pointers that might have been misordered above. */ - for (j = RCU_WAIT_TAIL; j < i; j++) - rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; - - /* - * Callbacks moved, so clean up the misordered ->tails[] pointers - * that now point into the middle of the list of ready-to-invoke - * callbacks. The overall effect is to copy down the later pointers - * into the gap that was created by the now-ready segments. - */ - for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { - if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) - break; /* No more callbacks. */ - rsclp->tails[j] = rsclp->tails[i]; - rsclp->gp_seq[j] = rsclp->gp_seq[i]; - } -} - -/* - * "Accelerate" callbacks based on more-accurate grace-period information. - * The reason for this is that RCU does not synchronize the beginnings and - * ends of grace periods, and that callbacks are posted locally. This in - * turn means that the callbacks must be labelled conservatively early - * on, as getting exact information would degrade both performance and - * scalability. 
When more accurate grace-period information becomes - * available, previously posted callbacks can be "accelerated", marking - * them to complete at the end of the earlier grace period. - * - * This function operates on an rcu_segcblist structure, and also the - * grace-period sequence number at which new callbacks would become - * ready to invoke. - */ -static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i; - - WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); - WARN_ON_ONCE(rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)); - - /* - * Find the segment preceding the oldest segment of callbacks - * whose ->gp_seq[] completion is at or after that passed in via - * "seq", skipping any empty segments. This oldest segment, along - * with any later segments, can be merged in with any newly arrived - * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq" - * as their ->gp_seq[] grace-period completion sequence number. - */ - for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) - if (rsclp->tails[i] != rsclp->tails[i - 1] && - ULONG_CMP_LT(rsclp->gp_seq[i], seq)) - break; - - /* - * If all the segments contain callbacks that correspond to - * earlier grace-period sequence numbers than "seq", leave. - * Assuming that the rcu_segcblist structure has enough - * segments in its arrays, this can only happen if some of - * the non-done segments contain callbacks that really are - * ready to invoke. This situation will get straightened - * out by the next call to rcu_segcblist_advance(). - * - * Also advance to the oldest segment of callbacks whose - * ->gp_seq[] completion is at or after that passed in via "seq", - * skipping any empty segments. - */ - if (++i >= RCU_NEXT_TAIL) - return false; - - /* - * Merge all later callbacks, including newly arrived callbacks, - * into the segment located by the for-loop above. Assign "seq" - * as the ->gp_seq[] value in order to correctly handle the case - * where there were no pending callbacks in the rcu_segcblist - * structure other than in the RCU_NEXT_TAIL segment. - */ - for (; i < RCU_NEXT_TAIL; i++) { - rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; - rsclp->gp_seq[i] = seq; - } - return true; -} - -/* - * Scan the specified rcu_segcblist structure for callbacks that need - * a grace period later than the one specified by "seq". We don't look - * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't - * have a grace-period sequence number. - */ -static inline bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i; - - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) - if (rsclp->tails[i - 1] != rsclp->tails[i] && - ULONG_CMP_LT(seq, rsclp->gp_seq[i])) - return true; - return false; -} - -/* - * Interim function to return rcu_segcblist head pointer. Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) -{ - return rsclp->head; -} - -/* - * Interim function to return rcu_segcblist head pointer. Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. 
- */ -static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) -{ - WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); - return rsclp->tails[RCU_NEXT_TAIL]; -} - -#endif /* __KERNEL_RCU_SEGCBLIST_H */ diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index d464986d82b6..56fd30862122 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -22,7 +22,7 @@ * Lai Jiangshan * * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt + * Documentation/RCU/ *.txt * */ @@ -38,85 +38,13 @@ #include "rcu.h" -/* - * Initialize an rcu_batch structure to empty. - */ -static inline void rcu_batch_init(struct rcu_batch *b) -{ - b->head = NULL; - b->tail = &b->head; -} - -/* - * Enqueue a callback onto the tail of the specified rcu_batch structure. - */ -static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) -{ - *b->tail = head; - b->tail = &head->next; -} - -/* - * Is the specified rcu_batch structure empty? - */ -static inline bool rcu_batch_empty(struct rcu_batch *b) -{ - return b->tail == &b->head; -} - -/* - * Are all batches empty for the specified srcu_struct? - */ -static inline bool rcu_all_batches_empty(struct srcu_struct *sp) -{ - return rcu_batch_empty(&sp->batch_done) && - rcu_batch_empty(&sp->batch_check1) && - rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_queue); -} - -/* - * Remove the callback at the head of the specified rcu_batch structure - * and return a pointer to it, or return NULL if the structure is empty. - */ -static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) -{ - struct rcu_head *head; - - if (rcu_batch_empty(b)) - return NULL; - - head = b->head; - b->head = head->next; - if (b->tail == &head->next) - rcu_batch_init(b); - - return head; -} - -/* - * Move all callbacks from the rcu_batch structure specified by "from" to - * the structure specified by "to". - */ -static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) -{ - if (!rcu_batch_empty(from)) { - *to->tail = from->head; - to->tail = from->tail; - rcu_batch_init(from); - } -} - static int init_srcu_struct_fields(struct srcu_struct *sp) { sp->completed = 0; sp->srcu_gp_seq = 0; spin_lock_init(&sp->queue_lock); sp->srcu_state = SRCU_STATE_IDLE; - rcu_batch_init(&sp->batch_queue); - rcu_batch_init(&sp->batch_check0); - rcu_batch_init(&sp->batch_check1); - rcu_batch_init(&sp->batch_done); + rcu_segcblist_init(&sp->srcu_cblist); INIT_DELAYED_WORK(&sp->work, process_srcu); sp->per_cpu_ref = alloc_percpu(struct srcu_array); return sp->per_cpu_ref ? 0 : -ENOMEM; @@ -268,7 +196,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp) { if (WARN_ON(srcu_readers_active(sp))) return; /* Leakage unless caller handles error. */ - if (WARN_ON(!rcu_all_batches_empty(sp))) + if (WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist))) return; /* Leakage unless caller handles error. 
*/ flush_delayed_work(&sp->work); if (WARN_ON(READ_ONCE(sp->srcu_state) != SRCU_STATE_IDLE)) @@ -324,6 +252,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); */ static void srcu_gp_start(struct srcu_struct *sp) { + rcu_segcblist_accelerate(&sp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN1); rcu_seq_start(&sp->srcu_gp_seq); } @@ -371,6 +301,11 @@ static void srcu_gp_end(struct srcu_struct *sp) { rcu_seq_end(&sp->srcu_gp_seq); WRITE_ONCE(sp->srcu_state, SRCU_STATE_DONE); + + spin_lock_irq(&sp->queue_lock); + rcu_segcblist_advance(&sp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + spin_unlock_irq(&sp->queue_lock); } /* @@ -409,7 +344,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, head->func = func; spin_lock_irqsave(&sp->queue_lock, flags); smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ - rcu_batch_queue(&sp->batch_queue, head); + rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); if (READ_ONCE(sp->srcu_state) == SRCU_STATE_IDLE) { srcu_gp_start(sp); queue_delayed_work(system_power_efficient_wq, &sp->work, 0); @@ -445,13 +380,13 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ if (READ_ONCE(sp->srcu_state) == SRCU_STATE_IDLE) { /* steal the processing owner */ + rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); srcu_gp_start(sp); - rcu_batch_queue(&sp->batch_check0, head); spin_unlock_irq(&sp->queue_lock); /* give the processing owner to work_struct */ srcu_reschedule(sp, 0); } else { - rcu_batch_queue(&sp->batch_queue, head); + rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); spin_unlock_irq(&sp->queue_lock); } @@ -548,19 +483,6 @@ EXPORT_SYMBOL_GPL(srcu_batches_completed); #define SRCU_CALLBACK_BATCH 10 #define SRCU_INTERVAL 1 -/* - * Move any new SRCU callbacks to the first stage of the SRCU grace - * period pipeline. - */ -static void srcu_collect_new(struct srcu_struct *sp) -{ - if (!rcu_batch_empty(&sp->batch_queue)) { - spin_lock_irq(&sp->queue_lock); - rcu_batch_move(&sp->batch_check0, &sp->batch_queue); - spin_unlock_irq(&sp->queue_lock); - } -} - /* * Core SRCU state machine. Advance callbacks from ->batch_check0 to * ->batch_check1 and then to ->batch_done as readers drain. @@ -586,26 +508,7 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) idx = 1 ^ (sp->completed & 1); if (!try_check_zero(sp, idx, trycount)) return; /* readers present, retry after SRCU_INTERVAL */ - - /* - * The callbacks in ->batch_check1 have already done - * with their first zero check and flip back when they were - * enqueued on ->batch_check0 in a previous invocation of - * srcu_advance_batches(). (Presumably try_check_zero() - * returned false during that invocation, leaving the - * callbacks stranded on ->batch_check1.) They are therefore - * ready to invoke, so move them to ->batch_done. - */ - rcu_batch_move(&sp->batch_done, &sp->batch_check1); srcu_flip(sp); - - /* - * The callbacks in ->batch_check0 just finished their - * first check zero and flip, so move them to ->batch_check1 - * for future checking on the other idx. - */ - rcu_batch_move(&sp->batch_check1, &sp->batch_check0); - WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN2); } @@ -619,14 +522,6 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) trycount = trycount < 2 ? 
2 : trycount; if (!try_check_zero(sp, idx, trycount)) return; /* readers present, retry after SRCU_INTERVAL */ - - /* - * The callbacks in ->batch_check1 have now waited for - * all pre-existing readers using both idx values. They are - * therefore ready to invoke, so move them to ->batch_done. - */ - rcu_batch_move(&sp->batch_done, &sp->batch_check1); - srcu_gp_end(sp); } } @@ -639,17 +534,26 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) */ static void srcu_invoke_callbacks(struct srcu_struct *sp) { - int i; - struct rcu_head *head; + struct rcu_cblist ready_cbs; + struct rcu_head *rhp; - for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { - head = rcu_batch_dequeue(&sp->batch_done); - if (!head) - break; + spin_lock_irq(&sp->queue_lock); + if (!rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { + spin_unlock_irq(&sp->queue_lock); + return; + } + rcu_cblist_init(&ready_cbs); + rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); + spin_unlock_irq(&sp->queue_lock); + rhp = rcu_cblist_dequeue(&ready_cbs); + for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { local_bh_disable(); - head->func(head); + rhp->func(rhp); local_bh_enable(); } + spin_lock_irq(&sp->queue_lock); + rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); + spin_unlock_irq(&sp->queue_lock); } /* @@ -660,9 +564,9 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) { bool pending = true; - if (rcu_all_batches_empty(sp)) { + if (rcu_segcblist_empty(&sp->srcu_cblist)) { spin_lock_irq(&sp->queue_lock); - if (rcu_all_batches_empty(sp) && + if (rcu_segcblist_empty(&sp->srcu_cblist) && READ_ONCE(sp->srcu_state) == SRCU_STATE_DONE) { WRITE_ONCE(sp->srcu_state, SRCU_STATE_IDLE); pending = false; @@ -683,7 +587,6 @@ void process_srcu(struct work_struct *work) sp = container_of(work, struct srcu_struct, work.work); - srcu_collect_new(sp); srcu_advance_batches(sp, 1); srcu_invoke_callbacks(sp); srcu_reschedule(sp, SRCU_INTERVAL); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 93889ff21dbb..4f62651588ea 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -30,7 +30,7 @@ #include #include #include -#include "rcu_segcblist.h" +#include /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and -- cgit v1.2.3 From f2425b4efb0c69e77c0b9666b605ae4a1ecaae47 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Mar 2017 12:42:30 -0700 Subject: srcu: Move combining-tree definitions for SRCU's benefit This commit moves the C preprocessor code that defines the default shape of the rcu_node combining tree to a new include/linux/rcu_node_tree.h file as a first step towards enabling SRCU to create its own combining tree, which in turn enables SRCU to implement per-CPU callback handling, thus avoiding contention on the lock currently guarding the single list of callbacks. Note that users of SRCU still need to know the size of the srcu_struct structure, hence include/linux rather than kernel/rcu. This commit is code-movement only. Signed-off-by: Paul E. 
McKenney --- include/linux/rcu_node_tree.h | 102 ++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/tree.h | 71 +---------------------------- 2 files changed, 103 insertions(+), 70 deletions(-) create mode 100644 include/linux/rcu_node_tree.h (limited to 'include/linux') diff --git a/include/linux/rcu_node_tree.h b/include/linux/rcu_node_tree.h new file mode 100644 index 000000000000..b7eb97096b1c --- /dev/null +++ b/include/linux/rcu_node_tree.h @@ -0,0 +1,102 @@ +/* + * RCU node combining tree definitions. These are used to compute + * global attributes while avoiding common-case global contention. A key + * property that these computations rely on is a tournament-style approach + * where only one of the tasks contending a lower level in the tree need + * advance to the next higher level. If properly configured, this allows + * unlimited scalability while maintaining a constant level of contention + * on the root node. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2017 + * + * Author: Paul E. McKenney + */ + +#ifndef __LINUX_RCU_NODE_TREE_H +#define __LINUX_RCU_NODE_TREE_H + +/* + * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and + * CONFIG_RCU_FANOUT_LEAF. + * In theory, it should be possible to add more levels straightforwardly. + * In practice, this did work well going from three levels to four. + * Of course, your mileage may vary. 
+ */ + +#ifdef CONFIG_RCU_FANOUT +#define RCU_FANOUT CONFIG_RCU_FANOUT +#else /* #ifdef CONFIG_RCU_FANOUT */ +# ifdef CONFIG_64BIT +# define RCU_FANOUT 64 +# else +# define RCU_FANOUT 32 +# endif +#endif /* #else #ifdef CONFIG_RCU_FANOUT */ + +#ifdef CONFIG_RCU_FANOUT_LEAF +#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF +#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ +#define RCU_FANOUT_LEAF 16 +#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ + +#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) +#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) +#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) +#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) + +#if NR_CPUS <= RCU_FANOUT_1 +# define RCU_NUM_LVLS 1 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_NODES NUM_RCU_LVL_0 +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } +# define RCU_NODE_NAME_INIT { "rcu_node_0" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } +#elif NR_CPUS <= RCU_FANOUT_2 +# define RCU_NUM_LVLS 2 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } +#elif NR_CPUS <= RCU_FANOUT_3 +# define RCU_NUM_LVLS 3 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } +#elif NR_CPUS <= RCU_FANOUT_4 +# define RCU_NUM_LVLS 4 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } +#else +# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" +#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ + +extern int rcu_num_lvls; +extern int rcu_num_nodes; + +#endif /* __LINUX_RCU_NODE_TREE_H */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 4f62651588ea..1bec3958d44f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -31,76 +31,7 @@ #include #include #include - -/* - * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and - * CONFIG_RCU_FANOUT_LEAF. - * In theory, it should be possible to add more levels straightforwardly. - * In practice, this did work well going from three levels to four. - * Of course, your mileage may vary. 
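/*
 * A worked example of the fanout ladder above, assuming the default
 * RCU_FANOUT_LEAF = 16 and RCU_FANOUT = 64 on a 64-bit build with
 * NR_CPUS = 1024:
 *
 *	RCU_FANOUT_1 = 16, RCU_FANOUT_2 = 16 * 64 = 1024
 *	NR_CPUS <= RCU_FANOUT_2, so RCU_NUM_LVLS = 2
 *	NUM_RCU_LVL_1 = DIV_ROUND_UP(1024, 16) = 64 leaf rcu_node structures
 *	NUM_RCU_NODES = 1 + 64 = 65
 *
 * One root rcu_node thus fans out to 64 leaves covering 16 CPUs each,
 * so contention on any single node's lock is bounded by the fanout
 * rather than by NR_CPUS.
 */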
- */ - -#ifdef CONFIG_RCU_FANOUT -#define RCU_FANOUT CONFIG_RCU_FANOUT -#else /* #ifdef CONFIG_RCU_FANOUT */ -# ifdef CONFIG_64BIT -# define RCU_FANOUT 64 -# else -# define RCU_FANOUT 32 -# endif -#endif /* #else #ifdef CONFIG_RCU_FANOUT */ - -#ifdef CONFIG_RCU_FANOUT_LEAF -#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF -#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ -#define RCU_FANOUT_LEAF 16 -#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ - -#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) -#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) -#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) -#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) - -#if NR_CPUS <= RCU_FANOUT_1 -# define RCU_NUM_LVLS 1 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_NODES NUM_RCU_LVL_0 -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } -# define RCU_NODE_NAME_INIT { "rcu_node_0" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } -#elif NR_CPUS <= RCU_FANOUT_2 -# define RCU_NUM_LVLS 2 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } -#elif NR_CPUS <= RCU_FANOUT_3 -# define RCU_NUM_LVLS 3 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } -#elif NR_CPUS <= RCU_FANOUT_4 -# define RCU_NUM_LVLS 4 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) -# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } -#else -# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" -#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ - -extern int rcu_num_lvls; -extern int rcu_num_nodes; +#include <linux/rcu_node_tree.h> /* * Dynticks per-CPU state. -- cgit v1.2.3 From 2b34c43cc1671c59bad6dd1682ae3ee4f0919eb7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Mar 2017 14:29:53 -0700 Subject: srcu: Move rcu_init_levelspread() to rcu_node_tree.h This commit moves the rcu_init_levelspread() function from kernel/rcu/tree.c to kernel/rcu/rcu.h so that SRCU can access it. This is another step towards enabling SRCU to create its own combining tree. This commit is code-movement only, give or take knock-on adjustments. Signed-off-by: Paul E.
McKenney --- include/linux/rcu_node_tree.h | 3 --- kernel/rcu/rcu.h | 36 ++++++++++++++++++++++++++++++++++++ kernel/rcu/srcu.c | 1 + kernel/rcu/tree.c | 25 ------------------------- kernel/rcu/tree_trace.c | 1 + 5 files changed, 38 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcu_node_tree.h b/include/linux/rcu_node_tree.h index b7eb97096b1c..4b766b61e1a0 100644 --- a/include/linux/rcu_node_tree.h +++ b/include/linux/rcu_node_tree.h @@ -96,7 +96,4 @@ # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ -extern int rcu_num_lvls; -extern int rcu_num_nodes; - #endif /* __LINUX_RCU_NODE_TREE_H */ diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index a943b42a9cf7..87326479b39a 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -190,4 +190,40 @@ void rcu_test_sync_prims(void); */ extern void resched_cpu(int cpu); +#if defined(SRCU) || !defined(TINY_RCU) + +#include + +extern int rcu_num_lvls; +extern int rcu_num_nodes; +static bool rcu_fanout_exact; +static int rcu_fanout_leaf; + +/* + * Compute the per-level fanout, either using the exact fanout specified + * or balancing the tree, depending on the rcu_fanout_exact boot parameter. + */ +static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) +{ + int i; + + if (rcu_fanout_exact) { + levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; + for (i = rcu_num_lvls - 2; i >= 0; i--) + levelspread[i] = RCU_FANOUT; + } else { + int ccur; + int cprv; + + cprv = nr_cpu_ids; + for (i = rcu_num_lvls - 1; i >= 0; i--) { + ccur = levelcnt[i]; + levelspread[i] = (cprv + ccur - 1) / ccur; + cprv = ccur; + } + } +} + +#endif /* #if defined(SRCU) || !defined(TINY_RCU) */ + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 56fd30862122..0b511de7ca4d 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -36,6 +36,7 @@ #include #include +#include #include "rcu.h" static int init_srcu_struct_fields(struct srcu_struct *sp) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 844a030c1960..df3527744af8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3951,31 +3951,6 @@ void rcu_scheduler_starting(void) rcu_test_sync_prims(); } -/* - * Compute the per-level fanout, either using the exact fanout specified - * or balancing the tree, depending on the rcu_fanout_exact boot parameter. - */ -static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) -{ - int i; - - if (rcu_fanout_exact) { - levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; - for (i = rcu_num_lvls - 2; i >= 0; i--) - levelspread[i] = RCU_FANOUT; - } else { - int ccur; - int cprv; - - cprv = nr_cpu_ids; - for (i = rcu_num_lvls - 1; i >= 0; i--) { - ccur = levelcnt[i]; - levelspread[i] = (cprv + ccur - 1) / ccur; - cprv = ccur; - } - } -} - /* * Helper function for rcu_init() that initializes one rcu_state structure. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 066c64071a7b..30c5bf89ee58 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -45,6 +45,7 @@ #define RCU_TREE_NONCORE #include "tree.h" +#include "rcu.h" static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) -- cgit v1.2.3 From 80a7956fe36c2ee40c6ff12c77926d267802b7c8 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 22 Mar 2017 15:26:18 -0700 Subject: srcu: Merge ->srcu_state into ->srcu_gp_seq Updating ->srcu_state and ->srcu_gp_seq will lead to extremely complex race conditions given multiple callback queues, so this commit takes advantage of the two-bit state now available in rcu_seq counters to store the state in the bottom two bits of ->srcu_gp_seq. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 5 +---- kernel/rcu/rcu.h | 10 ++++++++++ kernel/rcu/srcu.c | 55 +++++++++++++++++++++++++++++++++------------------- 3 files changed, 46 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index ad154a7bc114..e7dbc01b61a1 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -43,8 +43,7 @@ struct srcu_struct { unsigned long completed; unsigned long srcu_gp_seq; struct srcu_array __percpu *per_cpu_ref; - spinlock_t queue_lock; /* protect ->srcu_cblist, ->srcu_state */ - int srcu_state; + spinlock_t queue_lock; /* protect ->srcu_cblist */ struct rcu_segcblist srcu_cblist; struct delayed_work work; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -56,7 +55,6 @@ struct srcu_struct { #define SRCU_STATE_IDLE 0 #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 -#define SRCU_STATE_DONE 3 #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -85,7 +83,6 @@ void process_srcu(struct work_struct *work); .completed = -300, \ .per_cpu_ref = &name##_srcu_array, \ .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ - .srcu_state = SRCU_STATE_IDLE, \ .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),\ .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ __SRCU_DEP_MAP_INIT(name) \ diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 87a0ac95b551..73e16ec4054b 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -82,6 +82,16 @@ static inline int rcu_seq_state(unsigned long s) return s & RCU_SEQ_STATE_MASK; } +/* + * Set the state portion of the pointed-to sequence number. + * The caller is responsible for preventing conflicting updates. + */ +static inline void rcu_seq_set_state(unsigned long *sp, int newstate) +{ + WARN_ON_ONCE(newstate & ~RCU_SEQ_STATE_MASK); + WRITE_ONCE(*sp, (*sp & ~RCU_SEQ_STATE_MASK) + newstate); +} + /* Adjust sequence number for start of update-side operation. */ static inline void rcu_seq_start(unsigned long *sp) { diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 1a2dc74bb625..90ffea31b188 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -44,7 +44,6 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) sp->completed = 0; sp->srcu_gp_seq = 0; spin_lock_init(&sp->queue_lock); - sp->srcu_state = SRCU_STATE_IDLE; rcu_segcblist_init(&sp->srcu_cblist); INIT_DELAYED_WORK(&sp->work, process_srcu); sp->per_cpu_ref = alloc_percpu(struct srcu_array); @@ -180,6 +179,9 @@ static bool srcu_readers_active(struct srcu_struct *sp) return sum; } +#define SRCU_CALLBACK_BATCH 10 +#define SRCU_INTERVAL 1 + /** * cleanup_srcu_struct - deconstruct a sleep-RCU structure * @sp: structure to clean up. @@ -200,8 +202,10 @@ void cleanup_srcu_struct(struct srcu_struct *sp) if (WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist))) return; /* Leakage unless caller handles error. 
*/ flush_delayed_work(&sp->work); - if (WARN_ON(READ_ONCE(sp->srcu_state) != SRCU_STATE_IDLE)) + if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE)) { + pr_info("cleanup_srcu_struct: Active srcu_struct %lu CBs %c state: %d\n", rcu_segcblist_n_cbs(&sp->srcu_cblist), ".E"[rcu_segcblist_empty(&sp->srcu_cblist)], rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); return; /* Caller forgot to stop doing call_srcu()? */ + } free_percpu(sp->per_cpu_ref); sp->per_cpu_ref = NULL; } @@ -253,10 +257,13 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); */ static void srcu_gp_start(struct srcu_struct *sp) { + int state; + rcu_segcblist_accelerate(&sp->srcu_cblist, rcu_seq_snap(&sp->srcu_gp_seq)); - WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN1); rcu_seq_start(&sp->srcu_gp_seq); + state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + WARN_ON_ONCE(state != SRCU_STATE_SCAN1); } /* @@ -300,7 +307,6 @@ static void srcu_flip(struct srcu_struct *sp) static void srcu_gp_end(struct srcu_struct *sp) { rcu_seq_end(&sp->srcu_gp_seq); - WRITE_ONCE(sp->srcu_state, SRCU_STATE_DONE); spin_lock_irq(&sp->queue_lock); rcu_segcblist_advance(&sp->srcu_cblist, @@ -345,7 +351,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, spin_lock_irqsave(&sp->queue_lock, flags); smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); - if (READ_ONCE(sp->srcu_state) == SRCU_STATE_IDLE) { + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_IDLE) { srcu_gp_start(sp); queue_delayed_work(system_power_efficient_wq, &sp->work, 0); } @@ -378,7 +384,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) head->func = wakeme_after_rcu; spin_lock_irq(&sp->queue_lock); smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ - if (READ_ONCE(sp->srcu_state) == SRCU_STATE_IDLE) { + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_IDLE) { /* steal the processing owner */ rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); srcu_gp_start(sp); @@ -480,9 +486,6 @@ unsigned long srcu_batches_completed(struct srcu_struct *sp) } EXPORT_SYMBOL_GPL(srcu_batches_completed); -#define SRCU_CALLBACK_BATCH 10 -#define SRCU_INTERVAL 1 - /* * Core SRCU state machine. Advance callbacks from ->batch_check0 to * ->batch_check1 and then to ->batch_done as readers drain. @@ -491,28 +494,40 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) { int idx; - WARN_ON_ONCE(sp->srcu_state == SRCU_STATE_IDLE); - /* * Because readers might be delayed for an extended period after * fetching ->completed for their index, at any point in time there * might well be readers using both idx=0 and idx=1. We therefore * need to wait for readers to clear from both index values before * invoking a callback. + * + * The load-acquire ensures that we see the accesses performed + * by the prior grace period. */ + idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ + if (idx == SRCU_STATE_IDLE) { + spin_lock_irq(&sp->queue_lock); + if (rcu_segcblist_empty(&sp->srcu_cblist)) { + spin_unlock_irq(&sp->queue_lock); + return; + } + idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + if (idx == SRCU_STATE_IDLE) + srcu_gp_start(sp); + spin_unlock_irq(&sp->queue_lock); + if (idx != SRCU_STATE_IDLE) + return; /* Someone else started the grace period. 
*/ + } - if (sp->srcu_state == SRCU_STATE_DONE) - srcu_gp_start(sp); - - if (sp->srcu_state == SRCU_STATE_SCAN1) { + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { idx = 1 ^ (sp->completed & 1); if (!try_check_zero(sp, idx, trycount)) return; /* readers present, retry after SRCU_INTERVAL */ srcu_flip(sp); - WRITE_ONCE(sp->srcu_state, SRCU_STATE_SCAN2); + rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2); } - if (sp->srcu_state == SRCU_STATE_SCAN2) { + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { /* * SRCU read-side critical sections are normally short, @@ -563,14 +578,14 @@ static void srcu_invoke_callbacks(struct srcu_struct *sp) static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) { bool pending = true; + int state; if (rcu_segcblist_empty(&sp->srcu_cblist)) { spin_lock_irq(&sp->queue_lock); + state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); if (rcu_segcblist_empty(&sp->srcu_cblist) && - READ_ONCE(sp->srcu_state) == SRCU_STATE_DONE) { - WRITE_ONCE(sp->srcu_state, SRCU_STATE_IDLE); + state == SRCU_STATE_IDLE) pending = false; - } spin_unlock_irq(&sp->queue_lock); } -- cgit v1.2.3 From f60d231a87c5c9f23f10e69996f396d46f5bf901 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Mar 2017 13:46:33 -0700 Subject: srcu: Crude control of expedited grace periods SRCU's implementation of expedited grace periods has always assumed that the SRCU instance is idle when the expedited request arrives. This commit improves this a bit by maintaining a count of the number of outstanding expedited requests, thus allowing prior non-expedited grace periods accommodate these requests by shifting to expedited mode. However, any non-expedited wait already in progress will still wait for the full duration. Improved control of expedited grace periods is planned, but one step at a time. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 1 + kernel/rcu/srcu.c | 84 ++++++++++++++++++++++++++++------------------------ 2 files changed, 47 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index e7dbc01b61a1..73a1b6296224 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -42,6 +42,7 @@ struct srcu_array { struct srcu_struct { unsigned long completed; unsigned long srcu_gp_seq; + atomic_t srcu_exp_cnt; struct srcu_array __percpu *per_cpu_ref; spinlock_t queue_lock; /* protect ->srcu_cblist */ struct rcu_segcblist srcu_cblist; diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 90ffea31b188..3cfcc59bddf3 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -43,6 +43,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) { sp->completed = 0; sp->srcu_gp_seq = 0; + atomic_set(&sp->srcu_exp_cnt, 0); spin_lock_init(&sp->queue_lock); rcu_segcblist_init(&sp->srcu_cblist); INIT_DELAYED_WORK(&sp->work, process_srcu); @@ -179,7 +180,6 @@ static bool srcu_readers_active(struct srcu_struct *sp) return sum; } -#define SRCU_CALLBACK_BATCH 10 #define SRCU_INTERVAL 1 /** @@ -197,6 +197,7 @@ static bool srcu_readers_active(struct srcu_struct *sp) */ void cleanup_srcu_struct(struct srcu_struct *sp) { + WARN_ON_ONCE(atomic_read(&sp->srcu_exp_cnt)); if (WARN_ON(srcu_readers_active(sp))) return; /* Leakage unless caller handles error. */ if (WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist))) @@ -244,13 +245,10 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); * We use an adaptive strategy for synchronize_srcu() and especially for * synchronize_srcu_expedited(). 
We spin for a fixed time period * (defined below) to allow SRCU readers to exit their read-side critical - * sections. If there are still some readers after 10 microseconds, - * we repeatedly block for 1-millisecond time periods. This approach - * has done well in testing, so there is no need for a config parameter. + * sections. If there are still some readers after a few microseconds, + * we repeatedly block for 1-millisecond time periods. */ #define SRCU_RETRY_CHECK_DELAY 5 -#define SYNCHRONIZE_SRCU_TRYCOUNT 2 -#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 /* * Start an SRCU grace period. @@ -267,16 +265,16 @@ static void srcu_gp_start(struct srcu_struct *sp) } /* - * Wait until all readers counted by array index idx complete, but loop - * a maximum of trycount times. The caller must ensure that ->completed - * is not changed while checking. + * Wait until all readers counted by array index idx complete, but + * loop an additional time if there is an expedited grace period pending. + * The caller must ensure that ->completed is not changed while checking. */ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) { for (;;) { if (srcu_readers_active_idx_check(sp, idx)) return true; - if (--trycount <= 0) + if (--trycount + !!atomic_read(&sp->srcu_exp_cnt) <= 0) return false; udelay(SRCU_RETRY_CHECK_DELAY); } @@ -364,7 +362,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); /* * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). */ -static void __synchronize_srcu(struct srcu_struct *sp, int trycount) +static void __synchronize_srcu(struct srcu_struct *sp) { struct rcu_synchronize rcu; struct rcu_head *head = &rcu.head; @@ -400,6 +398,32 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) smp_mb(); /* Caller's later accesses after GP. */ } +/** + * synchronize_srcu_expedited - Brute-force SRCU grace period + * @sp: srcu_struct with which to synchronize. + * + * Wait for an SRCU grace period to elapse, but be more aggressive about + * spinning rather than blocking when waiting. + * + * Note that synchronize_srcu_expedited() has the same deadlock and + * memory-ordering properties as does synchronize_srcu(). + */ +void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + bool do_norm = rcu_gp_is_normal(); + + if (!do_norm) { + atomic_inc(&sp->srcu_exp_cnt); + smp_mb__after_atomic(); /* increment before GP. */ + } + __synchronize_srcu(sp); + if (!do_norm) { + smp_mb__before_atomic(); /* GP before decrement. */ + atomic_dec(&sp->srcu_exp_cnt); + } +} +EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); + /** * synchronize_srcu - wait for prior SRCU read-side critical-section completion * @sp: srcu_struct with which to synchronize. @@ -441,28 +465,13 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) */ void synchronize_srcu(struct srcu_struct *sp) { - __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal()) - ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT - : SYNCHRONIZE_SRCU_TRYCOUNT); + if (rcu_gp_is_expedited()) + synchronize_srcu_expedited(sp); + else + __synchronize_srcu(sp); } EXPORT_SYMBOL_GPL(synchronize_srcu); -/** - * synchronize_srcu_expedited - Brute-force SRCU grace period - * @sp: srcu_struct with which to synchronize. - * - * Wait for an SRCU grace period to elapse, but be more aggressive about - * spinning rather than blocking when waiting. - * - * Note that synchronize_srcu_expedited() has the same deadlock and - * memory-ordering properties as does synchronize_srcu(). 
- */ -void synchronize_srcu_expedited(struct srcu_struct *sp) -{ - __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); -} -EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); - /** * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. * @sp: srcu_struct on which to wait for in-flight callbacks. @@ -490,7 +499,7 @@ EXPORT_SYMBOL_GPL(srcu_batches_completed); * Core SRCU state machine. Advance callbacks from ->batch_check0 to * ->batch_check1 and then to ->batch_done as readers drain. */ -static void srcu_advance_batches(struct srcu_struct *sp, int trycount) +static void srcu_advance_batches(struct srcu_struct *sp) { int idx; @@ -521,8 +530,8 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { idx = 1 ^ (sp->completed & 1); - if (!try_check_zero(sp, idx, trycount)) - return; /* readers present, retry after SRCU_INTERVAL */ + if (!try_check_zero(sp, idx, 1)) + return; /* readers present, retry later. */ srcu_flip(sp); rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2); } @@ -534,9 +543,8 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) * so check at least twice in quick succession after a flip. */ idx = 1 ^ (sp->completed & 1); - trycount = trycount < 2 ? 2 : trycount; - if (!try_check_zero(sp, idx, trycount)) - return; /* readers present, retry after SRCU_INTERVAL */ + if (!try_check_zero(sp, idx, 2)) + return; /* readers present, retry after later. */ srcu_gp_end(sp); } } @@ -602,8 +610,8 @@ void process_srcu(struct work_struct *work) sp = container_of(work, struct srcu_struct, work.work); - srcu_advance_batches(sp, 1); + srcu_advance_batches(sp); srcu_invoke_callbacks(sp); - srcu_reschedule(sp, SRCU_INTERVAL); + srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) ? 0 : SRCU_INTERVAL); } EXPORT_SYMBOL_GPL(process_srcu); -- cgit v1.2.3 From d8be81735aa89413b333de488251f0e64e2be591 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 25 Mar 2017 09:59:38 -0700 Subject: srcu: Create a tiny SRCU In response to automated complaints about modifications to SRCU increasing its size, this commit creates a tiny SRCU that is used in SMP=n && PREEMPT=n builds. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 69 ++------------- include/linux/srcutiny.h | 81 ++++++++++++++++++ include/linux/srcutree.h | 91 ++++++++++++++++++++ init/Kconfig | 12 +++ kernel/rcu/Makefile | 3 +- kernel/rcu/rcutorture.c | 2 + kernel/rcu/srcutiny.c | 215 +++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 411 insertions(+), 62 deletions(-) create mode 100644 include/linux/srcutiny.h create mode 100644 include/linux/srcutree.h create mode 100644 kernel/rcu/srcutiny.c (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 73a1b6296224..907f09b14eda 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -34,28 +34,7 @@ #include #include -struct srcu_array { - unsigned long lock_count[2]; - unsigned long unlock_count[2]; -}; - -struct srcu_struct { - unsigned long completed; - unsigned long srcu_gp_seq; - atomic_t srcu_exp_cnt; - struct srcu_array __percpu *per_cpu_ref; - spinlock_t queue_lock; /* protect ->srcu_cblist */ - struct rcu_segcblist srcu_cblist; - struct delayed_work work; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -}; - -/* Values for -> state variable. 
*/ -#define SRCU_STATE_IDLE 0 -#define SRCU_STATE_SCAN1 1 -#define SRCU_STATE_SCAN2 2 +struct srcu_struct; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -77,42 +56,13 @@ int init_srcu_struct(struct srcu_struct *sp); #define __SRCU_DEP_MAP_INIT(srcu_name) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -void process_srcu(struct work_struct *work); - -#define __SRCU_STRUCT_INIT(name) \ - { \ - .completed = -300, \ - .per_cpu_ref = &name##_srcu_array, \ - .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ - .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),\ - .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ - __SRCU_DEP_MAP_INIT(name) \ - } - -/* - * Define and initialize a srcu struct at build time. - * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. - * - * Note that although DEFINE_STATIC_SRCU() hides the name from other - * files, the per-CPU variable rules nevertheless require that the - * chosen name be globally unique. These rules also prohibit use of - * DEFINE_STATIC_SRCU() within a function. If these rules are too - * restrictive, declare the srcu_struct manually. For example, in - * each file: - * - * static struct srcu_struct my_srcu; - * - * Then, before the first use of each my_srcu, manually initialize it: - * - * init_srcu_struct(&my_srcu); - * - * See include/linux/percpu-defs.h for the rules on per-CPU variables. - */ -#define __DEFINE_SRCU(name, is_static) \ - static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ - is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) -#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) -#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) +#ifdef CONFIG_TINY_SRCU +#include +#elif defined(CONFIG_TREE_SRCU) +#include +#else +#error "Unknown SRCU implementation specified to kernel configuration" +#endif /** * call_srcu() - Queue a callback for invocation after an SRCU grace period @@ -138,9 +88,6 @@ void cleanup_srcu_struct(struct srcu_struct *sp); int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); void synchronize_srcu(struct srcu_struct *sp); -void synchronize_srcu_expedited(struct srcu_struct *sp); -unsigned long srcu_batches_completed(struct srcu_struct *sp); -void srcu_barrier(struct srcu_struct *sp); #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h new file mode 100644 index 000000000000..4f284e4f4d8c --- /dev/null +++ b/include/linux/srcutiny.h @@ -0,0 +1,81 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * tiny variant. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
+ * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney + */ + +#ifndef _LINUX_SRCU_TINY_H +#define _LINUX_SRCU_TINY_H + +#include + +struct srcu_struct { + int srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ + struct swait_queue_head srcu_wq; + /* Last srcu_read_unlock() wakes GP. */ + unsigned long srcu_gp_seq; /* GP seq # for callback tagging. */ + struct rcu_segcblist srcu_cblist; + /* Pending SRCU callbacks. */ + int srcu_idx; /* Current reader array element. */ + bool srcu_gp_running; /* GP workqueue running? */ + bool srcu_gp_waiting; /* GP waiting for readers? */ + struct work_struct srcu_work; /* For driving grace periods. */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +}; + +void srcu_drive_gp(struct work_struct *wp); + +#define __SRCU_STRUCT_INIT(name) \ +{ \ + .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \ + .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist), \ + .srcu_work = __WORK_INITIALIZER(name.srcu_work, srcu_drive_gp), \ + __SRCU_DEP_MAP_INIT(name) \ +} + +/* + * This odd _STATIC_ arrangement is needed for API compatibility with + * Tree SRCU, which needs some per-CPU data. + */ +#define DEFINE_SRCU(name) \ + struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_STATIC_SRCU(name) \ + static struct srcu_struct name = __SRCU_STRUCT_INIT(name) + +void synchronize_srcu(struct srcu_struct *sp); + +static inline void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + synchronize_srcu(sp); +} + +static inline void srcu_barrier(struct srcu_struct *sp) +{ + synchronize_srcu(sp); +} + +static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) +{ + return 0; +} + +#endif diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h new file mode 100644 index 000000000000..f2b3bd6c6bc2 --- /dev/null +++ b/include/linux/srcutree.h @@ -0,0 +1,91 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * tree variant. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney + */ + +#ifndef _LINUX_SRCU_TREE_H +#define _LINUX_SRCU_TREE_H + +struct srcu_array { + unsigned long lock_count[2]; + unsigned long unlock_count[2]; +}; + +struct srcu_struct { + unsigned long completed; + unsigned long srcu_gp_seq; + atomic_t srcu_exp_cnt; + struct srcu_array __percpu *per_cpu_ref; + spinlock_t queue_lock; /* protect ->srcu_cblist */ + struct rcu_segcblist srcu_cblist; + struct delayed_work work; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +}; + +/* Values for -> state variable. 
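+ * These states now live in the bottom two bits of ->srcu_gp_seq and are + * read out with rcu_seq_state().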
*/ +#define SRCU_STATE_IDLE 0 +#define SRCU_STATE_SCAN1 1 +#define SRCU_STATE_SCAN2 2 + +void process_srcu(struct work_struct *work); + +#define __SRCU_STRUCT_INIT(name) \ + { \ + .completed = -300, \ + .per_cpu_ref = &name##_srcu_array, \ + .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ + .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),\ + .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ + __SRCU_DEP_MAP_INIT(name) \ + } + +/* + * Define and initialize a srcu struct at build time. + * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. + * + * Note that although DEFINE_STATIC_SRCU() hides the name from other + * files, the per-CPU variable rules nevertheless require that the + * chosen name be globally unique. These rules also prohibit use of + * DEFINE_STATIC_SRCU() within a function. If these rules are too + * restrictive, declare the srcu_struct manually. For example, in + * each file: + * + * static struct srcu_struct my_srcu; + * + * Then, before the first use of each my_srcu, manually initialize it: + * + * init_srcu_struct(&my_srcu); + * + * See include/linux/percpu-defs.h for the rules on per-CPU variables. + */ +#define __DEFINE_SRCU(name, is_static) \ + static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) +#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) + +void synchronize_srcu_expedited(struct srcu_struct *sp); +void srcu_barrier(struct srcu_struct *sp); +unsigned long srcu_batches_completed(struct srcu_struct *sp); + +#endif diff --git a/init/Kconfig b/init/Kconfig index a92f27da4a27..d269f2ca17b8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -526,6 +526,18 @@ config SRCU permits arbitrary sleeping or blocking within RCU read-side critical sections. +config TINY_SRCU + bool + default y if TINY_RCU + help + This option selects the single-CPU non-preemptible version of SRCU. + +config TREE_SRCU + bool + default y if !TINY_RCU + help + This option selects the full-fledged version of SRCU. + config TASKS_RCU bool default n diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 18dfc485225c..b853214a2b99 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -3,7 +3,8 @@ KCOV_INSTRUMENT := n obj-y += update.o sync.o -obj-$(CONFIG_SRCU) += srcu.o +obj-$(CONFIG_TREE_SRCU) += srcu.o +obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index cccc417a8135..98591e16db1a 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -559,6 +559,7 @@ static void srcu_torture_barrier(void) static void srcu_torture_stats(void) { +#ifdef CONFIG_TREE_SRCU int cpu; int idx = srcu_ctlp->completed & 0x1; @@ -587,6 +588,7 @@ static void srcu_torture_stats(void) pr_cont(" %d(%ld,%ld)", cpu, c0, c1); } pr_cont("\n"); +#endif } static void srcu_torture_synchronize_expedited(void) diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c new file mode 100644 index 000000000000..b8293527ee18 --- /dev/null +++ b/kernel/rcu/srcutiny.c @@ -0,0 +1,215 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * tiny version for non-preemptible single-CPU use. 
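+ * Readers are tracked with a pair of nesting counters rather than per-CPU + * counters, and grace periods are driven from a single work item.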
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "rcu.h" + +static int init_srcu_struct_fields(struct srcu_struct *sp) +{ + sp->srcu_lock_nesting[0] = 0; + sp->srcu_lock_nesting[1] = 0; + init_swait_queue_head(&sp->srcu_wq); + sp->srcu_gp_seq = 0; + rcu_segcblist_init(&sp->srcu_cblist); + sp->srcu_gp_running = false; + sp->srcu_gp_waiting = false; + sp->srcu_idx = 0; + INIT_WORK(&sp->srcu_work, srcu_drive_gp); + return 0; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +int __init_srcu_struct(struct srcu_struct *sp, const char *name, + struct lock_class_key *key) +{ + /* Don't re-initialize a lock while it is held. */ + debug_check_no_locks_freed((void *)sp, sizeof(*sp)); + lockdep_init_map(&sp->dep_map, name, key, 0); + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct); + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * init_srcu_struct - initialize a sleep-RCU structure + * @sp: structure to initialize. + * + * Must invoke this on a given srcu_struct before passing that srcu_struct + * to any other function. Each srcu_struct represents a separate domain + * of SRCU protection. + */ +int init_srcu_struct(struct srcu_struct *sp) +{ + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(init_srcu_struct); + +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *sp) +{ + WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); + flush_work(&sp->srcu_work); + WARN_ON(rcu_seq_state(sp->srcu_gp_seq)); + WARN_ON(sp->srcu_gp_running); + WARN_ON(sp->srcu_gp_waiting); + WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist)); +} +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); + +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Must be called from process context. + * Returns an index that must be passed to the matching srcu_read_unlock(). + */ +int __srcu_read_lock(struct srcu_struct *sp) +{ + int idx; + + idx = READ_ONCE(sp->srcu_idx); + WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1); + return idx; +} +EXPORT_SYMBOL_GPL(__srcu_read_lock); + +/* + * Removes the count for the old reader from the appropriate element of + * the srcu_struct. Must be called from process context. 
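+ * If this is the outermost unlock for the index and the grace-period + * work is waiting on readers, it also performs the wakeup.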
+ */ +void __srcu_read_unlock(struct srcu_struct *sp, int idx) +{ + int newval = sp->srcu_lock_nesting[idx] - 1; + + WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); + if (!newval && READ_ONCE(sp->srcu_gp_waiting)) + swake_up(&sp->srcu_wq); +} +EXPORT_SYMBOL_GPL(__srcu_read_unlock); + +/* + * Workqueue handler to drive one grace period and invoke any callbacks + * that become ready as a result. Single-CPU and !PREEMPT operation + * means that we get away with murder on synchronization. ;-) + */ +void srcu_drive_gp(struct work_struct *wp) +{ + int idx; + struct rcu_cblist ready_cbs; + struct srcu_struct *sp; + struct rcu_head *rhp; + + sp = container_of(wp, struct srcu_struct, srcu_work); + if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist)) + return; /* Already running or nothing to do. */ + + /* Tag recently arrived callbacks and wait for readers. */ + WRITE_ONCE(sp->srcu_gp_running, true); + rcu_segcblist_accelerate(&sp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); + rcu_seq_start(&sp->srcu_gp_seq); + idx = sp->srcu_idx; + WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); + WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ + swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); + WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ + rcu_seq_end(&sp->srcu_gp_seq); + + /* Update callback list based on GP, and invoke ready callbacks. */ + rcu_segcblist_advance(&sp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { + rcu_cblist_init(&ready_cbs); + local_irq_disable(); + rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); + local_irq_enable(); + rhp = rcu_cblist_dequeue(&ready_cbs); + for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); + } + local_irq_disable(); + rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); + local_irq_enable(); + } + WRITE_ONCE(sp->srcu_gp_running, false); + + /* + * If more callbacks, reschedule ourselves. This can race with + * a call_srcu() at interrupt level, but the ->srcu_gp_running + * checks will straighten that out. + */ + if (!rcu_segcblist_empty(&sp->srcu_cblist)) + schedule_work(&sp->srcu_work); +} +EXPORT_SYMBOL_GPL(srcu_drive_gp); + +/* + * Enqueue an SRCU callback on the specified srcu_struct structure, + * initiating grace-period processing if it is not already running. + */ +void call_srcu(struct srcu_struct *sp, struct rcu_head *head, + rcu_callback_t func) +{ + unsigned long flags; + + head->func = func; + local_irq_save(flags); + rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); + local_irq_restore(flags); + if (!READ_ONCE(sp->srcu_gp_running)) + schedule_work(&sp->srcu_work); +} +EXPORT_SYMBOL_GPL(call_srcu); + +/* + * synchronize_srcu - wait for prior SRCU read-side critical-section completion + */ +void synchronize_srcu(struct srcu_struct *sp) +{ + struct rcu_synchronize rs; + + init_rcu_head_on_stack(&rs.head); + init_completion(&rs.completion); + call_srcu(sp, &rs.head, wakeme_after_rcu); + wait_for_completion(&rs.completion); + destroy_rcu_head_on_stack(&rs.head); +} +EXPORT_SYMBOL_GPL(synchronize_srcu); -- cgit v1.2.3 From dad81a2026841b5e2651aab58a7398c13cc05847 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sat, 25 Mar 2017 17:23:44 -0700 Subject: srcu: Introduce CLASSIC_SRCU Kconfig option The TREE_SRCU rewrite is large and a bit on the non-simple side, so this commit helps reduce risk by allowing the old v4.11 SRCU algorithm to be selected using a new CLASSIC_SRCU Kconfig option that depends on RCU_EXPERT. The default is to use the new TREE_SRCU and TINY_SRCU algorithms, in order to help get these the testing that they need. However, if your users do not require the update-side scalability that is to be provided by TREE_SRCU, select RCU_EXPERT and then CLASSIC_SRCU to revert back to the old classic SRCU algorithm. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 2 + include/linux/srcuclassic.h | 101 ++++++++ init/Kconfig | 21 +- kernel/rcu/Makefile | 3 +- kernel/rcu/rcutorture.c | 2 +- kernel/rcu/srcu.c | 347 ++++++++++++++----------- kernel/rcu/srcutree.c | 613 ++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 934 insertions(+), 155 deletions(-) create mode 100644 include/linux/srcuclassic.h create mode 100644 kernel/rcu/srcutree.c (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 907f09b14eda..167ad8831aaf 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -60,6 +60,8 @@ int init_srcu_struct(struct srcu_struct *sp); #include #elif defined(CONFIG_TREE_SRCU) #include +#elif defined(CONFIG_CLASSIC_SRCU) +#include #else #error "Unknown SRCU implementation specified to kernel configuration" #endif diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h new file mode 100644 index 000000000000..41cf99930f34 --- /dev/null +++ b/include/linux/srcuclassic.h @@ -0,0 +1,101 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * classic v4.11 variant. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
+ * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney + */ + +#ifndef _LINUX_SRCU_CLASSIC_H +#define _LINUX_SRCU_CLASSIC_H + +struct srcu_array { + unsigned long lock_count[2]; + unsigned long unlock_count[2]; +}; + +struct rcu_batch { + struct rcu_head *head, **tail; +}; + +#define RCU_BATCH_INIT(name) { NULL, &(name.head) } + +struct srcu_struct { + unsigned long completed; + struct srcu_array __percpu *per_cpu_ref; + spinlock_t queue_lock; /* protect ->batch_queue, ->running */ + bool running; + /* callbacks just queued */ + struct rcu_batch batch_queue; + /* callbacks try to do the first check_zero */ + struct rcu_batch batch_check0; + /* callbacks done with the first check_zero and the flip */ + struct rcu_batch batch_check1; + struct rcu_batch batch_done; + struct delayed_work work; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +}; + +void process_srcu(struct work_struct *work); + +#define __SRCU_STRUCT_INIT(name) \ + { \ + .completed = -300, \ + .per_cpu_ref = &name##_srcu_array, \ + .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ + .running = false, \ + .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ + .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ + .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ + .batch_done = RCU_BATCH_INIT(name.batch_done), \ + .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ + __SRCU_DEP_MAP_INIT(name) \ + } + +/* + * Define and initialize a srcu struct at build time. + * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. + * + * Note that although DEFINE_STATIC_SRCU() hides the name from other + * files, the per-CPU variable rules nevertheless require that the + * chosen name be globally unique. These rules also prohibit use of + * DEFINE_STATIC_SRCU() within a function. If these rules are too + * restrictive, declare the srcu_struct manually. For example, in + * each file: + * + * static struct srcu_struct my_srcu; + * + * Then, before the first use of each my_srcu, manually initialize it: + * + * init_srcu_struct(&my_srcu); + * + * See include/linux/percpu-defs.h for the rules on per-CPU variables. + */ +#define __DEFINE_SRCU(name, is_static) \ + static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) +#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) + +void synchronize_srcu_expedited(struct srcu_struct *sp); +void srcu_barrier(struct srcu_struct *sp); +unsigned long srcu_batches_completed(struct srcu_struct *sp); + +#endif diff --git a/init/Kconfig b/init/Kconfig index d269f2ca17b8..558cc3638ab9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -526,15 +526,32 @@ config SRCU permits arbitrary sleeping or blocking within RCU read-side critical sections. +config CLASSIC_SRCU + bool "Use v4.11 classic SRCU implementation" + default n + depends on RCU_EXPERT && SRCU + help + This option selects the traditional well-tested classic SRCU + implementation from v4.11, as might be desired for enterprise + Linux distributions. Without this option, the shiny new + Tiny SRCU and Tree SRCU implementations are used instead. + At some point, it is hoped that Tiny SRCU and Tree SRCU + will accumulate enough test time and confidence to allow + Classic SRCU to be dropped entirely. + + Say Y if you need a rock-solid SRCU. + + Say N if you would like help test Tree SRCU. 
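+# The two defaults below pick Tiny or Tree SRCU based on TINY_RCU unless +# CLASSIC_SRCU above overrides them.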
+ config TINY_SRCU bool - default y if TINY_RCU + default y if TINY_RCU && !CLASSIC_SRCU help This option selects the single-CPU non-preemptible version of SRCU. config TREE_SRCU bool - default y if !TINY_RCU + default y if !TINY_RCU && !CLASSIC_SRCU help This option selects the full-fledged version of SRCU. diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index b853214a2b99..158e6593d58c 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -3,7 +3,8 @@ KCOV_INSTRUMENT := n obj-y += update.o sync.o -obj-$(CONFIG_TREE_SRCU) += srcu.o +obj-$(CONFIG_CLASSIC_SRCU) += srcu.o +obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 9cbb8a7b909d..6f344b6748a8 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -562,7 +562,7 @@ static void srcu_torture_stats(void) int __maybe_unused cpu; int idx; -#ifdef CONFIG_TREE_SRCU +#if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU) idx = srcu_ctlp->completed & 0x1; pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 3cfcc59bddf3..584d8a983883 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -36,16 +36,75 @@ #include #include -#include #include "rcu.h" +/* + * Initialize an rcu_batch structure to empty. + */ +static inline void rcu_batch_init(struct rcu_batch *b) +{ + b->head = NULL; + b->tail = &b->head; +} + +/* + * Enqueue a callback onto the tail of the specified rcu_batch structure. + */ +static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) +{ + *b->tail = head; + b->tail = &head->next; +} + +/* + * Is the specified rcu_batch structure empty? + */ +static inline bool rcu_batch_empty(struct rcu_batch *b) +{ + return b->tail == &b->head; +} + +/* + * Remove the callback at the head of the specified rcu_batch structure + * and return a pointer to it, or return NULL if the structure is empty. + */ +static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) +{ + struct rcu_head *head; + + if (rcu_batch_empty(b)) + return NULL; + + head = b->head; + b->head = head->next; + if (b->tail == &head->next) + rcu_batch_init(b); + + return head; +} + +/* + * Move all callbacks from the rcu_batch structure specified by "from" to + * the structure specified by "to". + */ +static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) +{ + if (!rcu_batch_empty(from)) { + *to->tail = from->head; + to->tail = from->tail; + rcu_batch_init(from); + } +} + static int init_srcu_struct_fields(struct srcu_struct *sp) { sp->completed = 0; - sp->srcu_gp_seq = 0; - atomic_set(&sp->srcu_exp_cnt, 0); spin_lock_init(&sp->queue_lock); - rcu_segcblist_init(&sp->srcu_cblist); + sp->running = false; + rcu_batch_init(&sp->batch_queue); + rcu_batch_init(&sp->batch_check0); + rcu_batch_init(&sp->batch_check1); + rcu_batch_init(&sp->batch_done); INIT_DELAYED_WORK(&sp->work, process_srcu); sp->per_cpu_ref = alloc_percpu(struct srcu_array); return sp->per_cpu_ref ? 0 : -ENOMEM; @@ -180,8 +239,6 @@ static bool srcu_readers_active(struct srcu_struct *sp) return sum; } -#define SRCU_INTERVAL 1 - /** * cleanup_srcu_struct - deconstruct a sleep-RCU structure * @sp: structure to clean up. 
@@ -197,16 +254,8 @@ static bool srcu_readers_active(struct srcu_struct *sp) */ void cleanup_srcu_struct(struct srcu_struct *sp) { - WARN_ON_ONCE(atomic_read(&sp->srcu_exp_cnt)); if (WARN_ON(srcu_readers_active(sp))) return; /* Leakage unless caller handles error. */ - if (WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist))) - return; /* Leakage unless caller handles error. */ - flush_delayed_work(&sp->work); - if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE)) { - pr_info("cleanup_srcu_struct: Active srcu_struct %lu CBs %c state: %d\n", rcu_segcblist_n_cbs(&sp->srcu_cblist), ".E"[rcu_segcblist_empty(&sp->srcu_cblist)], rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); - return; /* Caller forgot to stop doing call_srcu()? */ - } free_percpu(sp->per_cpu_ref); sp->per_cpu_ref = NULL; } @@ -245,36 +294,26 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); * We use an adaptive strategy for synchronize_srcu() and especially for * synchronize_srcu_expedited(). We spin for a fixed time period * (defined below) to allow SRCU readers to exit their read-side critical - * sections. If there are still some readers after a few microseconds, - * we repeatedly block for 1-millisecond time periods. + * sections. If there are still some readers after 10 microseconds, + * we repeatedly block for 1-millisecond time periods. This approach + * has done well in testing, so there is no need for a config parameter. */ #define SRCU_RETRY_CHECK_DELAY 5 +#define SYNCHRONIZE_SRCU_TRYCOUNT 2 +#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 /* - * Start an SRCU grace period. - */ -static void srcu_gp_start(struct srcu_struct *sp) -{ - int state; - - rcu_segcblist_accelerate(&sp->srcu_cblist, - rcu_seq_snap(&sp->srcu_gp_seq)); - rcu_seq_start(&sp->srcu_gp_seq); - state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); - WARN_ON_ONCE(state != SRCU_STATE_SCAN1); -} - -/* - * Wait until all readers counted by array index idx complete, but - * loop an additional time if there is an expedited grace period pending. - * The caller must ensure that ->completed is not changed while checking. + * @@@ Wait until all pre-existing readers complete. Such readers + * will have used the index specified by "idx". + * the caller should ensures the ->completed is not changed while checking + * and idx = (->completed & 1) ^ 1 */ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) { for (;;) { if (srcu_readers_active_idx_check(sp, idx)) return true; - if (--trycount + !!atomic_read(&sp->srcu_exp_cnt) <= 0) + if (--trycount <= 0) return false; udelay(SRCU_RETRY_CHECK_DELAY); } @@ -299,19 +338,6 @@ static void srcu_flip(struct srcu_struct *sp) smp_mb(); /* D */ /* Pairs with C. */ } -/* - * End an SRCU grace period. - */ -static void srcu_gp_end(struct srcu_struct *sp) -{ - rcu_seq_end(&sp->srcu_gp_seq); - - spin_lock_irq(&sp->queue_lock); - rcu_segcblist_advance(&sp->srcu_cblist, - rcu_seq_current(&sp->srcu_gp_seq)); - spin_unlock_irq(&sp->queue_lock); -} - /* * Enqueue an SRCU callback on the specified srcu_struct structure, * initiating grace-period processing if it is not already running. @@ -348,24 +374,26 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, head->func = func; spin_lock_irqsave(&sp->queue_lock, flags); smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. 
*/ - rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); - if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_IDLE) { - srcu_gp_start(sp); + rcu_batch_queue(&sp->batch_queue, head); + if (!sp->running) { + sp->running = true; queue_delayed_work(system_power_efficient_wq, &sp->work, 0); } spin_unlock_irqrestore(&sp->queue_lock, flags); } EXPORT_SYMBOL_GPL(call_srcu); -static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); +static void srcu_advance_batches(struct srcu_struct *sp, int trycount); +static void srcu_reschedule(struct srcu_struct *sp); /* * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). */ -static void __synchronize_srcu(struct srcu_struct *sp) +static void __synchronize_srcu(struct srcu_struct *sp, int trycount) { struct rcu_synchronize rcu; struct rcu_head *head = &rcu.head; + bool done = false; RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || lock_is_held(&rcu_bh_lock_map) || @@ -373,8 +401,6 @@ static void __synchronize_srcu(struct srcu_struct *sp) lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); - if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) - return; might_sleep(); init_completion(&rcu.completion); @@ -382,47 +408,31 @@ static void __synchronize_srcu(struct srcu_struct *sp) head->func = wakeme_after_rcu; spin_lock_irq(&sp->queue_lock); smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ - if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_IDLE) { + if (!sp->running) { /* steal the processing owner */ - rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); - srcu_gp_start(sp); + sp->running = true; + rcu_batch_queue(&sp->batch_check0, head); spin_unlock_irq(&sp->queue_lock); + + srcu_advance_batches(sp, trycount); + if (!rcu_batch_empty(&sp->batch_done)) { + BUG_ON(sp->batch_done.head != head); + rcu_batch_dequeue(&sp->batch_done); + done = true; + } /* give the processing owner to work_struct */ - srcu_reschedule(sp, 0); + srcu_reschedule(sp); } else { - rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); + rcu_batch_queue(&sp->batch_queue, head); spin_unlock_irq(&sp->queue_lock); } - wait_for_completion(&rcu.completion); - smp_mb(); /* Caller's later accesses after GP. */ -} - -/** - * synchronize_srcu_expedited - Brute-force SRCU grace period - * @sp: srcu_struct with which to synchronize. - * - * Wait for an SRCU grace period to elapse, but be more aggressive about - * spinning rather than blocking when waiting. - * - * Note that synchronize_srcu_expedited() has the same deadlock and - * memory-ordering properties as does synchronize_srcu(). - */ -void synchronize_srcu_expedited(struct srcu_struct *sp) -{ - bool do_norm = rcu_gp_is_normal(); - - if (!do_norm) { - atomic_inc(&sp->srcu_exp_cnt); - smp_mb__after_atomic(); /* increment before GP. */ - } - __synchronize_srcu(sp); - if (!do_norm) { - smp_mb__before_atomic(); /* GP before decrement. */ - atomic_dec(&sp->srcu_exp_cnt); + if (!done) { + wait_for_completion(&rcu.completion); + smp_mb(); /* Caller's later accesses after GP. */ } + } -EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); /** * synchronize_srcu - wait for prior SRCU read-side critical-section completion @@ -465,13 +475,28 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); */ void synchronize_srcu(struct srcu_struct *sp) { - if (rcu_gp_is_expedited()) - synchronize_srcu_expedited(sp); - else - __synchronize_srcu(sp); + __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal()) + ? 
SYNCHRONIZE_SRCU_EXP_TRYCOUNT + : SYNCHRONIZE_SRCU_TRYCOUNT); } EXPORT_SYMBOL_GPL(synchronize_srcu); +/** + * synchronize_srcu_expedited - Brute-force SRCU grace period + * @sp: srcu_struct with which to synchronize. + * + * Wait for an SRCU grace period to elapse, but be more aggressive about + * spinning rather than blocking when waiting. + * + * Note that synchronize_srcu_expedited() has the same deadlock and + * memory-ordering properties as does synchronize_srcu(). + */ +void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); +} +EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); + /** * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. * @sp: srcu_struct on which to wait for in-flight callbacks. @@ -495,13 +520,29 @@ unsigned long srcu_batches_completed(struct srcu_struct *sp) } EXPORT_SYMBOL_GPL(srcu_batches_completed); +#define SRCU_CALLBACK_BATCH 10 +#define SRCU_INTERVAL 1 + +/* + * Move any new SRCU callbacks to the first stage of the SRCU grace + * period pipeline. + */ +static void srcu_collect_new(struct srcu_struct *sp) +{ + if (!rcu_batch_empty(&sp->batch_queue)) { + spin_lock_irq(&sp->queue_lock); + rcu_batch_move(&sp->batch_check0, &sp->batch_queue); + spin_unlock_irq(&sp->queue_lock); + } +} + /* * Core SRCU state machine. Advance callbacks from ->batch_check0 to * ->batch_check1 and then to ->batch_done as readers drain. */ -static void srcu_advance_batches(struct srcu_struct *sp) +static void srcu_advance_batches(struct srcu_struct *sp, int trycount) { - int idx; + int idx = 1 ^ (sp->completed & 1); /* * Because readers might be delayed for an extended period after @@ -509,44 +550,50 @@ static void srcu_advance_batches(struct srcu_struct *sp) * might well be readers using both idx=0 and idx=1. We therefore * need to wait for readers to clear from both index values before * invoking a callback. - * - * The load-acquire ensures that we see the accesses performed - * by the prior grace period. */ - idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ - if (idx == SRCU_STATE_IDLE) { - spin_lock_irq(&sp->queue_lock); - if (rcu_segcblist_empty(&sp->srcu_cblist)) { - spin_unlock_irq(&sp->queue_lock); - return; - } - idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); - if (idx == SRCU_STATE_IDLE) - srcu_gp_start(sp); - spin_unlock_irq(&sp->queue_lock); - if (idx != SRCU_STATE_IDLE) - return; /* Someone else started the grace period. */ - } - if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { - idx = 1 ^ (sp->completed & 1); - if (!try_check_zero(sp, idx, 1)) - return; /* readers present, retry later. */ - srcu_flip(sp); - rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2); - } + if (rcu_batch_empty(&sp->batch_check0) && + rcu_batch_empty(&sp->batch_check1)) + return; /* no callbacks need to be advanced */ - if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { + if (!try_check_zero(sp, idx, trycount)) + return; /* failed to advance, will try after SRCU_INTERVAL */ - /* - * SRCU read-side critical sections are normally short, - * so check at least twice in quick succession after a flip. - */ - idx = 1 ^ (sp->completed & 1); - if (!try_check_zero(sp, idx, 2)) - return; /* readers present, retry after later. */ - srcu_gp_end(sp); - } + /* + * The callbacks in ->batch_check1 have already done with their + * first zero check and flip back when they were enqueued on + * ->batch_check0 in a previous invocation of srcu_advance_batches(). 
+ * (Presumably try_check_zero() returned false during that + * invocation, leaving the callbacks stranded on ->batch_check1.) + * They are therefore ready to invoke, so move them to ->batch_done. + */ + rcu_batch_move(&sp->batch_done, &sp->batch_check1); + + if (rcu_batch_empty(&sp->batch_check0)) + return; /* no callbacks need to be advanced */ + srcu_flip(sp); + + /* + * The callbacks in ->batch_check0 just finished their + * first check zero and flip, so move them to ->batch_check1 + * for future checking on the other idx. + */ + rcu_batch_move(&sp->batch_check1, &sp->batch_check0); + + /* + * SRCU read-side critical sections are normally short, so check + * at least twice in quick succession after a flip. + */ + trycount = trycount < 2 ? 2 : trycount; + if (!try_check_zero(sp, idx^1, trycount)) + return; /* failed to advance, will try after SRCU_INTERVAL */ + + /* + * The callbacks in ->batch_check1 have now waited for all + * pre-existing readers using both idx values. They are therefore + * ready to invoke, so move them to ->batch_done. + */ + rcu_batch_move(&sp->batch_done, &sp->batch_check1); } /* @@ -557,48 +604,45 @@ static void srcu_advance_batches(struct srcu_struct *sp) */ static void srcu_invoke_callbacks(struct srcu_struct *sp) { - struct rcu_cblist ready_cbs; - struct rcu_head *rhp; + int i; + struct rcu_head *head; - spin_lock_irq(&sp->queue_lock); - if (!rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { - spin_unlock_irq(&sp->queue_lock); - return; - } - rcu_cblist_init(&ready_cbs); - rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); - spin_unlock_irq(&sp->queue_lock); - rhp = rcu_cblist_dequeue(&ready_cbs); - for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { + for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { + head = rcu_batch_dequeue(&sp->batch_done); + if (!head) + break; local_bh_disable(); - rhp->func(rhp); + head->func(head); local_bh_enable(); } - spin_lock_irq(&sp->queue_lock); - rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); - spin_unlock_irq(&sp->queue_lock); } /* * Finished one round of SRCU grace period. Start another if there are * more SRCU callbacks queued, otherwise put SRCU into not-running state. */ -static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) +static void srcu_reschedule(struct srcu_struct *sp) { bool pending = true; - int state; - if (rcu_segcblist_empty(&sp->srcu_cblist)) { + if (rcu_batch_empty(&sp->batch_done) && + rcu_batch_empty(&sp->batch_check1) && + rcu_batch_empty(&sp->batch_check0) && + rcu_batch_empty(&sp->batch_queue)) { spin_lock_irq(&sp->queue_lock); - state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); - if (rcu_segcblist_empty(&sp->srcu_cblist) && - state == SRCU_STATE_IDLE) + if (rcu_batch_empty(&sp->batch_done) && + rcu_batch_empty(&sp->batch_check1) && + rcu_batch_empty(&sp->batch_check0) && + rcu_batch_empty(&sp->batch_queue)) { + sp->running = false; pending = false; + } spin_unlock_irq(&sp->queue_lock); } if (pending) - queue_delayed_work(system_power_efficient_wq, &sp->work, delay); + queue_delayed_work(system_power_efficient_wq, + &sp->work, SRCU_INTERVAL); } /* @@ -610,8 +654,9 @@ void process_srcu(struct work_struct *work) sp = container_of(work, struct srcu_struct, work.work); - srcu_advance_batches(sp); + srcu_collect_new(sp); + srcu_advance_batches(sp, 1); srcu_invoke_callbacks(sp); - srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) ? 
0 : SRCU_INTERVAL); + srcu_reschedule(sp); } EXPORT_SYMBOL_GPL(process_srcu); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c new file mode 100644 index 000000000000..da676b0d016b --- /dev/null +++ b/kernel/rcu/srcutree.c @@ -0,0 +1,613 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2006 + * Copyright (C) Fujitsu, 2012 + * + * Author: Paul McKenney + * Lai Jiangshan + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU/ *.txt + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "rcu.h" + +static int init_srcu_struct_fields(struct srcu_struct *sp) +{ + sp->completed = 0; + sp->srcu_gp_seq = 0; + atomic_set(&sp->srcu_exp_cnt, 0); + spin_lock_init(&sp->queue_lock); + rcu_segcblist_init(&sp->srcu_cblist); + INIT_DELAYED_WORK(&sp->work, process_srcu); + sp->per_cpu_ref = alloc_percpu(struct srcu_array); + return sp->per_cpu_ref ? 0 : -ENOMEM; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +int __init_srcu_struct(struct srcu_struct *sp, const char *name, + struct lock_class_key *key) +{ + /* Don't re-initialize a lock while it is held. */ + debug_check_no_locks_freed((void *)sp, sizeof(*sp)); + lockdep_init_map(&sp->dep_map, name, key, 0); + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct); + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/** + * init_srcu_struct - initialize a sleep-RCU structure + * @sp: structure to initialize. + * + * Must invoke this on a given srcu_struct before passing that srcu_struct + * to any other function. Each srcu_struct represents a separate domain + * of SRCU protection. + */ +int init_srcu_struct(struct srcu_struct *sp) +{ + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(init_srcu_struct); + +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * Returns approximate total of the readers' ->lock_count[] values for the + * rank of per-CPU counters specified by idx. + */ +static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->lock_count[idx]); + } + return sum; +} + +/* + * Returns approximate total of the readers' ->unlock_count[] values for the + * rank of per-CPU counters specified by idx. + */ +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->unlock_count[idx]); + } + return sum; +} + +/* + * Return true if the number of pre-existing readers is determined to + * be zero. 
+ */ +static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) +{ + unsigned long unlocks; + + unlocks = srcu_readers_unlock_idx(sp, idx); + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. Needs to be a smp_mb() as the read side may + * contain a read from a variable that is written to before the + * synchronize_srcu() in the write side. In this case smp_mb()s + * A and B act like the store buffering pattern. + * + * This smp_mb() also pairs with smp_mb() C to prevent accesses + * after the synchronize_srcu() from being executed before the + * grace period ends. + */ + smp_mb(); /* A */ + + /* + * If the locks are the same as the unlocks, then there must have + * been no readers on this index at some time in between. This does + * not mean that there are no more readers, as one could have read + * the current index but not have incremented the lock counter yet. + * + * Possible bug: There is no guarantee that there haven't been + * ULONG_MAX increments of ->lock_count[] since the unlocks were + * counted, meaning that this could return true even if there are + * still active readers. Since there are no memory barriers around + * srcu_flip(), the CPU is not required to increment ->completed + * before running srcu_readers_unlock_idx(), which means that there + * could be an arbitrarily large number of critical sections that + * execute after srcu_readers_unlock_idx() but use the old value + * of ->completed. + */ + return srcu_readers_lock_idx(sp, idx) == unlocks; +} + +/** + * srcu_readers_active - returns true if there are readers. and false + * otherwise + * @sp: which srcu_struct to count active readers (holding srcu_read_lock). + * + * Note that this is not an atomic primitive, and can therefore suffer + * severe errors when invoked on an active srcu_struct. That said, it + * can be useful as an error check at cleanup time. + */ +static bool srcu_readers_active(struct srcu_struct *sp) +{ + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->lock_count[0]); + sum += READ_ONCE(cpuc->lock_count[1]); + sum -= READ_ONCE(cpuc->unlock_count[0]); + sum -= READ_ONCE(cpuc->unlock_count[1]); + } + return sum; +} + +#define SRCU_INTERVAL 1 + +/** + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *sp) +{ + WARN_ON_ONCE(atomic_read(&sp->srcu_exp_cnt)); + if (WARN_ON(srcu_readers_active(sp))) + return; /* Leakage unless caller handles error. */ + if (WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist))) + return; /* Leakage unless caller handles error. */ + flush_delayed_work(&sp->work); + if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE)) { + pr_info("cleanup_srcu_struct: Active srcu_struct %lu CBs %c state: %d\n", rcu_segcblist_n_cbs(&sp->srcu_cblist), ".E"[rcu_segcblist_empty(&sp->srcu_cblist)], rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); + return; /* Caller forgot to stop doing call_srcu()? */ + } + free_percpu(sp->per_cpu_ref); + sp->per_cpu_ref = NULL; +} +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); + +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Must be called from process context. 
+ * Returns an index that must be passed to the matching srcu_read_unlock(). + */ +int __srcu_read_lock(struct srcu_struct *sp) +{ + int idx; + + idx = READ_ONCE(sp->completed) & 0x1; + __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); + smp_mb(); /* B */ /* Avoid leaking the critical section. */ + return idx; +} +EXPORT_SYMBOL_GPL(__srcu_read_lock); + +/* + * Removes the count for the old reader from the appropriate per-CPU + * element of the srcu_struct. Note that this may well be a different + * CPU than that which was incremented by the corresponding srcu_read_lock(). + * Must be called from process context. + */ +void __srcu_read_unlock(struct srcu_struct *sp, int idx) +{ + smp_mb(); /* C */ /* Avoid leaking the critical section. */ + this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]); +} +EXPORT_SYMBOL_GPL(__srcu_read_unlock); + +/* + * We use an adaptive strategy for synchronize_srcu() and especially for + * synchronize_srcu_expedited(). We spin for a fixed time period + * (defined below) to allow SRCU readers to exit their read-side critical + * sections. If there are still some readers after a few microseconds, + * we repeatedly block for 1-millisecond time periods. + */ +#define SRCU_RETRY_CHECK_DELAY 5 + +/* + * Start an SRCU grace period. + */ +static void srcu_gp_start(struct srcu_struct *sp) +{ + int state; + + rcu_segcblist_accelerate(&sp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); + rcu_seq_start(&sp->srcu_gp_seq); + state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + WARN_ON_ONCE(state != SRCU_STATE_SCAN1); +} + +/* + * Wait until all readers counted by array index idx complete, but + * loop an additional time if there is an expedited grace period pending. + * The caller must ensure that ->completed is not changed while checking. + */ +static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) +{ + for (;;) { + if (srcu_readers_active_idx_check(sp, idx)) + return true; + if (--trycount + !!atomic_read(&sp->srcu_exp_cnt) <= 0) + return false; + udelay(SRCU_RETRY_CHECK_DELAY); + } +} + +/* + * Increment the ->completed counter so that future SRCU readers will + * use the other rank of the ->(un)lock_count[] arrays. This allows + * us to wait for pre-existing readers in a starvation-free manner. + */ +static void srcu_flip(struct srcu_struct *sp) +{ + WRITE_ONCE(sp->completed, sp->completed + 1); + + /* + * Ensure that if the updater misses an __srcu_read_unlock() + * increment, that task's next __srcu_read_lock() will see the + * above counter update. Note that both this memory barrier + * and the one in srcu_readers_active_idx_check() provide the + * guarantee for __srcu_read_lock(). + */ + smp_mb(); /* D */ /* Pairs with C. */ +} + +/* + * End an SRCU grace period. + */ +static void srcu_gp_end(struct srcu_struct *sp) +{ + rcu_seq_end(&sp->srcu_gp_seq); + + spin_lock_irq(&sp->queue_lock); + rcu_segcblist_advance(&sp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + spin_unlock_irq(&sp->queue_lock); +} + +/* + * Enqueue an SRCU callback on the specified srcu_struct structure, + * initiating grace-period processing if it is not already running. + * + * Note that all CPUs must agree that the grace period extended beyond + * all pre-existing SRCU read-side critical section. On systems with + * more than one CPU, this means that when "func()" is invoked, each CPU + * is guaranteed to have executed a full memory barrier since the end of + * its last corresponding SRCU read-side critical section whose beginning + * preceded the call to call_rcu(). 
It also means that each CPU executing + * an SRCU read-side critical section that continues beyond the start of + * "func()" must have executed a memory barrier after the call_rcu() + * but before the beginning of that SRCU read-side critical section. + * Note that these guarantees include CPUs that are offline, idle, or + * executing in user mode, as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the + * resulting SRCU callback function "func()", then both CPU A and CPU + * B are guaranteed to execute a full memory barrier during the time + * interval between the call to call_rcu() and the invocation of "func()". + * This guarantee applies even if CPU A and CPU B are the same CPU (but + * again only if the system has more than one CPU). + * + * Of course, these guarantees apply only for invocations of call_srcu(), + * srcu_read_lock(), and srcu_read_unlock() that are all passed the same + * srcu_struct structure. + */ +void call_srcu(struct srcu_struct *sp, struct rcu_head *head, + rcu_callback_t func) +{ + unsigned long flags; + + head->next = NULL; + head->func = func; + spin_lock_irqsave(&sp->queue_lock, flags); + smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ + rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_IDLE) { + srcu_gp_start(sp); + queue_delayed_work(system_power_efficient_wq, &sp->work, 0); + } + spin_unlock_irqrestore(&sp->queue_lock, flags); +} +EXPORT_SYMBOL_GPL(call_srcu); + +static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); + +/* + * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). + */ +static void __synchronize_srcu(struct srcu_struct *sp) +{ + struct rcu_synchronize rcu; + struct rcu_head *head = &rcu.head; + + RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || + lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); + + if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) + return; + might_sleep(); + init_completion(&rcu.completion); + + head->next = NULL; + head->func = wakeme_after_rcu; + spin_lock_irq(&sp->queue_lock); + smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_IDLE) { + /* steal the processing owner */ + rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); + srcu_gp_start(sp); + spin_unlock_irq(&sp->queue_lock); + /* give the processing owner to work_struct */ + srcu_reschedule(sp, 0); + } else { + rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); + spin_unlock_irq(&sp->queue_lock); + } + + wait_for_completion(&rcu.completion); + smp_mb(); /* Caller's later accesses after GP. */ +} + +/** + * synchronize_srcu_expedited - Brute-force SRCU grace period + * @sp: srcu_struct with which to synchronize. + * + * Wait for an SRCU grace period to elapse, but be more aggressive about + * spinning rather than blocking when waiting. + * + * Note that synchronize_srcu_expedited() has the same deadlock and + * memory-ordering properties as does synchronize_srcu(). + */ +void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + bool do_norm = rcu_gp_is_normal(); + + if (!do_norm) { + atomic_inc(&sp->srcu_exp_cnt); + smp_mb__after_atomic(); /* increment before GP. 
*/ + } + __synchronize_srcu(sp); + if (!do_norm) { + smp_mb__before_atomic(); /* GP before decrement. */ + atomic_dec(&sp->srcu_exp_cnt); + } +} +EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); + +/** + * synchronize_srcu - wait for prior SRCU read-side critical-section completion + * @sp: srcu_struct with which to synchronize. + * + * Wait for the count to drain to zero of both indexes. To avoid the + * possible starvation of synchronize_srcu(), it waits for the count of + * the index=((->completed & 1) ^ 1) to drain to zero at first, + * and then flip the completed and wait for the count of the other index. + * + * Can block; must be called from process context. + * + * Note that it is illegal to call synchronize_srcu() from the corresponding + * SRCU read-side critical section; doing so will result in deadlock. + * However, it is perfectly legal to call synchronize_srcu() on one + * srcu_struct from some other srcu_struct's read-side critical section, + * as long as the resulting graph of srcu_structs is acyclic. + * + * There are memory-ordering constraints implied by synchronize_srcu(). + * On systems with more than one CPU, when synchronize_srcu() returns, + * each CPU is guaranteed to have executed a full memory barrier since + * the end of its last corresponding SRCU-sched read-side critical section + * whose beginning preceded the call to synchronize_srcu(). In addition, + * each CPU having an SRCU read-side critical section that extends beyond + * the return from synchronize_srcu() is guaranteed to have executed a + * full memory barrier after the beginning of synchronize_srcu() and before + * the beginning of that SRCU read-side critical section. Note that these + * guarantees include CPUs that are offline, idle, or executing in user mode, + * as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_srcu(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_srcu(). This guarantee applies even if CPU A and CPU B + * are the same CPU, but again only if the system has more than one CPU. + * + * Of course, these memory-ordering guarantees apply only when + * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are + * passed the same srcu_struct structure. + */ +void synchronize_srcu(struct srcu_struct *sp) +{ + if (rcu_gp_is_expedited()) + synchronize_srcu_expedited(sp); + else + __synchronize_srcu(sp); +} +EXPORT_SYMBOL_GPL(synchronize_srcu); + +/** + * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. + * @sp: srcu_struct on which to wait for in-flight callbacks. + */ +void srcu_barrier(struct srcu_struct *sp) +{ + synchronize_srcu(sp); +} +EXPORT_SYMBOL_GPL(srcu_barrier); + +/** + * srcu_batches_completed - return batches completed. + * @sp: srcu_struct on which to report batch completion. + * + * Report the number of batches, correlated with, but not necessarily + * precisely the same as, the number of grace periods that have elapsed. + */ +unsigned long srcu_batches_completed(struct srcu_struct *sp) +{ + return sp->completed; +} +EXPORT_SYMBOL_GPL(srcu_batches_completed); + +/* + * Core SRCU state machine. Advance callbacks from ->batch_check0 to + * ->batch_check1 and then to ->batch_done as readers drain. 
+ */ +static void srcu_advance_batches(struct srcu_struct *sp) +{ + int idx; + + /* + * Because readers might be delayed for an extended period after + * fetching ->completed for their index, at any point in time there + * might well be readers using both idx=0 and idx=1. We therefore + * need to wait for readers to clear from both index values before + * invoking a callback. + * + * The load-acquire ensures that we see the accesses performed + * by the prior grace period. + */ + idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ + if (idx == SRCU_STATE_IDLE) { + spin_lock_irq(&sp->queue_lock); + if (rcu_segcblist_empty(&sp->srcu_cblist)) { + spin_unlock_irq(&sp->queue_lock); + return; + } + idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + if (idx == SRCU_STATE_IDLE) + srcu_gp_start(sp); + spin_unlock_irq(&sp->queue_lock); + if (idx != SRCU_STATE_IDLE) + return; /* Someone else started the grace period. */ + } + + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { + idx = 1 ^ (sp->completed & 1); + if (!try_check_zero(sp, idx, 1)) + return; /* readers present, retry later. */ + srcu_flip(sp); + rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2); + } + + if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { + + /* + * SRCU read-side critical sections are normally short, + * so check at least twice in quick succession after a flip. + */ + idx = 1 ^ (sp->completed & 1); + if (!try_check_zero(sp, idx, 2)) + return; /* readers present, retry after later. */ + srcu_gp_end(sp); + } +} + +/* + * Invoke a limited number of SRCU callbacks that have passed through + * their grace period. If there are more to do, SRCU will reschedule + * the workqueue. Note that needed memory barriers have been executed + * in this task's context by srcu_readers_active_idx_check(). + */ +static void srcu_invoke_callbacks(struct srcu_struct *sp) +{ + struct rcu_cblist ready_cbs; + struct rcu_head *rhp; + + spin_lock_irq(&sp->queue_lock); + if (!rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { + spin_unlock_irq(&sp->queue_lock); + return; + } + rcu_cblist_init(&ready_cbs); + rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); + spin_unlock_irq(&sp->queue_lock); + rhp = rcu_cblist_dequeue(&ready_cbs); + for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); + } + spin_lock_irq(&sp->queue_lock); + rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); + spin_unlock_irq(&sp->queue_lock); +} + +/* + * Finished one round of SRCU grace period. Start another if there are + * more SRCU callbacks queued, otherwise put SRCU into not-running state. + */ +static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) +{ + bool pending = true; + int state; + + if (rcu_segcblist_empty(&sp->srcu_cblist)) { + spin_lock_irq(&sp->queue_lock); + state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + if (rcu_segcblist_empty(&sp->srcu_cblist) && + state == SRCU_STATE_IDLE) + pending = false; + spin_unlock_irq(&sp->queue_lock); + } + + if (pending) + queue_delayed_work(system_power_efficient_wq, &sp->work, delay); +} + +/* + * This is the work-queue function that handles SRCU grace periods. + */ +void process_srcu(struct work_struct *work) +{ + struct srcu_struct *sp; + + sp = container_of(work, struct srcu_struct, work.work); + + srcu_advance_batches(sp); + srcu_invoke_callbacks(sp); + srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) ? 
0 : SRCU_INTERVAL); +} +EXPORT_SYMBOL_GPL(process_srcu); -- cgit v1.2.3 From 25ce4be72411867e0471908ee9319599035cc624 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Apr 2017 09:06:41 +0200 Subject: genirq: Return the IRQ name from free_irq() This allows callers to get back at them instead of having to store it in another variable. Signed-off-by: Christoph Hellwig Signed-off-by: Bjorn Helgaas Reviewed-by: Thomas Gleixner --- include/linux/interrupt.h | 2 +- kernel/irq/manage.c | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 53144e78a369..a6fba4804672 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -155,7 +155,7 @@ extern int __must_check request_percpu_irq(unsigned int irq, irq_handler_t handler, const char *devname, void __percpu *percpu_dev_id); -extern void free_irq(unsigned int, void *); +extern const void *free_irq(unsigned int, void *); extern void free_percpu_irq(unsigned int, void __percpu *); struct device; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 391cb738b2db..e688e7e06772 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1574,20 +1574,27 @@ EXPORT_SYMBOL_GPL(remove_irq); * have completed. * * This function must not be called from interrupt context. + * + * Returns the devname argument passed to request_irq. */ -void free_irq(unsigned int irq, void *dev_id) +const void *free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); + struct irqaction *action; + const char *devname; if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) - return; + return NULL; #ifdef CONFIG_SMP if (WARN_ON(desc->affinity_notify)) desc->affinity_notify = NULL; #endif - kfree(__free_irq(irq, dev_id)); + action = __free_irq(irq, dev_id); + devname = action->name; + kfree(action); + return devname; } EXPORT_SYMBOL(free_irq); -- cgit v1.2.3 From 704e8953d3e9db29d5d93c0bf6973d86fe15e679 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Apr 2017 09:06:42 +0200 Subject: PCI/irq: Add pci_request_irq() and pci_free_irq() helpers These are small wrappers around request_threaded_irq() and free_irq(), which dynamically allocate space for the device name so that drivers don't need to keep static buffers for these around. Additionally it works with device-relative vector numbers to make the usage easier, and force the IRQF_SHARED flag on given that it has no runtime overhead and should be supported by all PCI devices. Signed-off-by: Christoph Hellwig Signed-off-by: Bjorn Helgaas Reviewed-by: Thomas Gleixner --- drivers/pci/irq.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++- include/linux/pci.h | 6 ++++++ 2 files changed, 66 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/irq.c b/drivers/pci/irq.c index 6684f153ab57..f1e46d255c5f 100644 --- a/drivers/pci/irq.c +++ b/drivers/pci/irq.c @@ -1,7 +1,8 @@ /* - * PCI IRQ failure handing code + * PCI IRQ handling code * * Copyright (c) 2008 James Bottomley + * Copyright (C) 2017 Christoph Hellwig. */ #include @@ -59,3 +60,61 @@ enum pci_lost_interrupt_reason pci_lost_interrupt(struct pci_dev *pdev) return PCI_LOST_IRQ_NO_INFORMATION; } EXPORT_SYMBOL(pci_lost_interrupt); + +/** + * pci_request_irq - allocate an interrupt line for a PCI device + * @dev: PCI device to operate on + * @nr: device-relative interrupt vector index (0-based). + * @handler: Function to be called when the IRQ occurs. 
+ * Primary handler for threaded interrupts. + * If NULL and thread_fn != NULL the default primary handler is + * installed. + * @thread_fn: Function called from the IRQ handler thread + * If NULL, no IRQ thread is created + * @dev_id: Cookie passed back to the handler function + * @fmt: Printf-like format string naming the handler + * + * This call allocates interrupt resources and enables the interrupt line and + * IRQ handling. From the point this call is made @handler and @thread_fn may + * be invoked. All interrupts requested using this function might be shared. + * + * @dev_id must not be NULL and must be globally unique. + */ +int pci_request_irq(struct pci_dev *dev, unsigned int nr, irq_handler_t handler, + irq_handler_t thread_fn, void *dev_id, const char *fmt, ...) +{ + va_list ap; + int ret; + char *devname; + + va_start(ap, fmt); + devname = kvasprintf(GFP_KERNEL, fmt, ap); + va_end(ap); + + ret = request_threaded_irq(pci_irq_vector(dev, nr), handler, thread_fn, + IRQF_SHARED, devname, dev_id); + if (ret) + kfree(devname); + return ret; +} +EXPORT_SYMBOL(pci_request_irq); + +/** + * pci_free_irq - free an interrupt allocated with pci_request_irq + * @dev: PCI device to operate on + * @nr: device-relative interrupt vector index (0-based). + * @dev_id: Device identity to free + * + * Remove an interrupt handler. The handler is removed and if the interrupt + * line is no longer in use by any driver it is disabled. The caller must + * ensure the interrupt is disabled on the device before calling this function. + * The function does not return until any executing interrupts for this IRQ + * have completed. + * + * This function must not be called from interrupt context. + */ +void pci_free_irq(struct pci_dev *dev, unsigned int nr, void *dev_id) +{ + kfree(free_irq(pci_irq_vector(dev, nr), dev_id)); +} +EXPORT_SYMBOL(pci_free_irq); diff --git a/include/linux/pci.h b/include/linux/pci.h index eb3da1a04e6c..b23f81b583ab 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -1072,6 +1073,11 @@ int pci_select_bars(struct pci_dev *dev, unsigned long flags); bool pci_device_is_present(struct pci_dev *pdev); void pci_ignore_hotplug(struct pci_dev *dev); +int __printf(6, 7) pci_request_irq(struct pci_dev *dev, unsigned int nr, + irq_handler_t handler, irq_handler_t thread_fn, void *dev_id, + const char *fmt, ...); +void pci_free_irq(struct pci_dev *dev, unsigned int nr, void *dev_id); + /* ROM control related routines */ int pci_enable_rom(struct pci_dev *pdev); void pci_disable_rom(struct pci_dev *pdev); -- cgit v1.2.3 From f318dd083c8128c50e48ceb8c3e812e52800fc4f Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 18 Apr 2017 11:27:03 -0700 Subject: cma: Store a name in the cma structure Frameworks that may want to enumerate CMA heaps (e.g. Ion) will find it useful to have an explicit name attached to each region. Store the name in each CMA structure. 
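For illustration, a caller might now attach a label when declaring a region — a minimal sketch, assuming a hypothetical "my_heap" region and the usual early-boot (__init) context; none of this code is part of the patch:

	struct cma *my_cma;
	int ret;

	/* Reserve 16 MiB anywhere in memory and label it "my_heap". */
	ret = cma_declare_contiguous(0, SZ_16M, 0, 0, 0, false,
				     "my_heap", &my_cma);
	if (!ret)
		pr_info("CMA region %s is ready\n", cma_get_name(my_cma));

Passing a NULL name falls back to an automatically generated "cma%d" label.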
Signed-off-by: Laura Abbott Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kvm/book3s_hv_builtin.c | 3 ++- drivers/base/dma-contiguous.c | 5 +++-- include/linux/cma.h | 4 +++- mm/cma.c | 17 +++++++++++++++-- mm/cma.h | 1 + mm/cma_debug.c | 2 +- 6 files changed, 25 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 4d6c64b3041c..b739ff80e979 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -100,7 +100,8 @@ void __init kvm_cma_reserve(void) (unsigned long)selected_size / SZ_1M); align_size = HPT_ALIGN_PAGES << PAGE_SHIFT; cma_declare_contiguous(0, selected_size, 0, align_size, - KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, false, &kvm_cma); + KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, false, "kvm_cma", + &kvm_cma); } } diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index b55804cac4c4..ea9726e71468 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -165,7 +165,8 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, { int ret; - ret = cma_declare_contiguous(base, size, limit, 0, 0, fixed, res_cma); + ret = cma_declare_contiguous(base, size, limit, 0, 0, fixed, + "reserved", res_cma); if (ret) return ret; @@ -258,7 +259,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) return -EINVAL; } - err = cma_init_reserved_mem(rmem->base, rmem->size, 0, &cma); + err = cma_init_reserved_mem(rmem->base, rmem->size, 0, rmem->name, &cma); if (err) { pr_err("Reserved memory: unable to setup CMA region\n"); return err; diff --git a/include/linux/cma.h b/include/linux/cma.h index 03f32d0bd1d8..d41d1f8d1e28 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -21,13 +21,15 @@ struct cma; extern unsigned long totalcma_pages; extern phys_addr_t cma_get_base(const struct cma *cma); extern unsigned long cma_get_size(const struct cma *cma); +extern const char *cma_get_name(const struct cma *cma); extern int __init cma_declare_contiguous(phys_addr_t base, phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, - bool fixed, struct cma **res_cma); + bool fixed, const char *name, struct cma **res_cma); extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, + const char *name, struct cma **res_cma); extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, gfp_t gfp_mask); diff --git a/mm/cma.c b/mm/cma.c index a6033e344430..43c1b2c1ac67 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -53,6 +53,11 @@ unsigned long cma_get_size(const struct cma *cma) return cma->count << PAGE_SHIFT; } +const char *cma_get_name(const struct cma *cma) +{ + return cma->name ? cma->name : "(undefined)"; +} + static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, int align_order) { @@ -168,6 +173,7 @@ core_initcall(cma_init_reserved_areas); */ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, + const char *name, struct cma **res_cma) { struct cma *cma; @@ -198,6 +204,13 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, * subsystems (like slab allocator) are available. 
*/ cma = &cma_areas[cma_area_count]; + if (name) { + cma->name = name; + } else { + cma->name = kasprintf(GFP_KERNEL, "cma%d\n", cma_area_count); + if (!cma->name) + return -ENOMEM; + } cma->base_pfn = PFN_DOWN(base); cma->count = size >> PAGE_SHIFT; cma->order_per_bit = order_per_bit; @@ -229,7 +242,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, int __init cma_declare_contiguous(phys_addr_t base, phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, - bool fixed, struct cma **res_cma) + bool fixed, const char *name, struct cma **res_cma) { phys_addr_t memblock_end = memblock_end_of_DRAM(); phys_addr_t highmem_start; @@ -335,7 +348,7 @@ int __init cma_declare_contiguous(phys_addr_t base, base = addr; } - ret = cma_init_reserved_mem(base, size, order_per_bit, res_cma); + ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); if (ret) goto err; diff --git a/mm/cma.h b/mm/cma.h index 17c75a4246c8..49861286279d 100644 --- a/mm/cma.h +++ b/mm/cma.h @@ -11,6 +11,7 @@ struct cma { struct hlist_head mem_head; spinlock_t mem_head_lock; #endif + const char *name; }; extern struct cma cma_areas[MAX_CMA_AREAS]; diff --git a/mm/cma_debug.c b/mm/cma_debug.c index ffc0c3d0ae64..595b757bef72 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -167,7 +167,7 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) char name[16]; int u32s; - sprintf(name, "cma-%d", idx); + sprintf(name, "cma-%s", cma->name); tmp = debugfs_create_dir(name, cma_debugfs_root); -- cgit v1.2.3 From e4231bcda72daef497af45e195a33daa0f9357d0 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Tue, 18 Apr 2017 11:27:04 -0700 Subject: cma: Introduce cma_for_each_area Frameworks (e.g. Ion) may want to iterate over each possible CMA area to allow for enumeration. Introduce a function to allow a callback. Signed-off-by: Laura Abbott Signed-off-by: Greg Kroah-Hartman --- include/linux/cma.h | 2 ++ mm/cma.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cma.h b/include/linux/cma.h index d41d1f8d1e28..3e8fbf5a5c73 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -34,4 +34,6 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, gfp_t gfp_mask); extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count); + +extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); #endif diff --git a/mm/cma.c b/mm/cma.c index 43c1b2c1ac67..978b4a1441ef 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -504,3 +504,17 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) return true; } + +int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data) +{ + int i; + + for (i = 0; i < cma_area_count; i++) { + int ret = it(&cma_areas[i], data); + + if (ret) + return ret; + } + + return 0; +} -- cgit v1.2.3 From 5f0d5a3ae7cff0d7fa943c199c3a2e44f23e1fac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Jan 2017 02:53:44 -0800 Subject: mm: Rename SLAB_DESTROY_BY_RCU to SLAB_TYPESAFE_BY_RCU A group of Linux kernel hackers reported chasing a bug that resulted from their assumption that SLAB_DESTROY_BY_RCU provided an existence guarantee, that is, that no block from such a slab would be reallocated during an RCU read-side critical section. Of course, that is not the case. 
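Blocks from such a slab can be freed and immediately reused for another object of the same type at any point. A lockless lookup therefore has to take its reference with atomic_inc_not_zero() and then re-validate the object's identity — a hypothetical sketch of the required pattern (compare the net/llc hunks below); "o", "obj_hash_lookup" and "obj_put" are made-up names, not code from this patch:

	rcu_read_lock();
	o = obj_hash_lookup(key);		/* hypothetical RCU hash lookup */
	if (o && !atomic_inc_not_zero(&o->refcnt))
		o = NULL;			/* object already being freed */
	else if (o && o->key != key) {
		/* Block was recycled for a different object of the same type. */
		obj_put(o);
		o = NULL;			/* caller must retry the lookup */
	}
	rcu_read_unlock();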
Instead, SLAB_DESTROY_BY_RCU only prevents freeing of an entire slab of blocks. However, there is a phrase for this, namely "type safety". This commit therefore renames SLAB_DESTROY_BY_RCU to SLAB_TYPESAFE_BY_RCU in order to avoid future instances of this sort of confusion. Signed-off-by: Paul E. McKenney Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrew Morton Cc: Acked-by: Johannes Weiner Acked-by: Vlastimil Babka [ paulmck: Add comments mentioning the old name, as requested by Eric Dumazet, in order to help people familiar with the old name find the new one. ] Acked-by: David Rientjes --- Documentation/RCU/00-INDEX | 2 +- Documentation/RCU/rculist_nulls.txt | 6 +++--- Documentation/RCU/whatisRCU.txt | 3 ++- drivers/gpu/drm/i915/i915_gem.c | 2 +- drivers/gpu/drm/i915/i915_gem_request.h | 2 +- drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c | 2 +- fs/jbd2/journal.c | 2 +- fs/signalfd.c | 2 +- include/linux/dma-fence.h | 4 ++-- include/linux/slab.h | 6 ++++-- include/net/sock.h | 2 +- kernel/fork.c | 4 ++-- kernel/signal.c | 2 +- mm/kasan/kasan.c | 6 +++--- mm/kmemcheck.c | 2 +- mm/rmap.c | 4 ++-- mm/slab.c | 6 +++--- mm/slab.h | 4 ++-- mm/slab_common.c | 6 +++--- mm/slob.c | 6 +++--- mm/slub.c | 12 ++++++------ net/dccp/ipv4.c | 2 +- net/dccp/ipv6.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- net/llc/af_llc.c | 2 +- net/llc/llc_conn.c | 4 ++-- net/llc/llc_sap.c | 2 +- net/netfilter/nf_conntrack_core.c | 8 ++++---- net/smc/af_smc.c | 2 +- 30 files changed, 57 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX index f773a264ae02..1672573b037a 100644 --- a/Documentation/RCU/00-INDEX +++ b/Documentation/RCU/00-INDEX @@ -17,7 +17,7 @@ rcu_dereference.txt rcubarrier.txt - RCU and Unloadable Modules rculist_nulls.txt - - RCU list primitives for use with SLAB_DESTROY_BY_RCU + - RCU list primitives for use with SLAB_TYPESAFE_BY_RCU rcuref.txt - Reference-count design for elements of lists/arrays protected by RCU rcu.txt diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt index 18f9651ff23d..8151f0195f76 100644 --- a/Documentation/RCU/rculist_nulls.txt +++ b/Documentation/RCU/rculist_nulls.txt @@ -1,5 +1,5 @@ Using hlist_nulls to protect read-mostly linked lists and -objects using SLAB_DESTROY_BY_RCU allocations. +objects using SLAB_TYPESAFE_BY_RCU allocations. Please read the basics in Documentation/RCU/listRCU.txt @@ -7,7 +7,7 @@ Using special makers (called 'nulls') is a convenient way to solve following problem : A typical RCU linked list managing objects which are -allocated with SLAB_DESTROY_BY_RCU kmem_cache can +allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can use following algos : 1) Lookup algo @@ -96,7 +96,7 @@ unlock_chain(); // typically a spin_unlock() 3) Remove algo -------------- Nothing special here, we can use a standard RCU hlist deletion. -But thanks to SLAB_DESTROY_BY_RCU, beware a deleted object can be reused +But thanks to SLAB_TYPESAFE_BY_RCU, beware a deleted object can be reused very very fast (before the end of RCU grace period) if (put_last_reference_on(obj) { diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 5cbd8b2395b8..91c912e86915 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -925,7 +925,8 @@ d. Do you need RCU grace periods to complete even in the face e. 
Is your workload too update-intensive for normal use of RCU, but inappropriate for other synchronization mechanisms? - If so, consider SLAB_DESTROY_BY_RCU. But please be careful! + If so, consider SLAB_TYPESAFE_BY_RCU (which was originally + named SLAB_DESTROY_BY_RCU). But please be careful! f. Do you need read-side critical sections that are respected even though they are in the middle of the idle loop, during diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 6908123162d1..3b668895ac24 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -4552,7 +4552,7 @@ i915_gem_load_init(struct drm_i915_private *dev_priv) dev_priv->requests = KMEM_CACHE(drm_i915_gem_request, SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | - SLAB_DESTROY_BY_RCU); + SLAB_TYPESAFE_BY_RCU); if (!dev_priv->requests) goto err_vmas; diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h index ea511f06efaf..9ee2750e1dde 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.h +++ b/drivers/gpu/drm/i915/i915_gem_request.h @@ -493,7 +493,7 @@ static inline struct drm_i915_gem_request * __i915_gem_active_get_rcu(const struct i915_gem_active *active) { /* Performing a lockless retrieval of the active request is super - * tricky. SLAB_DESTROY_BY_RCU merely guarantees that the backing + * tricky. SLAB_TYPESAFE_BY_RCU merely guarantees that the backing * slab of request objects will not be freed whilst we hold the * RCU read lock. It does not guarantee that the request itself * will not be freed and then *reused*. Viz, diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c index 12647af5a336..e7fb47e84a93 100644 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c +++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c @@ -1071,7 +1071,7 @@ int ldlm_init(void) ldlm_lock_slab = kmem_cache_create("ldlm_locks", sizeof(struct ldlm_lock), 0, SLAB_HWCACHE_ALIGN | - SLAB_DESTROY_BY_RCU, NULL); + SLAB_TYPESAFE_BY_RCU, NULL); if (!ldlm_lock_slab) { kmem_cache_destroy(ldlm_resource_slab); return -ENOMEM; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index a1a359bfcc9c..7f8f962454e5 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2340,7 +2340,7 @@ static int jbd2_journal_init_journal_head_cache(void) jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", sizeof(struct journal_head), 0, /* offset */ - SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU, + SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU, NULL); /* ctor */ retval = 0; if (!jbd2_journal_head_cache) { diff --git a/fs/signalfd.c b/fs/signalfd.c index 270221fcef42..7e3d71109f51 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -38,7 +38,7 @@ void signalfd_cleanup(struct sighand_struct *sighand) /* * The lockless check can race with remove_wait_queue() in progress, * but in this case its caller should run under rcu_read_lock() and - * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return. + * sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return. */ if (likely(!waitqueue_active(wqh))) return; diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 6048fa404e57..a5195a7d6f77 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -229,7 +229,7 @@ static inline struct dma_fence *dma_fence_get_rcu(struct dma_fence *fence) * * Function returns NULL if no refcount could be obtained, or the fence. 
* This function handles acquiring a reference to a fence that may be - * reallocated within the RCU grace period (such as with SLAB_DESTROY_BY_RCU), + * reallocated within the RCU grace period (such as with SLAB_TYPESAFE_BY_RCU), * so long as the caller is using RCU on the pointer to the fence. * * An alternative mechanism is to employ a seqlock to protect a bunch of @@ -257,7 +257,7 @@ dma_fence_get_rcu_safe(struct dma_fence * __rcu *fencep) * have successfully acquire a reference to it. If it no * longer matches, we are holding a reference to some other * reallocated pointer. This is possible if the allocator - * is using a freelist like SLAB_DESTROY_BY_RCU where the + * is using a freelist like SLAB_TYPESAFE_BY_RCU where the * fence remains valid for the RCU grace period, but it * may be reallocated. When using such allocators, we are * responsible for ensuring the reference we get is to diff --git a/include/linux/slab.h b/include/linux/slab.h index 3c37a8c51921..04a7f7993e67 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -28,7 +28,7 @@ #define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */ #define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */ /* - * SLAB_DESTROY_BY_RCU - **WARNING** READ THIS! + * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! * * This delays freeing the SLAB page by a grace period, it does _NOT_ * delay object freeing. This means that if you do kmem_cache_free() @@ -61,8 +61,10 @@ * * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. + * + * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. */ -#define SLAB_DESTROY_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ +#define SLAB_TYPESAFE_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ #define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ #define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */ diff --git a/include/net/sock.h b/include/net/sock.h index 5e5997654db6..59cdccaa30e7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -993,7 +993,7 @@ struct smc_hashinfo; struct module; /* - * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes + * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes * un-modified. Special care is taken when initializing object to zero. */ static inline void sk_prot_clear_nulls(struct sock *sk, int size) diff --git a/kernel/fork.c b/kernel/fork.c index 6c463c80e93d..9330ce24f1bb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1313,7 +1313,7 @@ void __cleanup_sighand(struct sighand_struct *sighand) if (atomic_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); /* - * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it + * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it * without an RCU grace period, see __lock_task_sighand(). 
*/ kmem_cache_free(sighand_cachep, sighand); @@ -2144,7 +2144,7 @@ void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, diff --git a/kernel/signal.c b/kernel/signal.c index 7e59ebc2c25e..6df5f72158e4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1237,7 +1237,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, } /* * This sighand can be already freed and even reused, but - * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which + * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which * initializes ->siglock: this slab can't go away, it has * the same object type, ->siglock can't be reinitialized. * diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 98b27195e38b..4b20061102f6 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -413,7 +413,7 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, *size += sizeof(struct kasan_alloc_meta); /* Add free meta. */ - if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor || + if (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor || cache->object_size < sizeof(struct kasan_free_meta)) { cache->kasan_info.free_meta_offset = *size; *size += sizeof(struct kasan_free_meta); @@ -561,7 +561,7 @@ static void kasan_poison_slab_free(struct kmem_cache *cache, void *object) unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); /* RCU slabs could be legally used after free within the RCU period */ - if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) + if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) return; kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); @@ -572,7 +572,7 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object) s8 shadow_byte; /* RCU slabs could be legally used after free within the RCU period */ - if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) + if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) return false; shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index 5bf191756a4a..2d5959c5f7c5 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -95,7 +95,7 @@ void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) { /* TODO: RCU freeing is unsupported for now; hide false positives. */ - if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) + if (!s->ctor && !(s->flags & SLAB_TYPESAFE_BY_RCU)) kmemcheck_mark_freed(object, size); } diff --git a/mm/rmap.c b/mm/rmap.c index 49ed681ccc7b..8ffd59df8a3f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -430,7 +430,7 @@ static void anon_vma_ctor(void *data) void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), - 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, + 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, anon_vma_ctor); anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC|SLAB_ACCOUNT); @@ -481,7 +481,7 @@ struct anon_vma *page_get_anon_vma(struct page *page) * If this page is still mapped, then its anon_vma cannot have been * freed. 
But if it has been unmapped, we have no security against the * anon_vma structure being freed and reused (for another anon_vma: - * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero() + * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero() * above cannot corrupt). */ if (!page_mapped(page)) { diff --git a/mm/slab.c b/mm/slab.c index 807d86c76908..93c827864862 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1728,7 +1728,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) freelist = page->freelist; slab_destroy_debugcheck(cachep, page); - if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) + if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU)) call_rcu(&page->rcu_head, kmem_rcu_free); else kmem_freepages(cachep, page); @@ -1924,7 +1924,7 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, cachep->num = 0; - if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU) + if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU) return false; left = calculate_slab_order(cachep, size, @@ -2030,7 +2030,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + 2 * sizeof(unsigned long long))) flags |= SLAB_RED_ZONE | SLAB_STORE_USER; - if (!(flags & SLAB_DESTROY_BY_RCU)) + if (!(flags & SLAB_TYPESAFE_BY_RCU)) flags |= SLAB_POISON; #endif #endif diff --git a/mm/slab.h b/mm/slab.h index 65e7c3fcac72..9cfcf099709c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -126,7 +126,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, /* Legal flag mask for kmem_cache_create(), for various configurations */ #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS ) + SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) #if defined(CONFIG_DEBUG_SLAB) #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) @@ -415,7 +415,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) * back there or track user information then we can * only use the space before that information. */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) return s->inuse; /* * Else we can use all the padding etc for the allocation diff --git a/mm/slab_common.c b/mm/slab_common.c index 09d0e849b07f..01a0fe2eb332 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -39,7 +39,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, * Set of flags that will prevent slab merging */ #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ + SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \ SLAB_FAILSLAB | SLAB_KASAN) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ @@ -500,7 +500,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) struct kmem_cache *s, *s2; /* - * On destruction, SLAB_DESTROY_BY_RCU kmem_caches are put on the + * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the * @slab_caches_to_rcu_destroy list. 
The slab pages are freed * through RCU and and the associated kmem_cache are dereferenced * while freeing the pages, so the kmem_caches should be freed only @@ -537,7 +537,7 @@ static int shutdown_cache(struct kmem_cache *s) memcg_unlink_cache(s); list_del(&s->list); - if (s->flags & SLAB_DESTROY_BY_RCU) { + if (s->flags & SLAB_TYPESAFE_BY_RCU) { list_add_tail(&s->list, &slab_caches_to_rcu_destroy); schedule_work(&slab_caches_to_rcu_destroy_work); } else { diff --git a/mm/slob.c b/mm/slob.c index eac04d4357ec..1bae78d71096 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -126,7 +126,7 @@ static inline void clear_slob_page_free(struct page *sp) /* * struct slob_rcu is inserted at the tail of allocated slob blocks, which - * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free + * were created with a SLAB_TYPESAFE_BY_RCU slab. slob_rcu is used to free * the block using call_rcu. */ struct slob_rcu { @@ -524,7 +524,7 @@ EXPORT_SYMBOL(ksize); int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) { - if (flags & SLAB_DESTROY_BY_RCU) { + if (flags & SLAB_TYPESAFE_BY_RCU) { /* leave room for rcu footer at the end of object */ c->size += sizeof(struct slob_rcu); } @@ -598,7 +598,7 @@ static void kmem_rcu_free(struct rcu_head *head) void kmem_cache_free(struct kmem_cache *c, void *b) { kmemleak_free_recursive(b, c->flags); - if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { + if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) { struct slob_rcu *slob_rcu; slob_rcu = b + (c->size - sizeof(struct slob_rcu)); slob_rcu->size = c->size; diff --git a/mm/slub.c b/mm/slub.c index 7f4bc7027ed5..57e5156f02be 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1687,7 +1687,7 @@ static void rcu_free_slab(struct rcu_head *h) static void free_slab(struct kmem_cache *s, struct page *page) { - if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { + if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { struct rcu_head *head; if (need_reserve_slab_rcu) { @@ -2963,7 +2963,7 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page, * slab_free_freelist_hook() could have put the items into quarantine. * If so, no need to free them. */ - if (s->flags & SLAB_KASAN && !(s->flags & SLAB_DESTROY_BY_RCU)) + if (s->flags & SLAB_KASAN && !(s->flags & SLAB_TYPESAFE_BY_RCU)) return; do_slab_free(s, page, head, tail, cnt, addr); } @@ -3433,7 +3433,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * the slab may touch the object after free or before allocation * then we should never poison the object itself. 
*/ - if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && + if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) && !s->ctor) s->flags |= __OBJECT_POISON; else @@ -3455,7 +3455,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ s->inuse = size; - if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || + if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || s->ctor)) { /* * Relocate free pointer after the object if it is not @@ -3537,7 +3537,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); s->reserved = 0; - if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) + if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) s->reserved = sizeof(struct rcu_head); if (!calculate_sizes(s, -1)) @@ -5042,7 +5042,7 @@ SLAB_ATTR_RO(cache_dma); static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU)); } SLAB_ATTR_RO(destroy_by_rcu); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 409d0cfd3447..90210a0e3888 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -950,7 +950,7 @@ static struct proto dccp_v4_prot = { .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, .rsk_prot = &dccp_request_sock_ops, .twsk_prot = &dccp_timewait_sock_ops, .h.hashinfo = &dccp_hashinfo, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 233b57367758..b4019a5e4551 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1012,7 +1012,7 @@ static struct proto dccp_v6_prot = { .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp6_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, .rsk_prot = &dccp6_request_sock_ops, .twsk_prot = &dccp6_timewait_sock_ops, .h.hashinfo = &dccp_hashinfo, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9a89b8deafae..82c89abeb989 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2398,7 +2398,7 @@ struct proto tcp_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, .h.hashinfo = &tcp_hashinfo, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 60a5295a7de6..bdbc4327ebee 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1919,7 +1919,7 @@ struct proto tcpv6_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, .h.hashinfo = &tcp_hashinfo, diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 06186d608a27..d096ca563054 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -142,7 +142,7 @@ static struct proto llc_proto = { .name = "LLC", .owner = THIS_MODULE, .obj_size = sizeof(struct llc_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, }; /** diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 8bc5a1bd2d45..9b02c13d258b 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -506,7 +506,7 @@ static struct sock 
*__llc_lookup_established(struct llc_sap *sap, again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_estab_match(sap, daddr, laddr, rc)) { - /* Extra checks required by SLAB_DESTROY_BY_RCU */ + /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || @@ -565,7 +565,7 @@ static struct sock *__llc_lookup_listener(struct llc_sap *sap, again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_listener_match(sap, laddr, rc)) { - /* Extra checks required by SLAB_DESTROY_BY_RCU */ + /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index 5404d0d195cc..63b6ab056370 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -328,7 +328,7 @@ static struct sock *llc_lookup_dgram(struct llc_sap *sap, again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_dgram_match(sap, laddr, rc)) { - /* Extra checks required by SLAB_DESTROY_BY_RCU */ + /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 071b97fcbefb..fdcdac7916b2 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -914,7 +914,7 @@ static unsigned int early_drop_list(struct net *net, continue; /* kill only if still in same netns -- might have moved due to - * SLAB_DESTROY_BY_RCU rules. + * SLAB_TYPESAFE_BY_RCU rules. * * We steal the timer reference. If that fails timer has * already fired or someone else deleted it. Just drop ref @@ -1069,7 +1069,7 @@ __nf_conntrack_alloc(struct net *net, /* * Do not use kmem_cache_zalloc(), as this cache uses - * SLAB_DESTROY_BY_RCU. + * SLAB_TYPESAFE_BY_RCU. */ ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); if (ct == NULL) @@ -1114,7 +1114,7 @@ void nf_conntrack_free(struct nf_conn *ct) struct net *net = nf_ct_net(ct); /* A freed object has refcnt == 0, that's - * the golden rule for SLAB_DESTROY_BY_RCU + * the golden rule for SLAB_TYPESAFE_BY_RCU */ NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); @@ -1878,7 +1878,7 @@ int nf_conntrack_init_start(void) nf_conntrack_cachep = kmem_cache_create("nf_conntrack", sizeof(struct nf_conn), NFCT_INFOMASK + 1, - SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); + SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); if (!nf_conntrack_cachep) goto err_cachep; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 85837ab90e89..d34bbd6d8f38 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -101,7 +101,7 @@ struct proto smc_proto = { .unhash = smc_unhash_sk, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v4_hashinfo, - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, }; EXPORT_SYMBOL_GPL(smc_proto); -- cgit v1.2.3 From de5bbdd01cf9ee3cd4586b5a970d3ea015c6d7e3 Mon Sep 17 00:00:00 2001 From: Marc Gonzalez Date: Tue, 18 Apr 2017 14:21:04 -0500 Subject: PCI: Change pci_host_common_probe() visibility pci_host_common_probe() is defined when CONFIG_PCI_HOST_COMMON=y; therefore the function declaration should match that. 
drivers/pci/host/pcie-tango.c:300:9: error: implicit declaration of function 'pci_host_common_probe' Signed-off-by: Marc Gonzalez Signed-off-by: Bjorn Helgaas --- include/linux/pci-ecam.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index b8f11d783a11..809c2f1873ac 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -69,7 +69,7 @@ extern struct pci_ecam_ops xgene_v1_pcie_ecam_ops; /* APM X-Gene PCIe v1 */ extern struct pci_ecam_ops xgene_v2_pcie_ecam_ops; /* APM X-Gene PCIe v2.x */ #endif -#ifdef CONFIG_PCI_HOST_GENERIC +#ifdef CONFIG_PCI_HOST_COMMON /* for DT-based PCI controllers that support ECAM */ int pci_host_common_probe(struct platform_device *pdev, struct pci_ecam_ops *ops); -- cgit v1.2.3 From f49c3f90a31f6e19ef3343dcc8809dac1019b59e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 7 Apr 2017 16:22:47 +0800 Subject: ACPI / tables: Drop acpi_parse_entries() which is not used Function acpi_parse_entries() is no longer used; if necessary, acpi_table_parse_entries() can be used instead, so drop it. Signed-off-by: Baoquan He [ rjw: Subject / changelog ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/tables.c | 16 ---------------- include/linux/acpi.h | 4 ---- 2 files changed, 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index 2604189d6cd1..0dae722ab2ec 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -310,22 +310,6 @@ acpi_parse_entries_array(char *id, unsigned long table_size, return errs ? -EINVAL : count; } -int __init -acpi_parse_entries(char *id, - unsigned long table_size, - acpi_tbl_entry_handler handler, - struct acpi_table_header *table_header, - int entry_id, unsigned int max_entries) -{ - struct acpi_subtable_proc proc = { - .id = entry_id, - .handler = handler, - }; - - return acpi_parse_entries_array(id, table_size, table_header, - &proc, 1, max_entries); -} - int __init acpi_table_parse_entries_array(char *id, unsigned long table_size, diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 9b05886f9773..83abbfceabad 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -233,10 +233,6 @@ int acpi_numa_init (void); int acpi_table_init (void); int acpi_table_parse(char *id, acpi_tbl_table_handler handler); -int __init acpi_parse_entries(char *id, unsigned long table_size, - acpi_tbl_entry_handler handler, - struct acpi_table_header *table_header, - int entry_id, unsigned int max_entries); int __init acpi_table_parse_entries(char *id, unsigned long table_size, int entry_id, acpi_tbl_entry_handler handler, -- cgit v1.2.3 From 139c279fb9423833fb730ccb07e549b5a9183f44 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 12 Apr 2017 08:53:42 +0200 Subject: quota: Remove dquot_quotactl_ops Nobody uses them anymore.
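For illustration only (not part of this patch): the table that survives, dquot_quotactl_sysfile_ops, is what a filesystem keeping its quota state in hidden system files wires up at mount time; dquot_quotactl_ops had no remaining user of that kind. A minimal sketch with a hypothetical fill_super; only the two ops tables referenced below are real kernel symbols:

	#include <linux/fs.h>
	#include <linux/quotaops.h>

	static int example_fill_super(struct super_block *sb, void *data, int silent)
	{
		/* generic dquot callbacks for quota accounting */
		sb->dq_op = &dquot_operations;
		/* quotactl entry points for quota kept in hidden system files */
		sb->s_qcop = &dquot_quotactl_sysfile_ops;
		return 0;
	}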
Signed-off-by: Jan Kara --- fs/quota/dquot.c | 12 ------------ include/linux/quotaops.h | 1 - 2 files changed, 13 deletions(-) (limited to 'include/linux') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 7e94cb0ecdde..ebf80c7739e1 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2771,18 +2771,6 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii) } EXPORT_SYMBOL(dquot_set_dqinfo); -const struct quotactl_ops dquot_quotactl_ops = { - .quota_on = dquot_quota_on, - .quota_off = dquot_quota_off, - .quota_sync = dquot_quota_sync, - .get_state = dquot_get_state, - .set_info = dquot_set_dqinfo, - .get_dqblk = dquot_get_dqblk, - .get_nextdqblk = dquot_get_next_dqblk, - .set_dqblk = dquot_set_dqblk -}; -EXPORT_SYMBOL(dquot_quotactl_ops); - const struct quotactl_ops dquot_quotactl_sysfile_ops = { .quota_enable = dquot_quota_enable, .quota_disable = dquot_quota_disable, diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 799a63d0e1a8..9c6f768b7d32 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -162,7 +162,6 @@ static inline bool sb_has_quota_active(struct super_block *sb, int type) * Operations supported for diskquotas. */ extern const struct dquot_operations dquot_operations; -extern const struct quotactl_ops dquot_quotactl_ops; extern const struct quotactl_ops dquot_quotactl_sysfile_ops; #else -- cgit v1.2.3 From e21b7a0b988772e82e7147e1c659a5afe2ae003c Mon Sep 17 00:00:00 2001 From: Arianna Avanzini Date: Wed, 12 Apr 2017 18:23:08 +0200 Subject: block, bfq: add full hierarchical scheduling and cgroups support Add complete support for full hierarchical scheduling, with a cgroups interface. Full hierarchical scheduling is implemented through the 'entity' abstraction: both bfq_queues, i.e., the internal BFQ queues associated with processes, and groups are represented in general by entities. Given the bfq_queues associated with the processes belonging to a given group, the entities representing these queues are sons of the entity representing the group. At higher levels, if a group, say G, contains other groups, then the entity representing G is the parent entity of the entities representing the groups in G. Hierarchical scheduling is performed as follows: if the timestamps of a leaf entity (i.e., of a bfq_queue) change, and such a change lets the entity become the next-to-serve entity for its parent entity, then the timestamps of the parent entity are recomputed as a function of the budget of its new next-to-serve leaf entity. If the parent entity belongs, in its turn, to a group, and its new timestamps let it become the next-to-serve for its parent entity, then the timestamps of the latter parent entity are recomputed as well, and so on (this upward walk is condensed in the sketch below). When a new bfq_queue must be set in service, the reverse path is followed: the next-to-serve highest-level entity is chosen, then its next-to-serve child entity, and so on, until the next-to-serve leaf entity is reached, and the bfq_queue that this entity represents is set in service. Writeback is accounted for on a per-group basis, i.e., for each group, the async I/O requests of the processes of the group are enqueued in a distinct bfq_queue, and the entity associated with this queue is a child of the entity associated with the group. Weights can be assigned explicitly to groups and processes through the cgroups interface; when the cgroups interface is not used, single processes instead receive their weights only indirectly, through their I/O priorities (as explained in the description of the previous patch).
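The upward walk at (re)activation time mentioned above reduces to a short loop; the following is reproduced from the bfq_activate_requeue_entity() helper that this patch adds below, with illustrative comments added here:

	static void bfq_activate_requeue_entity(struct bfq_entity *entity,
						bool non_blocking_wait_rq,
						bool requeue)
	{
		struct bfq_sched_data *sd;

		for_each_entity(entity) {
			sd = entity->sched_data;
			__bfq_activate_requeue_entity(entity, sd,
						      non_blocking_wait_rq);

			/* stop climbing once the change can no longer
			 * affect the parent's next_in_service choice */
			if (!bfq_update_next_in_service(sd, entity) && !requeue)
				break;
		}
	}

Each level either updates its next_in_service entity and keeps climbing toward the root group, or stops as soon as the parent's schedule is unaffected.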
In particular, since each node has a full scheduler, each group can be assigned its own weight. Signed-off-by: Fabio Checconi Signed-off-by: Paolo Valente Signed-off-by: Arianna Avanzini Signed-off-by: Jens Axboe --- Documentation/block/bfq-iosched.txt | 17 +- block/Kconfig.iosched | 10 + block/bfq-iosched.c | 2568 ++++++++++++++++++++++++++++++----- include/linux/blkdev.h | 2 +- 4 files changed, 2213 insertions(+), 384 deletions(-) (limited to 'include/linux') diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index cbf85f6f1fd8..461b27fce979 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -253,9 +253,14 @@ of slice_idle are copied from CFQ too. per-process ioprio and weight ----------------------------- -Unless the cgroups interface is used, weights can be assigned to -processes only indirectly, through I/O priorities, and according to -the relation: weight = (IOPRIO_BE_NR - ioprio) * 10. +Unless the cgroups interface is used (see "4. BFQ group scheduling"), +weights can be assigned to processes only indirectly, through I/O +priorities, and according to the relation: +weight = (IOPRIO_BE_NR - ioprio) * 10. + +Beware that, if low-latency is set, then BFQ automatically raises the +weight of the queues associated with interactive and soft real-time +applications. Unset this tunable if you need/want to control weights. slice_idle ---------- @@ -450,9 +455,9 @@ may be reactivated for an already busy async queue (in ms). 4. Group scheduling with BFQ ============================ -BFQ supports both cgroup-v1 and cgroup-v2 io controllers, namely blkio -and io. In particular, BFQ supports weight-based proportional -share. +BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely +blkio and io. In particular, BFQ supports weight-based proportional +share. To activate cgroups support, set BFQ_GROUP_IOSCHED. 4-1 Service guarantees provided ------------------------------- diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 6fc36027b70e..fd2cefa47d35 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -40,6 +40,7 @@ config CFQ_GROUP_IOSCHED Enable group IO scheduling in CFQ. choice + prompt "Default I/O scheduler" default DEFAULT_CFQ help @@ -89,6 +90,15 @@ config IOSCHED_BFQ real-time applications. Details in Documentation/block/bfq-iosched.txt +config BFQ_GROUP_IOSCHED + bool "BFQ hierarchical scheduling support" + depends on IOSCHED_BFQ && BLK_CGROUP + default n + ---help--- + + Enable hierarchical scheduling in BFQ, using the blkio + (cgroups-v1) or io (cgroups-v2) controller. + endmenu endif diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c4e7d8db796a..af1740a1d453 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -90,6 +90,7 @@ #include #include #include +#include #include #include #include @@ -114,7 +115,7 @@ #define BFQ_DEFAULT_QUEUE_IOPRIO 4 -#define BFQ_DEFAULT_GRP_WEIGHT 10 +#define BFQ_WEIGHT_LEGACY_DFL 100 #define BFQ_DEFAULT_GRP_IOPRIO 0 #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE @@ -149,10 +150,11 @@ struct bfq_service_tree { * struct bfq_sched_data - multi-class scheduler. * * bfq_sched_data is the basic scheduler queue. It supports three - * ioprio_classes, and can be used either as a toplevel queue or as - * an intermediate queue on a hierarchical setup. - * @next_in_service points to the active entity of the sched_data - * service trees that will be scheduled next. 
+ * ioprio_classes, and can be used either as a toplevel queue or as an + * intermediate queue on a hierarchical setup. @next_in_service + * points to the active entity of the sched_data service trees that + * will be scheduled next. It is used to reduce the number of steps + * needed for each hierarchical-schedule update. * * The supported ioprio_classes are the same as in CFQ, in descending * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. @@ -164,19 +166,23 @@ struct bfq_service_tree { struct bfq_sched_data { /* entity in service */ struct bfq_entity *in_service_entity; - /* head-of-the-line entity in the scheduler */ + /* head-of-line entity (see comments above) */ struct bfq_entity *next_in_service; /* array of service trees, one per ioprio_class */ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; + /* last time CLASS_IDLE was served */ + unsigned long bfq_class_idle_last_service; + }; /** * struct bfq_entity - schedulable entity. * - * A bfq_entity is used to represent a bfq_queue (leaf node in the upper - * level scheduler). Each entity belongs to the sched_data of the parent - * group hierarchy. Non-leaf entities have also their own sched_data, - * stored in @my_sched_data. + * A bfq_entity is used to represent either a bfq_queue (leaf node in the + * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each + * entity belongs to the sched_data of the parent group in the cgroup + * hierarchy. Non-leaf entities have also their own sched_data, stored + * in @my_sched_data. * * Each entity stores independently its priority values; this would * allow different weights on different devices, but this @@ -187,23 +193,24 @@ struct bfq_sched_data { * update to take place the effective and the requested priority * values are synchronized. * - * The weight value is calculated from the ioprio to export the same - * interface as CFQ. When dealing with ``well-behaved'' queues (i.e., - * queues that do not spend too much time to consume their budget - * and have true sequential behavior, and when there are no external - * factors breaking anticipation) the relative weights at each level - * of the hierarchy should be guaranteed. All the fields are - * protected by the queue lock of the containing bfqd. + * Unless cgroups are used, the weight value is calculated from the + * ioprio to export the same interface as CFQ. When dealing with + * ``well-behaved'' queues (i.e., queues that do not spend too much + * time to consume their budget and have true sequential behavior, and + * when there are no external factors breaking anticipation) the + * relative weights at each level of the cgroups hierarchy should be + * guaranteed. All the fields are protected by the queue lock of the + * containing bfqd. */ struct bfq_entity { /* service_tree member */ struct rb_node rb_node; /* - * flag, true if the entity is on a tree (either the active or - * the idle one of its service_tree). + * Flag, true if the entity is on a tree (either the active or + * the idle one of its service_tree) or is in service. */ - int on_st; + bool on_st; /* B-WF2Q+ start and finish timestamps [sectors/weight] */ u64 start, finish; @@ -246,6 +253,8 @@ struct bfq_entity { int prio_changed; }; +struct bfq_group; + /** * struct bfq_ttime - per process thinktime stats. */ @@ -265,7 +274,11 @@ struct bfq_ttime { * struct bfq_queue - leaf schedulable entity. * * A bfq_queue is a leaf request queue; it can be associated with an - * io_context or more, if it is async. 
+ * io_context or more, if it is async. @cgroup holds a reference to + * the cgroup, to be sure that it does not disappear while a bfqq + * still references it (mostly to avoid races between request issuing + * and task migration followed by cgroup destruction). All the fields + * are protected by the queue lock of the containing bfqd. */ struct bfq_queue { /* reference counter */ @@ -338,6 +351,9 @@ struct bfq_io_cq { struct bfq_queue *bfqq[2]; /* per (request_queue, blkcg) ioprio */ int ioprio; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + uint64_t blkcg_serial_nr; /* the current blkcg serial */ +#endif }; /** @@ -351,8 +367,8 @@ struct bfq_data { /* dispatch queue */ struct list_head dispatch; - /* root @bfq_sched_data for the device */ - struct bfq_sched_data sched_data; + /* root bfq_group for the device */ + struct bfq_group *root_group; /* * Number of bfq_queues containing requests (including the @@ -423,8 +439,6 @@ struct bfq_data { unsigned int bfq_back_max; /* maximum idling time */ u32 bfq_slice_idle; - /* last time CLASS_IDLE was served */ - u64 bfq_class_idle_last_service; /* user-configured max budget value (0 for auto-tuning) */ int bfq_user_max_budget; @@ -516,8 +530,35 @@ BFQ_BFQQ_FNS(IO_bound); #undef BFQ_BFQQ_FNS /* Logging facilities. */ -#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) +#ifdef CONFIG_BFQ_GROUP_IOSCHED +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); +static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ + __pbuf, ##args); \ +} while (0) + +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ +} while (0) + +#else /* CONFIG_BFQ_GROUP_IOSCHED */ + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ + ##args) +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) + +#endif /* CONFIG_BFQ_GROUP_IOSCHED */ #define bfq_log(bfqd, fmt, args...) \ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) @@ -534,15 +575,120 @@ enum bfqq_expiration { BFQQE_PREEMPTED /* preemption in progress */ }; +struct bfqg_stats { +#ifdef CONFIG_BFQ_GROUP_IOSCHED + /* number of ios merged */ + struct blkg_rwstat merged; + /* total time spent on device in ns, may not be accurate w/ queueing */ + struct blkg_rwstat service_time; + /* total time spent waiting in scheduler queue in ns */ + struct blkg_rwstat wait_time; + /* number of IOs queued up */ + struct blkg_rwstat queued; + /* total disk time and nr sectors dispatched by this group */ + struct blkg_stat time; + /* sum of number of ios queued across all samples */ + struct blkg_stat avg_queue_size_sum; + /* count of samples taken for average */ + struct blkg_stat avg_queue_size_samples; + /* how many times this group has been removed from service tree */ + struct blkg_stat dequeue; + /* total time spent waiting for it to be assigned a timeslice. 
*/ + struct blkg_stat group_wait_time; + /* time spent idling for this blkcg_gq */ + struct blkg_stat idle_time; + /* total time with empty current active q with other requests queued */ + struct blkg_stat empty_time; + /* fields after this shouldn't be cleared on stat reset */ + uint64_t start_group_wait_time; + uint64_t start_idle_time; + uint64_t start_empty_time; + uint16_t flags; +#endif /* CONFIG_BFQ_GROUP_IOSCHED */ +}; + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + +/* + * struct bfq_group_data - per-blkcg storage for the blkio subsystem. + * + * @ps: @blkcg_policy_storage that this structure inherits + * @weight: weight of the bfq_group + */ +struct bfq_group_data { + /* must be the first member */ + struct blkcg_policy_data pd; + + unsigned short weight; +}; + +/** + * struct bfq_group - per (device, cgroup) data structure. + * @entity: schedulable entity to insert into the parent group sched_data. + * @sched_data: own sched_data, to contain child entities (they may be + * both bfq_queues and bfq_groups). + * @bfqd: the bfq_data for the device this group acts upon. + * @async_bfqq: array of async queues for all the tasks belonging to + * the group, one queue per ioprio value per ioprio_class, + * except for the idle class that has only one queue. + * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). + * @my_entity: pointer to @entity, %NULL for the toplevel group; used + * to avoid too many special cases during group creation/ + * migration. + * @stats: stats for this bfqg. + * + * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup + * there is a set of bfq_groups, each one collecting the lower-level + * entities belonging to the group that are acting on the same device. + * + * Locking works as follows: + * o @bfqd is protected by the queue lock, RCU is used to access it + * from the readers. + * o All the other fields are protected by the @bfqd queue lock. + */ +struct bfq_group { + /* must be the first member */ + struct blkg_policy_data pd; + + struct bfq_entity entity; + struct bfq_sched_data sched_data; + + void *bfqd; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; + + struct bfq_entity *my_entity; + + struct bfqg_stats stats; +}; + +#else +struct bfq_group { + struct bfq_sched_data sched_data; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; + + struct rb_root rq_pos_tree; +}; +#endif + static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); +static unsigned int bfq_class_idx(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + return bfqq ? bfqq->ioprio_class - 1 : + BFQ_DEFAULT_GRP_CLASS - 1; +} + static struct bfq_service_tree * bfq_entity_service_tree(struct bfq_entity *entity) { struct bfq_sched_data *sched_data = entity->sched_data; - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - unsigned int idx = bfqq ? bfqq->ioprio_class - 1 : - BFQ_DEFAULT_GRP_CLASS - 1; + unsigned int idx = bfq_class_idx(entity); return sched_data->service_tree + idx; } @@ -568,16 +714,9 @@ static void bfq_put_queue(struct bfq_queue *bfqq); static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bio *bio, bool is_sync, struct bfq_io_cq *bic); +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); -/* - * Array of async queues for all the processes, one queue - * per ioprio value per ioprio_class. 
- */ -struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -/* Async queue for the idle class (ioprio is ignored) */ -struct bfq_queue *async_idle_bfqq; - /* Expiration time of sync (0) and async (1) requests, in ns. */ static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; @@ -663,30 +802,222 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, } /* - * Next two macros are just fake loops for the moment. They will - * become true loops in the cgroups-enabled variant of the code. Such - * a variant, in its turn, will be introduced by next commit. + * Scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing. + */ +static void bfq_schedule_dispatch(struct bfq_data *bfqd) +{ + if (bfqd->queued != 0) { + bfq_log(bfqd, "schedule dispatch"); + blk_mq_run_hw_queues(bfqd->queue, true); + } +} + +/** + * bfq_gt - compare two timestamps. + * @a: first ts. + * @b: second ts. + * + * Return @a > @b, dealing with wrapping correctly. + */ +static int bfq_gt(u64 a, u64 b) +{ + return (s64)(a - b) > 0; +} + +static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) +{ + struct rb_node *node = tree->rb_node; + + return rb_entry(node, struct bfq_entity, rb_node); +} + +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd); + +static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); + +/** + * bfq_update_next_in_service - update sd->next_in_service + * @sd: sched_data for which to perform the update. + * @new_entity: if not NULL, pointer to the entity whose activation, + * requeueing or repositioning triggered the invocation of + * this function. + * + * This function is called to update sd->next_in_service, which, in + * its turn, may change as a consequence of the insertion or + * extraction of an entity into/from one of the active trees of + * sd. These insertions/extractions occur as a consequence of + * activations/deactivations of entities, with some activations being + * 'true' activations, and other activations being requeueings (i.e., + * implementing the second, requeueing phase of the mechanism used to + * reposition an entity in its active tree; see comments on + * __bfq_activate_entity and __bfq_requeue_entity for details). In + * both the last two activation sub-cases, new_entity points to the + * just activated or requeued entity. + * + * Returns true if sd->next_in_service changes in such a way that + * entity->parent may become the next_in_service for its parent + * entity. */ +static bool bfq_update_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *new_entity) +{ + struct bfq_entity *next_in_service = sd->next_in_service; + bool parent_sched_may_change = false; + + /* + * If this update is triggered by the activation, requeueing + * or repositioning of an entity that does not coincide with + * sd->next_in_service, then a full lookup in the active tree + * can be avoided. In fact, it is enough to check whether the + * just-modified entity has a higher priority than + * sd->next_in_service, or, even if it has the same priority + * as sd->next_in_service, is eligible and has a lower virtual + * finish time than sd->next_in_service. If this compound + * condition holds, then the new entity becomes the new + * next_in_service. Otherwise no change is needed. + */ + if (new_entity && new_entity != sd->next_in_service) { + /* + * Flag used to decide whether to replace + * sd->next_in_service with new_entity. Tentatively
Tentatively + * set to true, and left as true if + * sd->next_in_service is NULL. + */ + bool replace_next = true; + + /* + * If there is already a next_in_service candidate + * entity, then compare class priorities or timestamps + * to decide whether to replace sd->service_tree with + * new_entity. + */ + if (next_in_service) { + unsigned int new_entity_class_idx = + bfq_class_idx(new_entity); + struct bfq_service_tree *st = + sd->service_tree + new_entity_class_idx; + + /* + * For efficiency, evaluate the most likely + * sub-condition first. + */ + replace_next = + (new_entity_class_idx == + bfq_class_idx(next_in_service) + && + !bfq_gt(new_entity->start, st->vtime) + && + bfq_gt(next_in_service->finish, + new_entity->finish)) + || + new_entity_class_idx < + bfq_class_idx(next_in_service); + } + + if (replace_next) + next_in_service = new_entity; + } else /* invoked because of a deactivation: lookup needed */ + next_in_service = bfq_lookup_next_entity(sd); + + if (next_in_service) { + parent_sched_may_change = !sd->next_in_service || + bfq_update_parent_budget(next_in_service); + } + + sd->next_in_service = next_in_service; + + if (!next_in_service) + return parent_sched_may_change; + + return parent_sched_may_change; +} + +#ifdef CONFIG_BFQ_GROUP_IOSCHED +/* both next loops stop at one of the child entities of the root group */ #define for_each_entity(entity) \ - for (; entity ; entity = NULL) + for (; entity ; entity = entity->parent) +/* + * For each iteration, compute parent in advance, so as to be safe if + * entity is deallocated during the iteration. Such a deallocation may + * happen as a consequence of a bfq_put_queue that frees the bfq_queue + * containing entity. + */ #define for_each_entity_safe(entity, parent) \ - for (parent = NULL; entity ; entity = parent) + for (; entity && ({ parent = entity->parent; 1; }); entity = parent) -static int bfq_update_next_in_service(struct bfq_sched_data *sd) +/* + * Returns true if this budget changes may let next_in_service->parent + * become the next_in_service entity for its parent entity. + */ +static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) { - return 0; + struct bfq_entity *bfqg_entity; + struct bfq_group *bfqg; + struct bfq_sched_data *group_sd; + bool ret = false; + + group_sd = next_in_service->sched_data; + + bfqg = container_of(group_sd, struct bfq_group, sched_data); + /* + * bfq_group's my_entity field is not NULL only if the group + * is not the root group. We must not touch the root entity + * as it must never become an in-service entity. + */ + bfqg_entity = bfqg->my_entity; + if (bfqg_entity) { + if (bfqg_entity->budget > next_in_service->budget) + ret = true; + bfqg_entity->budget = next_in_service->budget; + } + + return ret; +} + +/* + * This function tells whether entity stops being a candidate for next + * service, according to the following logic. + * + * This function is invoked for an entity that is about to be set in + * service. If such an entity is a queue, then the entity is no longer + * a candidate for next service (i.e, a candidate entity to serve + * after the in-service entity is expired). The function then returns + * true. + */ +static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) +{ + if (bfq_entity_to_bfqq(entity)) + return true; + + return false; } -static void bfq_check_next_in_service(struct bfq_sched_data *sd, - struct bfq_entity *entity) +#else /* CONFIG_BFQ_GROUP_IOSCHED */ +/* + * Next two macros are fake loops when cgroups support is not + * enabled. 
In fact, in such a case, there is only one level to go up + * (to reach the root group). + */ +#define for_each_entity(entity) \ + for (; entity ; entity = NULL) + +#define for_each_entity_safe(entity, parent) \ + for (parent = NULL; entity ; entity = parent) + +static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) { + return false; } -static void bfq_update_budget(struct bfq_entity *next_in_service) +static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) { + return true; } +#endif /* CONFIG_BFQ_GROUP_IOSCHED */ + /* * Shift for timestamp calculations. This actually limits the maximum * service allowed in one timestamp delta (small shift values increase it), @@ -696,18 +1027,6 @@ static void bfq_update_budget(struct bfq_entity *next_in_service) */ #define WFQ_SERVICE_SHIFT 22 -/** - * bfq_gt - compare two timestamps. - * @a: first ts. - * @b: second ts. - * - * Return @a > @b, dealing with wrapping correctly. - */ -static int bfq_gt(u64 a, u64 b) -{ - return (s64)(a - b) > 0; -} - static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) { struct bfq_queue *bfqq = NULL; @@ -926,6 +1245,11 @@ static void bfq_active_insert(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct rb_node *node = &entity->rb_node; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + struct bfq_sched_data *sd = NULL; + struct bfq_group *bfqg = NULL; + struct bfq_data *bfqd = NULL; +#endif bfq_insert(&st->active, entity); @@ -936,6 +1260,11 @@ static void bfq_active_insert(struct bfq_service_tree *st, bfq_update_active_tree(node); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + sd = entity->sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + bfqd = (struct bfq_data *)bfqg->bfqd; +#endif if (bfqq) list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); } @@ -1014,6 +1343,11 @@ static void bfq_active_extract(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct rb_node *node; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + struct bfq_sched_data *sd = NULL; + struct bfq_group *bfqg = NULL; + struct bfq_data *bfqd = NULL; +#endif node = bfq_find_deepest(&entity->rb_node); bfq_extract(&st->active, entity); @@ -1021,6 +1355,11 @@ static void bfq_active_extract(struct bfq_service_tree *st, if (node) bfq_update_active_tree(node); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + sd = entity->sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + bfqd = (struct bfq_data *)bfqg->bfqd; +#endif if (bfqq) list_del(&bfqq->bfqq_list); } @@ -1069,7 +1408,7 @@ static void bfq_forget_entity(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - entity->on_st = 0; + entity->on_st = false; st->wsum -= entity->weight; if (bfqq && !is_in_service) bfq_put_queue(bfqq); @@ -1115,7 +1454,7 @@ static void bfq_forget_idle(struct bfq_service_tree *st) static struct bfq_service_tree * __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - struct bfq_entity *entity) + struct bfq_entity *entity) { struct bfq_service_tree *new_st = old_st; @@ -1123,9 +1462,20 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); unsigned short prev_weight, new_weight; struct bfq_data *bfqd = NULL; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + struct bfq_sched_data *sd; + struct bfq_group *bfqg; +#endif if (bfqq) bfqd = bfqq->bfqd; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + sd = entity->my_sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + bfqd = (struct
bfq_data *)bfqg->bfqd; + } +#endif old_st->wsum -= entity->weight; @@ -1171,6 +1521,9 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, return new_st; } +static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); + /** * bfq_bfqq_served - update the scheduler status after selection for * service. @@ -1194,6 +1547,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) st->vtime += bfq_delta(served, st->wsum); bfq_forget_idle(st); } + bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); } @@ -1216,78 +1570,10 @@ static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) bfq_bfqq_served(bfqq, entity->budget - entity->service); } -/** - * __bfq_activate_entity - activate an entity. - * @entity: the entity being activated. - * @non_blocking_wait_rq: true if this entity was waiting for a request - * - * Called whenever an entity is activated, i.e., it is not active and one - * of its children receives a new request, or has to be reactivated due to - * budget exhaustion. It uses the current budget of the entity (and the - * service received if @entity is active) of the queue to calculate its - * timestamps. - */ -static void __bfq_activate_entity(struct bfq_entity *entity, - bool non_blocking_wait_rq) +static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, + struct bfq_service_tree *st, + bool backshifted) { - struct bfq_sched_data *sd = entity->sched_data; - struct bfq_service_tree *st = bfq_entity_service_tree(entity); - bool backshifted = false; - - if (entity == sd->in_service_entity) { - /* - * If we are requeueing the current entity we have - * to take care of not charging to it service it has - * not received. - */ - bfq_calc_finish(entity, entity->service); - entity->start = entity->finish; - sd->in_service_entity = NULL; - } else if (entity->tree == &st->active) { - /* - * Requeueing an entity due to a change of some - * next_in_service entity below it. We reuse the - * old start time. - */ - bfq_active_extract(st, entity); - } else { - unsigned long long min_vstart; - - /* See comments on bfq_fqq_update_budg_for_activation */ - if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { - backshifted = true; - min_vstart = entity->finish; - } else - min_vstart = st->vtime; - - if (entity->tree == &st->idle) { - /* - * Must be on the idle tree, bfq_idle_extract() will - * check for that. - */ - bfq_idle_extract(st, entity); - entity->start = bfq_gt(min_vstart, entity->finish) ? - min_vstart : entity->finish; - } else { - /* - * The finish time of the entity may be invalid, and - * it is in the past for sure, otherwise the queue - * would have been on the idle tree. - */ - entity->start = min_vstart; - st->wsum += entity->weight; - /* - * entity is about to be inserted into a service tree, - * and then set in service: get a reference to make - * sure entity does not disappear until it is no - * longer in service or scheduled for service. - */ - bfq_get_entity(entity); - - entity->on_st = 1; - } - } - st = __bfq_entity_update_weight_prio(st, entity); bfq_calc_finish(entity, entity->budget); @@ -1329,27 +1615,185 @@ static void __bfq_activate_entity(struct bfq_entity *entity, } /** - * bfq_activate_entity - activate an entity and its ancestors if necessary. + * __bfq_activate_entity - handle activation of entity. + * @entity: the entity being activated. 
+ * @non_blocking_wait_rq: true if entity was waiting for a request + * + * Called for a 'true' activation, i.e., if entity is not active and + * one of its children receives a new request. + * + * Basically, this function updates the timestamps of entity and + * inserts entity into its active tree, after possibly extracting it + * from its idle tree. + */ +static void __bfq_activate_entity(struct bfq_entity *entity, + bool non_blocking_wait_rq) +{ + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + bool backshifted = false; + unsigned long long min_vstart; + + /* See comments on bfq_fqq_update_budg_for_activation */ + if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { + backshifted = true; + min_vstart = entity->finish; + } else + min_vstart = st->vtime; + + if (entity->tree == &st->idle) { + /* + * Must be on the idle tree, bfq_idle_extract() will + * check for that. + */ + bfq_idle_extract(st, entity); + entity->start = bfq_gt(min_vstart, entity->finish) ? + min_vstart : entity->finish; + } else { + /* + * The finish time of the entity may be invalid, and + * it is in the past for sure, otherwise the queue + * would have been on the idle tree. + */ + entity->start = min_vstart; + st->wsum += entity->weight; + /* + * entity is about to be inserted into a service tree, + * and then set in service: get a reference to make + * sure entity does not disappear until it is no + * longer in service or scheduled for service. + */ + bfq_get_entity(entity); + + entity->on_st = true; + } + + bfq_update_fin_time_enqueue(entity, st, backshifted); +} + +/** + * __bfq_requeue_entity - handle requeueing or repositioning of an entity. + * @entity: the entity being requeued or repositioned. + * + * Requeueing is needed if this entity stops being served, which + * happens if a leaf descendant entity has expired. On the other hand, + * repositioning is needed if the next_in_service entity for the child + * entity has changed. See the comments inside the function for + * details. + * + * Basically, this function: 1) removes entity from its active tree if + * present there, 2) updates the timestamps of entity and 3) inserts + * entity back into its active tree (in the new, right position for + * the new values of the timestamps). + */ +static void __bfq_requeue_entity(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + + if (entity == sd->in_service_entity) { + /* + * We are requeueing the current in-service entity, + * which may have to be done for one of the following + * reasons: + * - entity represents the in-service queue, and the + * in-service queue is being requeued after an + * expiration; + * - entity represents a group, and its budget has + * changed because one of its child entities has + * just been either activated or requeued for some + * reason; the timestamps of the entity need then to + * be updated, and the entity needs to be enqueued + * or repositioned accordingly. + * + * In particular, before requeueing, the start time of + * the entity must be moved forward to account for the + * service that the entity has received while in + * service. This is done by the next instructions. The + * finish time will then be updated according to this + * new value of the start time, and to the budget of + * the entity.
+ */ + bfq_calc_finish(entity, entity->service); + entity->start = entity->finish; + /* + * In addition, if the entity had more than one child + * when set in service, then was not extracted from + * the active tree. This implies that the position of + * the entity in the active tree may need to be + * changed now, because we have just updated the start + * time of the entity, and we will update its finish + * time in a moment (the requeueing is then, more + * precisely, a repositioning in this case). To + * implement this repositioning, we: 1) dequeue the + * entity here, 2) update the finish time and + * requeue the entity according to the new + * timestamps below. + */ + if (entity->tree) + bfq_active_extract(st, entity); + } else { /* The entity is already active, and not in service */ + /* + * In this case, this function gets called only if the + * next_in_service entity below this entity has + * changed, and this change has caused the budget of + * this entity to change, which, finally implies that + * the finish time of this entity must be + * updated. Such an update may cause the scheduling, + * i.e., the position in the active tree, of this + * entity to change. We handle this change by: 1) + * dequeueing the entity here, 2) updating the finish + * time and requeueing the entity according to the new + * timestamps below. This is the same approach as the + * non-extracted-entity sub-case above. + */ + bfq_active_extract(st, entity); + } + + bfq_update_fin_time_enqueue(entity, st, false); +} + +static void __bfq_activate_requeue_entity(struct bfq_entity *entity, + struct bfq_sched_data *sd, + bool non_blocking_wait_rq) +{ + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + + if (sd->in_service_entity == entity || entity->tree == &st->active) + /* + * in service or already queued on the active tree, + * requeue or reposition + */ + __bfq_requeue_entity(entity); + else + /* + * Not in service and not queued on its active tree: + * the activity is idle and this is a true activation. + */ + __bfq_activate_entity(entity, non_blocking_wait_rq); +} + + +/** + * bfq_activate_entity - activate or requeue an entity representing a bfq_queue, + * and activate, requeue or reposition all ancestors + * for which such an update becomes necessary. * @entity: the entity to activate. * @non_blocking_wait_rq: true if this entity was waiting for a request - * - * Activate @entity and all the entities on the path from it to the root. + * @requeue: true if this is a requeue, which implies that bfqq is + * being expired; thus ALL its ancestors stop being served and must + * therefore be requeued */ -static void bfq_activate_entity(struct bfq_entity *entity, - bool non_blocking_wait_rq) +static void bfq_activate_requeue_entity(struct bfq_entity *entity, + bool non_blocking_wait_rq, + bool requeue) { struct bfq_sched_data *sd; for_each_entity(entity) { - __bfq_activate_entity(entity, non_blocking_wait_rq); - sd = entity->sched_data; - if (!bfq_update_next_in_service(sd)) - /* - * No need to propagate the activation to the - * upper entities, as they will be updated when - * the in-service entity is rescheduled. - */ + __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); + + if (!bfq_update_next_in_service(sd, entity) && !requeue) break; } } @@ -1357,52 +1801,48 @@ static void bfq_activate_entity(struct bfq_entity *entity, /** * __bfq_deactivate_entity - deactivate an entity from its service tree. * @entity: the entity to deactivate. 
- * @requeue: if false, the entity will not be put into the idle tree. + * @ins_into_idle_tree: if false, the entity will not be put into the + * idle tree. * - * Deactivate an entity, independently from its previous state. If the - * entity was not on a service tree just return, otherwise if it is on - * any scheduler tree, extract it from that tree, and if necessary - * and if the caller did not specify @requeue, put it on the idle tree. - * - * Return %1 if the caller should update the entity hierarchy, i.e., - * if the entity was in service or if it was the next_in_service for - * its sched_data; return %0 otherwise. + * Deactivates an entity, independently from its previous state. Must + * be invoked only if entity is on a service tree. Extracts the entity + * from that tree, and if necessary and allowed, puts it on the idle + * tree. */ -static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +static bool __bfq_deactivate_entity(struct bfq_entity *entity, + bool ins_into_idle_tree) { struct bfq_sched_data *sd = entity->sched_data; struct bfq_service_tree *st = bfq_entity_service_tree(entity); int is_in_service = entity == sd->in_service_entity; - int ret = 0; - if (!entity->on_st) - return 0; + if (!entity->on_st) /* entity never activated, or already inactive */ + return false; - if (is_in_service) { + if (is_in_service) bfq_calc_finish(entity, entity->service); - sd->in_service_entity = NULL; - } else if (entity->tree == &st->active) + + if (entity->tree == &st->active) bfq_active_extract(st, entity); - else if (entity->tree == &st->idle) + else if (!is_in_service && entity->tree == &st->idle) bfq_idle_extract(st, entity); - if (is_in_service || sd->next_in_service == entity) - ret = bfq_update_next_in_service(sd); - - if (!requeue || !bfq_gt(entity->finish, st->vtime)) + if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime)) bfq_forget_entity(st, entity, is_in_service); else bfq_idle_insert(st, entity); - return ret; + return true; } /** - * bfq_deactivate_entity - deactivate an entity. + * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. * @entity: the entity to deactivate. - * @requeue: true if the entity can be put on the idle tree + * @ins_into_idle_tree: true if the entity can be put on the idle tree */ -static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +static void bfq_deactivate_entity(struct bfq_entity *entity, + bool ins_into_idle_tree, + bool expiration) { struct bfq_sched_data *sd; struct bfq_entity *parent = NULL; @@ -1410,63 +1850,102 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) for_each_entity_safe(entity, parent) { sd = entity->sched_data; - if (!__bfq_deactivate_entity(entity, requeue)) + if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) { /* - * The parent entity is still backlogged, and - * we don't need to update it as it is still - * in service. + * entity is not in any tree any more, so + * this deactivation is a no-op, and there is + * nothing to change for upper-level entities + * (in case of expiration, this can never + * happen). */ - break; + return; + } + + if (sd->next_in_service == entity) + /* + * entity was the next_in_service entity, + * then, since entity has just been + * deactivated, a new one must be found. + */ + bfq_update_next_in_service(sd, NULL); if (sd->next_in_service) /* - * The parent entity is still backlogged and - * the budgets on the path towards the root - * need to be updated. 
+ * The parent entity is still backlogged, + * because next_in_service is not NULL. So, no + * further upwards deactivation must be + * performed. Yet, next_in_service has + * changed. Then the schedule does need to be + * updated upwards. */ - goto update; + break; /* - * If we get here, then the parent is no more backlogged and - * we want to propagate the deactivation upwards. + * If we get here, then the parent is no more + * backlogged and we need to propagate the + * deactivation upwards. Thus let the loop go on. */ - requeue = 1; - } - return; + /* + * Also let parent be queued into the idle tree on + * deactivation, to preserve service guarantees, and + * assuming that who invoked this function does not + * need parent entities too to be removed completely. + */ + ins_into_idle_tree = true; + } -update: + /* + * If the deactivation loop is fully executed, then there are + * no more entities to touch and next loop is not executed at + * all. Otherwise, requeue remaining entities if they are + * about to stop receiving service, or reposition them if this + * is not the case. + */ entity = parent; for_each_entity(entity) { - __bfq_activate_entity(entity, false); + /* + * Invoke __bfq_requeue_entity on entity, even if + * already active, to requeue/reposition it in the + * active tree (because sd->next_in_service has + * changed) + */ + __bfq_requeue_entity(entity); sd = entity->sched_data; - if (!bfq_update_next_in_service(sd)) + if (!bfq_update_next_in_service(sd, entity) && + !expiration) + /* + * next_in_service unchanged or not causing + * any change in entity->parent->sd, and no + * requeueing needed for expiration: stop + * here. + */ break; } } /** - * bfq_update_vtime - update vtime if necessary. + * bfq_calc_vtime_jump - compute the value to which the vtime should jump, + * if needed, to have at least one entity eligible. * @st: the service tree to act upon. * - * If necessary update the service tree vtime to have at least one - * eligible entity, skipping to its start time. Assumes that the - * active tree of the device is not empty. - * - * NOTE: this hierarchical implementation updates vtimes quite often, - * we may end up with reactivated processes getting timestamps after a - * vtime skip done because we needed a ->first_active entity on some - * intermediate node. + * Assumes that st is not empty. */ -static void bfq_update_vtime(struct bfq_service_tree *st) +static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) { - struct bfq_entity *entry; - struct rb_node *node = st->active.rb_node; + struct bfq_entity *root_entity = bfq_root_active_entity(&st->active); + + if (bfq_gt(root_entity->min_start, st->vtime)) + return root_entity->min_start; + + return st->vtime; +} - entry = rb_entry(node, struct bfq_entity, rb_node); - if (bfq_gt(entry->min_start, st->vtime)) { - st->vtime = entry->min_start; +static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value) +{ + if (new_value > st->vtime) { + st->vtime = new_value; bfq_forget_idle(st); } } @@ -1475,6 +1954,7 @@ static void bfq_update_vtime(struct bfq_service_tree *st) * bfq_first_active_entity - find the eligible entity with * the smallest finish time * @st: the service tree to select from. 
+ * @vtime: the system virtual time to use as a reference for eligibility * * This function searches the first schedulable entity, starting from the * root of the tree and going on the left every time on this side there is @@ -1482,7 +1962,8 @@ static void bfq_update_vtime(struct bfq_service_tree *st) * the right is followed only if a) the left subtree contains no eligible * entities and b) no eligible entity has been found yet. */ -static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) +static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, + u64 vtime) { struct bfq_entity *entry, *first = NULL; struct rb_node *node = st->active.rb_node; @@ -1490,13 +1971,13 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) while (node) { entry = rb_entry(node, struct bfq_entity, rb_node); left: - if (!bfq_gt(entry->start, st->vtime)) + if (!bfq_gt(entry->start, vtime)) first = entry; if (node->rb_left) { entry = rb_entry(node->rb_left, struct bfq_entity, rb_node); - if (!bfq_gt(entry->min_start, st->vtime)) { + if (!bfq_gt(entry->min_start, vtime)) { node = node->rb_left; goto left; } @@ -1506,222 +1987,1429 @@ left: node = node->rb_right; } - return first; + return first; +} + +/** + * __bfq_lookup_next_entity - return the first eligible entity in @st. + * @st: the service tree. + * + * If there is no in-service entity for the sched_data st belongs to, + * then return the entity that will be set in service if: + * 1) the parent entity this st belongs to is set in service; + * 2) no entity belonging to such parent entity undergoes a state change + * that would influence the timestamps of the entity (e.g., becomes idle, + * becomes backlogged, changes its budget, ...). + * + * In this first case, update the virtual time in @st too (see the + * comments on this update inside the function). + * + * In contrast, if there is an in-service entity, then return the + * entity that would be set in service if not only the above + * conditions, but also the next one held true: the currently + * in-service entity, on expiration, + * 1) gets a finish time equal to the current one, or + * 2) is not eligible any more, or + * 3) is idle. + */ +static struct bfq_entity * +__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) +{ + struct bfq_entity *entity; + u64 new_vtime; + + if (RB_EMPTY_ROOT(&st->active)) + return NULL; + + /* + * Get the value of the system virtual time for which at + * least one entity is eligible. + */ + new_vtime = bfq_calc_vtime_jump(st); + + /* + * If there is no in-service entity for the sched_data this + * active tree belongs to, then push the system virtual time + * up to the value that guarantees that at least one entity is + * eligible. If, instead, there is an in-service entity, then + * do not make any such update, because there is already an + * eligible entity, namely the in-service one (even if the + * entity is not on st, because it was extracted when set in + * service). + */ + if (!in_service) + bfq_update_vtime(st, new_vtime); + + entity = bfq_first_active_entity(st, new_vtime); + + return entity; +} + +/** + * bfq_lookup_next_entity - return the first eligible entity in @sd. + * @sd: the sched_data. + * + * This function is invoked when there has been a change in the trees + * for sd, and we need to know what is the new next entity after this + * change.
+ */ +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) +{ + struct bfq_service_tree *st = sd->service_tree; + struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); + struct bfq_entity *entity = NULL; + int class_idx = 0; + + /* + * Choose from idle class, if needed to guarantee a minimum + * bandwidth to this class (and if there is some active entity + * in idle class). This should also mitigate + * priority-inversion problems in case a low priority task is + * holding file system resources. + */ + if (time_is_before_jiffies(sd->bfq_class_idle_last_service + + BFQ_CL_IDLE_TIMEOUT)) { + if (!RB_EMPTY_ROOT(&idle_class_st->active)) + class_idx = BFQ_IOPRIO_CLASSES - 1; + /* About to be served if backlogged, or not yet backlogged */ + sd->bfq_class_idle_last_service = jiffies; + } + + /* + * Find the next entity to serve for the highest-priority + * class, unless the idle class needs to be served. + */ + for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { + entity = __bfq_lookup_next_entity(st + class_idx, + sd->in_service_entity); + + if (entity) + break; + } + + if (!entity) + return NULL; + + return entity; +} + +static bool next_queue_may_preempt(struct bfq_data *bfqd) +{ + struct bfq_sched_data *sd = &bfqd->root_group->sched_data; + + return sd->next_in_service != sd->in_service_entity; +} + +/* + * Get next queue for service. + */ +static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) +{ + struct bfq_entity *entity = NULL; + struct bfq_sched_data *sd; + struct bfq_queue *bfqq; + + if (bfqd->busy_queues == 0) + return NULL; + + /* + * Traverse the path from the root to the leaf entity to + * serve. Set in service all the entities visited along the + * way. + */ + sd = &bfqd->root_group->sched_data; + for (; sd ; sd = entity->my_sched_data) { + /* + * WARNING. We are about to set the in-service entity + * to sd->next_in_service, i.e., to the (cached) value + * returned by bfq_lookup_next_entity(sd) the last + * time it was invoked, i.e., the last time when the + * service order in sd changed as a consequence of the + * activation or deactivation of an entity. In this + * respect, if we execute bfq_lookup_next_entity(sd) + * in this very moment, it may, although with low + * probability, yield a different entity than that + * pointed to by sd->next_in_service. This rare event + * happens in case there was no CLASS_IDLE entity to + * serve for sd when bfq_lookup_next_entity(sd) was + * invoked for the last time, while there is now one + * such entity. + * + * If the above event happens, then the scheduling of + * such entity in CLASS_IDLE is postponed until the + * service of the sd->next_in_service entity + * finishes. In fact, when the latter is expired, + * bfq_lookup_next_entity(sd) gets called again, + * exactly to update sd->next_in_service. + */ + + /* Make next_in_service entity become in_service_entity */ + entity = sd->next_in_service; + sd->in_service_entity = entity; + + /* + * Reset the accumulator of the amount of service that + * the entity is about to receive. + */ + entity->service = 0; + + /* + * If entity is no longer a candidate for next + * service, then we extract it from its active tree, + * for the following reason. To further boost the + * throughput in some special case, BFQ needs to know + * which is the next candidate entity to serve, while + * there is already an entity in service. 
In this + * respect, to make it easy to compute/update the next + * candidate entity to serve after the current + * candidate has been set in service, there is a case + * where it is necessary to extract the current + * candidate from its service tree. Such a case is + * when the entity just set in service cannot be also + * a candidate for next service. Details about when + * this conditions holds are reported in the comments + * on the function bfq_no_longer_next_in_service() + * invoked below. + */ + if (bfq_no_longer_next_in_service(entity)) + bfq_active_extract(bfq_entity_service_tree(entity), + entity); + + /* + * For the same reason why we may have just extracted + * entity from its active tree, we may need to update + * next_in_service for the sched_data of entity too, + * regardless of whether entity has been extracted. + * In fact, even if entity has not been extracted, a + * descendant entity may get extracted. Such an event + * would cause a change in next_in_service for the + * level of the descendant entity, and thus possibly + * back to upper levels. + * + * We cannot perform the resulting needed update + * before the end of this loop, because, to know which + * is the correct next-to-serve candidate entity for + * each level, we need first to find the leaf entity + * to set in service. In fact, only after we know + * which is the next-to-serve leaf entity, we can + * discover whether the parent entity of the leaf + * entity becomes the next-to-serve, and so on. + */ + + } + + bfqq = bfq_entity_to_bfqq(entity); + + /* + * We can finally update all next-to-serve entities along the + * path from the leaf entity just set in service to the root. + */ + for_each_entity(entity) { + struct bfq_sched_data *sd = entity->sched_data; + + if (!bfq_update_next_in_service(sd, NULL)) + break; + } + + return bfqq; +} + +static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) +{ + struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; + struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; + struct bfq_entity *entity = in_serv_entity; + + if (bfqd->in_service_bic) { + put_io_context(bfqd->in_service_bic->icq.ioc); + bfqd->in_service_bic = NULL; + } + + bfq_clear_bfqq_wait_request(in_serv_bfqq); + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); + bfqd->in_service_queue = NULL; + + /* + * When this function is called, all in-service entities have + * been properly deactivated or requeued, so we can safely + * execute the final step: reset in_service_entity along the + * path from entity to the root. + */ + for_each_entity(entity) + entity->sched_data->in_service_entity = NULL; + + /* + * in_serv_entity is no longer in service, so, if it is in no + * service tree either, then release the service reference to + * the queue it represents (taken with bfq_get_entity). 
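	 *
	 * (A sketch of the reference pairing, assuming the reference
	 * was taken in bfq_get_entity() when the entity was activated:
	 *
	 *	bfqq->ref++;		// bfq_get_entity()
	 *	...
	 *	bfq_put_queue(bfqq);	// here, once off every tree
	 * )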
+ */ + if (!in_serv_entity->on_st) + bfq_put_queue(in_serv_bfqq); +} + +static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool ins_into_idle_tree, bool expiration) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_deactivate_entity(entity, ins_into_idle_tree, expiration); +} + +static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), + false); + bfq_clear_bfqq_non_blocking_wait_rq(bfqq); +} + +static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_activate_requeue_entity(entity, false, + bfqq == bfqd->in_service_queue); +} + +static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); + +/* + * Called when the bfqq no longer has requests pending, remove it from + * the service tree. As a special case, it can be invoked during an + * expiration. + */ +static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool expiration) +{ + bfq_log_bfqq(bfqd, bfqq, "del from busy"); + + bfq_clear_bfqq_busy(bfqq); + + bfqd->busy_queues--; + + bfqg_stats_update_dequeue(bfqq_group(bfqq)); + + bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); +} + +/* + * Called when an inactive queue receives a new request. + */ +static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqd, bfqq, "add to busy"); + + bfq_activate_bfqq(bfqd, bfqq); + + bfq_mark_bfqq_busy(bfqq); + bfqd->busy_queues++; +} + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + +/* bfqg stats flags */ +enum bfqg_stats_flags { + BFQG_stats_waiting = 0, + BFQG_stats_idling, + BFQG_stats_empty, +}; + +#define BFQG_FLAG_FNS(name) \ +static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ +{ \ + stats->flags |= (1 << BFQG_stats_##name); \ +} \ +static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ +{ \ + stats->flags &= ~(1 << BFQG_stats_##name); \ +} \ +static int bfqg_stats_##name(struct bfqg_stats *stats) \ +{ \ + return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ +} \ + +BFQG_FLAG_FNS(waiting) +BFQG_FLAG_FNS(idling) +BFQG_FLAG_FNS(empty) +#undef BFQG_FLAG_FNS + +/* This should be called with the queue_lock held. */ +static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) +{ + unsigned long long now; + + if (!bfqg_stats_waiting(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_group_wait_time)) + blkg_stat_add(&stats->group_wait_time, + now - stats->start_group_wait_time); + bfqg_stats_clear_waiting(stats); +} + +/* This should be called with the queue_lock held. */ +static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, + struct bfq_group *curr_bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + if (bfqg_stats_waiting(stats)) + return; + if (bfqg == curr_bfqg) + return; + stats->start_group_wait_time = sched_clock(); + bfqg_stats_mark_waiting(stats); +} + +/* This should be called with the queue_lock held. 
*/ +static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) +{ + unsigned long long now; + + if (!bfqg_stats_empty(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_empty_time)) + blkg_stat_add(&stats->empty_time, + now - stats->start_empty_time); + bfqg_stats_clear_empty(stats); +} + +static void bfqg_stats_update_dequeue(struct bfq_group *bfqg) +{ + blkg_stat_add(&bfqg->stats.dequeue, 1); +} + +static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + if (blkg_rwstat_total(&stats->queued)) + return; + + /* + * group is already marked empty. This can happen if bfqq got new + * request in parent group and moved to this group while being added + * to service tree. Just ignore the event and move on. + */ + if (bfqg_stats_empty(stats)) + return; + + stats->start_empty_time = sched_clock(); + bfqg_stats_mark_empty(stats); +} + +static void bfqg_stats_update_idle_time(struct bfq_group *bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + if (bfqg_stats_idling(stats)) { + unsigned long long now = sched_clock(); + + if (time_after64(now, stats->start_idle_time)) + blkg_stat_add(&stats->idle_time, + now - stats->start_idle_time); + bfqg_stats_clear_idling(stats); + } +} + +static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + stats->start_idle_time = sched_clock(); + bfqg_stats_mark_idling(stats); +} + +static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + blkg_stat_add(&stats->avg_queue_size_sum, + blkg_rwstat_total(&stats->queued)); + blkg_stat_add(&stats->avg_queue_size_samples, 1); + bfqg_stats_update_group_wait_time(stats); +} + +/* + * blk-cgroup policy-related handlers + * The following functions help in converting between blk-cgroup + * internal structures and BFQ-specific structures. + */ + +static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct bfq_group, pd) : NULL; +} + +static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) +{ + return pd_to_blkg(&bfqg->pd); +} + +static struct blkcg_policy blkcg_policy_bfq; + +static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) +{ + return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq)); +} + +/* + * bfq_group handlers + * The following functions help in navigating the bfq_group hierarchy + * by allowing to find the parent of a bfq_group or the bfq_group + * associated to a bfq_queue. + */ + +static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) +{ + struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; + + return pblkg ? blkg_to_bfqg(pblkg) : NULL; +} + +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) +{ + struct bfq_entity *group_entity = bfqq->entity.parent; + + return group_entity ? container_of(group_entity, struct bfq_group, + entity) : + bfqq->bfqd->root_group; +} + +/* + * The following two functions handle get and put of a bfq_group by + * wrapping the related blk-cgroup hooks. 
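+ *
+ * A typical usage pattern (hypothetical sketch, mirroring what
+ * bfq_bfqq_move() does further below):
+ *
+ *	bfqg_put(bfqq_group(bfqq));	// drop ref to the old group
+ *	entity->parent = bfqg->my_entity;
+ *	bfqg_get(bfqg);			// take ref to the new group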
+ */
+
+static void bfqg_get(struct bfq_group *bfqg)
+{
+	return blkg_get(bfqg_to_blkg(bfqg));
+}
+
+static void bfqg_put(struct bfq_group *bfqg)
+{
+	return blkg_put(bfqg_to_blkg(bfqg));
+}
+
+static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
+				     struct bfq_queue *bfqq,
+				     unsigned int op)
+{
+	blkg_rwstat_add(&bfqg->stats.queued, op, 1);
+	bfqg_stats_end_empty_time(&bfqg->stats);
+	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
+		bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
+}
+
+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
+{
+	blkg_rwstat_add(&bfqg->stats.queued, op, -1);
+}
+
+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
+{
+	blkg_rwstat_add(&bfqg->stats.merged, op, 1);
+}
+
+static void bfqg_stats_update_completion(struct bfq_group *bfqg,
+			uint64_t start_time, uint64_t io_start_time,
+			unsigned int op)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+	unsigned long long now = sched_clock();
+
+	if (time_after64(now, io_start_time))
+		blkg_rwstat_add(&stats->service_time, op,
+				now - io_start_time);
+	if (time_after64(io_start_time, start_time))
+		blkg_rwstat_add(&stats->wait_time, op,
+				io_start_time - start_time);
+}
+
+/* @stats = 0 */
+static void bfqg_stats_reset(struct bfqg_stats *stats)
+{
+	/* queued stats shouldn't be cleared */
+	blkg_rwstat_reset(&stats->merged);
+	blkg_rwstat_reset(&stats->service_time);
+	blkg_rwstat_reset(&stats->wait_time);
+	blkg_stat_reset(&stats->time);
+	blkg_stat_reset(&stats->avg_queue_size_sum);
+	blkg_stat_reset(&stats->avg_queue_size_samples);
+	blkg_stat_reset(&stats->dequeue);
+	blkg_stat_reset(&stats->group_wait_time);
+	blkg_stat_reset(&stats->idle_time);
+	blkg_stat_reset(&stats->empty_time);
+}
+
+/* @to += @from */
+static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
+{
+	if (!to || !from)
+		return;
+
+	/* queued stats shouldn't be cleared */
+	blkg_rwstat_add_aux(&to->merged, &from->merged);
+	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
+	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
+	blkg_stat_add_aux(&to->time, &from->time);
+	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+	blkg_stat_add_aux(&to->avg_queue_size_samples,
+			  &from->avg_queue_size_samples);
+	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
+	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
+	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
+}
+
+/*
+ * Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
+ * recursive stats can still account for the amount used by this bfqg after
+ * it's gone.
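+ *
+ * Sketch of the intended effect (names as in the code below):
+ *
+ *	bfqg_stats_add_aux(&parent->stats, &bfqg->stats);	// to += from
+ *	bfqg_stats_reset(&bfqg->stats);				// from = 0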
+ */ +static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) +{ + struct bfq_group *parent; + + if (!bfqg) /* root_group */ + return; + + parent = bfqg_parent(bfqg); + + lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); + + if (unlikely(!parent)) + return; + + bfqg_stats_add_aux(&parent->stats, &bfqg->stats); + bfqg_stats_reset(&bfqg->stats); +} + +static void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + entity->weight = entity->new_weight; + entity->orig_weight = entity->new_weight; + if (bfqq) { + bfqq->ioprio = bfqq->new_ioprio; + bfqq->ioprio_class = bfqq->new_ioprio_class; + bfqg_get(bfqg); + } + entity->parent = bfqg->my_entity; /* NULL for root group */ + entity->sched_data = &bfqg->sched_data; +} + +static void bfqg_stats_exit(struct bfqg_stats *stats) +{ + blkg_rwstat_exit(&stats->merged); + blkg_rwstat_exit(&stats->service_time); + blkg_rwstat_exit(&stats->wait_time); + blkg_rwstat_exit(&stats->queued); + blkg_stat_exit(&stats->time); + blkg_stat_exit(&stats->avg_queue_size_sum); + blkg_stat_exit(&stats->avg_queue_size_samples); + blkg_stat_exit(&stats->dequeue); + blkg_stat_exit(&stats->group_wait_time); + blkg_stat_exit(&stats->idle_time); + blkg_stat_exit(&stats->empty_time); +} + +static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) +{ + if (blkg_rwstat_init(&stats->merged, gfp) || + blkg_rwstat_init(&stats->service_time, gfp) || + blkg_rwstat_init(&stats->wait_time, gfp) || + blkg_rwstat_init(&stats->queued, gfp) || + blkg_stat_init(&stats->time, gfp) || + blkg_stat_init(&stats->avg_queue_size_sum, gfp) || + blkg_stat_init(&stats->avg_queue_size_samples, gfp) || + blkg_stat_init(&stats->dequeue, gfp) || + blkg_stat_init(&stats->group_wait_time, gfp) || + blkg_stat_init(&stats->idle_time, gfp) || + blkg_stat_init(&stats->empty_time, gfp)) { + bfqg_stats_exit(stats); + return -ENOMEM; + } + + return 0; +} + +static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) +{ + return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; +} + +static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) +{ + return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); +} + +static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) +{ + struct bfq_group_data *bgd; + + bgd = kzalloc(sizeof(*bgd), gfp); + if (!bgd) + return NULL; + return &bgd->pd; +} + +static void bfq_cpd_init(struct blkcg_policy_data *cpd) +{ + struct bfq_group_data *d = cpd_to_bfqgd(cpd); + + d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? 
+ CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; +} + +static void bfq_cpd_free(struct blkcg_policy_data *cpd) +{ + kfree(cpd_to_bfqgd(cpd)); +} + +static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) +{ + struct bfq_group *bfqg; + + bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); + if (!bfqg) + return NULL; + + if (bfqg_stats_init(&bfqg->stats, gfp)) { + kfree(bfqg); + return NULL; + } + + return &bfqg->pd; +} + +static void bfq_pd_init(struct blkg_policy_data *pd) +{ + struct blkcg_gq *blkg = pd_to_blkg(pd); + struct bfq_group *bfqg = blkg_to_bfqg(blkg); + struct bfq_data *bfqd = blkg->q->elevator->elevator_data; + struct bfq_entity *entity = &bfqg->entity; + struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); + + entity->orig_weight = entity->weight = entity->new_weight = d->weight; + entity->my_sched_data = &bfqg->sched_data; + bfqg->my_entity = entity; /* + * the root_group's will be set to NULL + * in bfq_init_queue() + */ + bfqg->bfqd = bfqd; +} + +static void bfq_pd_free(struct blkg_policy_data *pd) +{ + struct bfq_group *bfqg = pd_to_bfqg(pd); + + bfqg_stats_exit(&bfqg->stats); + return kfree(bfqg); +} + +static void bfq_pd_reset_stats(struct blkg_policy_data *pd) +{ + struct bfq_group *bfqg = pd_to_bfqg(pd); + + bfqg_stats_reset(&bfqg->stats); +} + +static void bfq_group_set_parent(struct bfq_group *bfqg, + struct bfq_group *parent) +{ + struct bfq_entity *entity; + + entity = &bfqg->entity; + entity->parent = parent->my_entity; + entity->sched_data = &parent->sched_data; +} + +static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, + struct blkcg *blkcg) +{ + struct blkcg_gq *blkg; + + blkg = blkg_lookup(blkcg, bfqd->queue); + if (likely(blkg)) + return blkg_to_bfqg(blkg); + return NULL; +} + +static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, + struct blkcg *blkcg) +{ + struct bfq_group *bfqg, *parent; + struct bfq_entity *entity; + + bfqg = bfq_lookup_bfqg(bfqd, blkcg); + + if (unlikely(!bfqg)) + return NULL; + + /* + * Update chain of bfq_groups as we might be handling a leaf group + * which, along with some of its relatives, has not been hooked yet + * to the private hierarchy of BFQ. + */ + entity = &bfqg->entity; + for_each_entity(entity) { + bfqg = container_of(entity, struct bfq_group, entity); + if (bfqg != bfqd->root_group) { + parent = bfqg_parent(bfqg); + if (!parent) + parent = bfqd->root_group; + bfq_group_set_parent(bfqg, parent); + } + } + + return bfqg; +} + +static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool compensate, + enum bfqq_expiration reason); + +/** + * bfq_bfqq_move - migrate @bfqq to @bfqg. + * @bfqd: queue descriptor. + * @bfqq: the queue to move. + * @bfqg: the group to move to. + * + * Move @bfqq to @bfqg, deactivating it from its old group and reactivating + * it on the new one. Avoid putting the entity on the old group idle tree. + * + * Must be called under the queue lock; the cgroup owning @bfqg must + * not disappear (by now this just means that we are called under + * rcu_read_lock()). + */ +static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_group *bfqg) +{ + struct bfq_entity *entity = &bfqq->entity; + + /* If bfqq is empty, then bfq_bfqq_expire also invokes + * bfq_del_bfqq_busy, thereby removing bfqq and its entity + * from data structures related to current group. Otherwise we + * need to remove bfqq explicitly with bfq_deactivate_bfqq, as + * we do below. 
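	 *
	 * In short, the sequence below is (a sketch):
	 *   1) expire bfqq if it is in service;
	 *   2) detach its entity from the old group (deactivate it,
	 *      or put the idle entity);
	 *   3) re-point entity->parent and entity->sched_data to bfqg;
	 *   4) re-activate bfqq if it is busy.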
+ */ + if (bfqq == bfqd->in_service_queue) + bfq_bfqq_expire(bfqd, bfqd->in_service_queue, + false, BFQQE_PREEMPTED); + + if (bfq_bfqq_busy(bfqq)) + bfq_deactivate_bfqq(bfqd, bfqq, false, false); + else if (entity->on_st) + bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); + bfqg_put(bfqq_group(bfqq)); + + /* + * Here we use a reference to bfqg. We don't need a refcounter + * as the cgroup reference will not be dropped, so that its + * destroy() callback will not be invoked. + */ + entity->parent = bfqg->my_entity; + entity->sched_data = &bfqg->sched_data; + bfqg_get(bfqg); + + if (bfq_bfqq_busy(bfqq)) + bfq_activate_bfqq(bfqd, bfqq); + + if (!bfqd->in_service_queue && !bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); +} + +/** + * __bfq_bic_change_cgroup - move @bic to @cgroup. + * @bfqd: the queue descriptor. + * @bic: the bic to move. + * @blkcg: the blk-cgroup to move to. + * + * Move bic to blkcg, assuming that bfqd->queue is locked; the caller + * has to make sure that the reference to cgroup is valid across the call. + * + * NOTE: an alternative approach might have been to store the current + * cgroup in bfqq and getting a reference to it, reducing the lookup + * time here, at the price of slightly more complex code. + */ +static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct blkcg *blkcg) +{ + struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); + struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); + struct bfq_group *bfqg; + struct bfq_entity *entity; + + bfqg = bfq_find_set_group(bfqd, blkcg); + + if (unlikely(!bfqg)) + bfqg = bfqd->root_group; + + if (async_bfqq) { + entity = &async_bfqq->entity; + + if (entity->sched_data != &bfqg->sched_data) { + bic_set_bfqq(bic, NULL, 0); + bfq_log_bfqq(bfqd, async_bfqq, + "bic_change_group: %p %d", + async_bfqq, + async_bfqq->ref); + bfq_put_queue(async_bfqq); + } + } + + if (sync_bfqq) { + entity = &sync_bfqq->entity; + if (entity->sched_data != &bfqg->sched_data) + bfq_bfqq_move(bfqd, sync_bfqq, bfqg); + } + + return bfqg; +} + +static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) +{ + struct bfq_data *bfqd = bic_to_bfqd(bic); + struct bfq_group *bfqg = NULL; + uint64_t serial_nr; + + rcu_read_lock(); + serial_nr = bio_blkcg(bio)->css.serial_nr; + + /* + * Check whether blkcg has changed. The condition may trigger + * spuriously on a newly created cic but there's no harm. + */ + if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) + goto out; + + bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); + bic->blkcg_serial_nr = serial_nr; +out: + rcu_read_unlock(); +} + +/** + * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. + * @st: the service tree being flushed. + */ +static void bfq_flush_idle_tree(struct bfq_service_tree *st) +{ + struct bfq_entity *entity = st->first_idle; + + for (; entity ; entity = st->first_idle) + __bfq_deactivate_entity(entity, false); +} + +/** + * bfq_reparent_leaf_entity - move leaf entity to the root_group. + * @bfqd: the device data structure with the root group. + * @entity: the entity to move. + */ +static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); } /** - * __bfq_lookup_next_entity - return the first eligible entity in @st. - * @st: the service tree. + * bfq_reparent_active_entities - move to the root group all active + * entities. 
+ * @bfqd: the device data structure with the root group. + * @bfqg: the group to move from. + * @st: the service tree with the entities. * - * Update the virtual time in @st and return the first eligible entity - * it contains. + * Needs queue_lock to be taken and reference to be valid over the call. */ -static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, - bool force) +static void bfq_reparent_active_entities(struct bfq_data *bfqd, + struct bfq_group *bfqg, + struct bfq_service_tree *st) { - struct bfq_entity *entity, *new_next_in_service = NULL; - - if (RB_EMPTY_ROOT(&st->active)) - return NULL; + struct rb_root *active = &st->active; + struct bfq_entity *entity = NULL; - bfq_update_vtime(st); - entity = bfq_first_active_entity(st); + if (!RB_EMPTY_ROOT(&st->active)) + entity = bfq_entity_of(rb_first(active)); - /* - * If the chosen entity does not match with the sched_data's - * next_in_service and we are forcedly serving the IDLE priority - * class tree, bubble up budget update. - */ - if (unlikely(force && entity != entity->sched_data->next_in_service)) { - new_next_in_service = entity; - for_each_entity(new_next_in_service) - bfq_update_budget(new_next_in_service); - } + for (; entity ; entity = bfq_entity_of(rb_first(active))) + bfq_reparent_leaf_entity(bfqd, entity); - return entity; + if (bfqg->sched_data.in_service_entity) + bfq_reparent_leaf_entity(bfqd, + bfqg->sched_data.in_service_entity); } /** - * bfq_lookup_next_entity - return the first eligible entity in @sd. - * @sd: the sched_data. - * @extract: if true the returned entity will be also extracted from @sd. + * bfq_pd_offline - deactivate the entity associated with @pd, + * and reparent its children entities. + * @pd: descriptor of the policy going offline. * - * NOTE: since we cache the next_in_service entity at each level of the - * hierarchy, the complexity of the lookup can be decreased with - * absolutely no effort just returning the cached next_in_service value; - * we prefer to do full lookups to test the consistency of the data - * structures. + * blkio already grabs the queue_lock for us, so no need to use + * RCU-based magic */ -static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, - int extract, - struct bfq_data *bfqd) +static void bfq_pd_offline(struct blkg_policy_data *pd) { - struct bfq_service_tree *st = sd->service_tree; - struct bfq_entity *entity; - int i = 0; + struct bfq_service_tree *st; + struct bfq_group *bfqg = pd_to_bfqg(pd); + struct bfq_data *bfqd = bfqg->bfqd; + struct bfq_entity *entity = bfqg->my_entity; + unsigned long flags; + int i; + if (!entity) /* root group */ + return; + + spin_lock_irqsave(&bfqd->lock, flags); /* - * Choose from idle class, if needed to guarantee a minimum - * bandwidth to this class. This should also mitigate - * priority-inversion problems in case a low priority task is - * holding file system resources. + * Empty all service_trees belonging to this group before + * deactivating the group itself. 
*/ - if (bfqd && - jiffies - bfqd->bfq_class_idle_last_service > - BFQ_CL_IDLE_TIMEOUT) { - entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, - true); - if (entity) { - i = BFQ_IOPRIO_CLASSES - 1; - bfqd->bfq_class_idle_last_service = jiffies; - sd->next_in_service = entity; - } - } - for (; i < BFQ_IOPRIO_CLASSES; i++) { - entity = __bfq_lookup_next_entity(st + i, false); - if (entity) { - if (extract) { - bfq_check_next_in_service(sd, entity); - bfq_active_extract(st + i, entity); - sd->in_service_entity = entity; - sd->next_in_service = NULL; - } - break; - } + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { + st = bfqg->sched_data.service_tree + i; + + /* + * The idle tree may still contain bfq_queues belonging + * to exited task because they never migrated to a different + * cgroup from the one being destroyed now. No one else + * can access them so it's safe to act without any lock. + */ + bfq_flush_idle_tree(st); + + /* + * It may happen that some queues are still active + * (busy) upon group destruction (if the corresponding + * processes have been forced to terminate). We move + * all the leaf entities corresponding to these queues + * to the root_group. + * Also, it may happen that the group has an entity + * in service, which is disconnected from the active + * tree: it must be moved, too. + * There is no need to put the sync queues, as the + * scheduler has taken no reference. + */ + bfq_reparent_active_entities(bfqd, bfqg, st); } - return entity; + __bfq_deactivate_entity(entity, false); + bfq_put_async_queues(bfqd, bfqg); + + spin_unlock_irqrestore(&bfqd->lock, flags); + /* + * @blkg is going offline and will be ignored by + * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so + * that they don't get lost. If IOs complete after this point, the + * stats for them will be lost. Oh well... + */ + bfqg_stats_xfer_dead(bfqg); } -static bool next_queue_may_preempt(struct bfq_data *bfqd) +static int bfq_io_show_weight(struct seq_file *sf, void *v) { - struct bfq_sched_data *sd = &bfqd->sched_data; + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); + unsigned int val = 0; - return sd->next_in_service != sd->in_service_entity; -} + if (bfqgd) + val = bfqgd->weight; + seq_printf(sf, "%u\n", val); -/* - * Get next queue for service. - */ -static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) + return 0; +} + +static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, + struct cftype *cftype, + u64 val) { - struct bfq_entity *entity = NULL; - struct bfq_sched_data *sd; - struct bfq_queue *bfqq; + struct blkcg *blkcg = css_to_blkcg(css); + struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); + struct blkcg_gq *blkg; + int ret = -ERANGE; - if (bfqd->busy_queues == 0) - return NULL; + if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) + return ret; - sd = &bfqd->sched_data; - for (; sd ; sd = entity->my_sched_data) { - entity = bfq_lookup_next_entity(sd, 1, bfqd); - entity->service = 0; + ret = 0; + spin_lock_irq(&blkcg->lock); + bfqgd->weight = (unsigned short)val; + hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + struct bfq_group *bfqg = blkg_to_bfqg(blkg); + + if (!bfqg) + continue; + /* + * Setting the prio_changed flag of the entity + * to 1 with new_weight == weight would re-set + * the value of the weight to its ioprio mapping. + * Set the flag only if necessary. 
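		 *
		 * For example (illustrative): writing the current weight
		 * back into bfq.weight must not set prio_changed, or the
		 * next update would recompute the weight from the ioprio
		 * mapping and silently discard the cgroup-supplied value.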
+ */ + if ((unsigned short)val != bfqg->entity.new_weight) { + bfqg->entity.new_weight = (unsigned short)val; + /* + * Make sure that the above new value has been + * stored in bfqg->entity.new_weight before + * setting the prio_changed flag. In fact, + * this flag may be read asynchronously (in + * critical sections protected by a different + * lock than that held here), and finding this + * flag set may cause the execution of the code + * for updating parameters whose value may + * depend also on bfqg->entity.new_weight (in + * __bfq_entity_update_weight_prio). + * This barrier makes sure that the new value + * of bfqg->entity.new_weight is correctly + * seen in that code. + */ + smp_wmb(); + bfqg->entity.prio_changed = 1; + } } + spin_unlock_irq(&blkcg->lock); - bfqq = bfq_entity_to_bfqq(entity); + return ret; +} - return bfqq; +static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + u64 weight; + /* First unsigned long found in the file is used */ + int ret = kstrtoull(strim(buf), 0, &weight); + + if (ret) + return ret; + + return bfq_io_set_weight_legacy(of_css(of), NULL, weight); } -static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) +static int bfqg_print_stat(struct seq_file *sf, void *v) { - struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; - struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, + &blkcg_policy_bfq, seq_cft(sf)->private, false); + return 0; +} - if (bfqd->in_service_bic) { - put_io_context(bfqd->in_service_bic->icq.ioc); - bfqd->in_service_bic = NULL; - } +static int bfqg_print_rwstat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, + &blkcg_policy_bfq, seq_cft(sf)->private, true); + return 0; +} - bfq_clear_bfqq_wait_request(in_serv_bfqq); - hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqd->in_service_queue = NULL; +static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_bfq, off); + return __blkg_prfill_u64(sf, pd, sum); +} - /* - * in_serv_entity is no longer in service, so, if it is in no - * service tree either, then release the service reference to - * the queue it represents (taken with bfq_get_entity). 
- */ - if (!in_serv_entity->on_st) - bfq_put_queue(in_serv_bfqq); +static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_bfq, + off); + return __blkg_prfill_rwstat(sf, pd, &sum); } -static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - int requeue) +static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) { - struct bfq_entity *entity = &bfqq->entity; + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_stat_recursive, &blkcg_policy_bfq, + seq_cft(sf)->private, false); + return 0; +} - bfq_deactivate_entity(entity, requeue); +static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, + seq_cft(sf)->private, true); + return 0; } -static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, + int off) { - struct bfq_entity *entity = &bfqq->entity; + u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); - bfq_activate_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq)); - bfq_clear_bfqq_non_blocking_wait_rq(bfqq); + return __blkg_prfill_u64(sf, pd, sum >> 9); } -/* - * Called when the bfqq no longer has requests pending, remove it from - * the service tree. - */ -static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, - int requeue) +static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) { - bfq_log_bfqq(bfqd, bfqq, "del from busy"); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); + return 0; +} - bfq_clear_bfqq_busy(bfqq); +static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); - bfqd->busy_queues--; + return __blkg_prfill_u64(sf, pd, sum >> 9); +} - bfq_deactivate_bfqq(bfqd, bfqq, requeue); +static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, + false); + return 0; } -/* - * Called when an inactive queue receives a new request. 
- */ -static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) +static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, + struct blkg_policy_data *pd, int off) { - bfq_log_bfqq(bfqd, bfqq, "add to busy"); + struct bfq_group *bfqg = pd_to_bfqg(pd); + u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); + u64 v = 0; - bfq_activate_bfqq(bfqd, bfqq); + if (samples) { + v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); + v = div64_u64(v, samples); + } + __blkg_prfill_u64(sf, pd, v); + return 0; +} - bfq_mark_bfqq_busy(bfqq); - bfqd->busy_queues++; +/* print avg_queue_size */ +static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, + 0, false); + return 0; +} + +static struct bfq_group * +bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) +{ + int ret; + + ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); + if (ret) + return NULL; + + return blkg_to_bfqg(bfqd->queue->root_blkg); } -static void bfq_init_entity(struct bfq_entity *entity) +static struct cftype bfq_blkcg_legacy_files[] = { + { + .name = "bfq.weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = bfq_io_show_weight, + .write_u64 = bfq_io_set_weight_legacy, + }, + + /* statistics, covers only the tasks in the bfqg */ + { + .name = "bfq.time", + .private = offsetof(struct bfq_group, stats.time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.sectors", + .seq_show = bfqg_print_stat_sectors, + }, + { + .name = "bfq.io_service_bytes", + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_bytes, + }, + { + .name = "bfq.io_serviced", + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_ios, + }, + { + .name = "bfq.io_service_time", + .private = offsetof(struct bfq_group, stats.service_time), + .seq_show = bfqg_print_rwstat, + }, + { + .name = "bfq.io_wait_time", + .private = offsetof(struct bfq_group, stats.wait_time), + .seq_show = bfqg_print_rwstat, + }, + { + .name = "bfq.io_merged", + .private = offsetof(struct bfq_group, stats.merged), + .seq_show = bfqg_print_rwstat, + }, + { + .name = "bfq.io_queued", + .private = offsetof(struct bfq_group, stats.queued), + .seq_show = bfqg_print_rwstat, + }, + + /* the same statictics which cover the bfqg and its descendants */ + { + .name = "bfq.time_recursive", + .private = offsetof(struct bfq_group, stats.time), + .seq_show = bfqg_print_stat_recursive, + }, + { + .name = "bfq.sectors_recursive", + .seq_show = bfqg_print_stat_sectors_recursive, + }, + { + .name = "bfq.io_service_bytes_recursive", + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_bytes_recursive, + }, + { + .name = "bfq.io_serviced_recursive", + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_ios_recursive, + }, + { + .name = "bfq.io_service_time_recursive", + .private = offsetof(struct bfq_group, stats.service_time), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.io_wait_time_recursive", + .private = offsetof(struct bfq_group, stats.wait_time), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.io_merged_recursive", + .private = offsetof(struct bfq_group, stats.merged), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.io_queued_recursive", + .private = offsetof(struct bfq_group, stats.queued), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.avg_queue_size", + .seq_show = 
bfqg_print_avg_queue_size, + }, + { + .name = "bfq.group_wait_time", + .private = offsetof(struct bfq_group, stats.group_wait_time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.idle_time", + .private = offsetof(struct bfq_group, stats.idle_time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.empty_time", + .private = offsetof(struct bfq_group, stats.empty_time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.dequeue", + .private = offsetof(struct bfq_group, stats.dequeue), + .seq_show = bfqg_print_stat, + }, + { } /* terminate */ +}; + +static struct cftype bfq_blkg_files[] = { + { + .name = "bfq.weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = bfq_io_show_weight, + .write = bfq_io_set_weight, + }, + {} /* terminate */ +}; + +#else /* CONFIG_BFQ_GROUP_IOSCHED */ + +static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, + struct bfq_queue *bfqq, unsigned int op) { } +static inline void +bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } +static inline void +bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } +static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, + uint64_t start_time, uint64_t io_start_time, + unsigned int op) { } +static inline void +bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, + struct bfq_group *curr_bfqg) { } +static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } +static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } +static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } + +static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_group *bfqg) {} + +static void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); entity->weight = entity->new_weight; entity->orig_weight = entity->new_weight; + if (bfqq) { + bfqq->ioprio = bfqq->new_ioprio; + bfqq->ioprio_class = bfqq->new_ioprio_class; + } + entity->sched_data = &bfqg->sched_data; +} + +static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} + +static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, + struct blkcg *blkcg) +{ + return bfqd->root_group; +} + +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) +{ + return bfqq->bfqd->root_group; +} - bfqq->ioprio = bfqq->new_ioprio; - bfqq->ioprio_class = bfqq->new_ioprio_class; +static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, + int node) +{ + struct bfq_group *bfqg; + int i; + + bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); + if (!bfqg) + return NULL; - entity->sched_data = &bfqq->bfqd->sched_data; + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + + return bfqg; } +#endif /* CONFIG_BFQ_GROUP_IOSCHED */ #define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) #define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) #define bfq_sample_valid(samples) ((samples) > 80) -/* - * Scheduler run of queue, if there are requests pending and no one in the - * driver that will restart queueing. 
- */ -static void bfq_schedule_dispatch(struct bfq_data *bfqd) -{ - if (bfqd->queued != 0) { - bfq_log(bfqd, "schedule dispatch"); - blk_mq_run_hw_queues(bfqd->queue, true); - } -} - /* * Lifted from AS - choose which of rq1 and rq2 that is best served now. * We choose the request that is closesr to the head right now. Distance @@ -1905,7 +3593,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, entity->budget = new_budget; bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget); - bfq_activate_bfqq(bfqd, bfqq); + bfq_requeue_bfqq(bfqd, bfqq); } } @@ -2076,6 +3764,8 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfqq->ttime.last_end_request + bfqd->bfq_slice_idle * 3; + bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); + /* * Update budget and check whether bfqq may want to preempt * the in-service queue. @@ -2195,7 +3885,7 @@ static void bfq_remove_request(struct request_queue *q, bfqq->next_rq = NULL; if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { - bfq_del_bfqq_busy(bfqd, bfqq, 1); + bfq_del_bfqq_busy(bfqd, bfqq, false); /* * bfqq emptied. In normal operation, when * bfqq is empty, bfqq->entity.service and @@ -2215,6 +3905,8 @@ static void bfq_remove_request(struct request_queue *q, if (rq->cmd_flags & REQ_META) bfqq->meta_pending--; + + bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); } static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) @@ -2300,7 +3992,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); if (!RB_EMPTY_NODE(&rq->rb_node)) - return; + goto end; spin_lock_irq(&bfqq->bfqd->lock); /* @@ -2326,6 +4018,8 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, bfq_remove_request(q, next); spin_unlock_irq(&bfqq->bfqd->lock); +end: + bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); } static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, @@ -2355,6 +4049,7 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, struct bfq_queue *bfqq) { if (bfqq) { + bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); bfq_mark_bfqq_budget_new(bfqq); bfq_clear_bfqq_fifo_expire(bfqq); @@ -2441,6 +4136,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) bfqd->last_idling_start = ktime_get(); hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), HRTIMER_MODE_REL); + bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); } /* @@ -2490,12 +4186,17 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - __bfq_bfqd_reset_in_service(bfqd); - if (RB_EMPTY_ROOT(&bfqq->sort_list)) - bfq_del_bfqq_busy(bfqd, bfqq, 1); + bfq_del_bfqq_busy(bfqd, bfqq, true); else - bfq_activate_bfqq(bfqd, bfqq); + bfq_requeue_bfqq(bfqd, bfqq); + + /* + * All in-service entities must have been properly deactivated + * or requeued before executing the next function, which + * resets all in-service entites as no more in service. 
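	 *
	 * I.e., the required ordering is (sketch):
	 *
	 *	bfq_del_bfqq_busy() or bfq_requeue_bfqq();	// first
	 *	__bfq_bfqd_reset_in_service(bfqd);		// then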
+ */ + __bfq_bfqd_reset_in_service(bfqd); } /** @@ -2972,6 +4673,7 @@ check_queue: */ bfq_clear_bfqq_wait_request(bfqq); hrtimer_try_to_cancel(&bfqd->idle_slice_timer); + bfqg_stats_update_idle_time(bfqq_group(bfqq)); } goto keep_queue; } @@ -3159,6 +4861,10 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) */ static void bfq_put_queue(struct bfq_queue *bfqq) { +#ifdef CONFIG_BFQ_GROUP_IOSCHED + struct bfq_group *bfqg = bfqq_group(bfqq); +#endif + if (bfqq->bfqd) bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); @@ -3167,7 +4873,12 @@ static void bfq_put_queue(struct bfq_queue *bfqq) if (bfqq->ref) return; + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); + kmem_cache_free(bfq_pool, bfqq); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqg_put(bfqg); +#endif } static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) @@ -3323,18 +5034,19 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, } static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, + struct bfq_group *bfqg, int ioprio_class, int ioprio) { switch (ioprio_class) { case IOPRIO_CLASS_RT: - return &async_bfqq[0][ioprio]; + return &bfqg->async_bfqq[0][ioprio]; case IOPRIO_CLASS_NONE: ioprio = IOPRIO_NORM; /* fall through */ case IOPRIO_CLASS_BE: - return &async_bfqq[1][ioprio]; + return &bfqg->async_bfqq[1][ioprio]; case IOPRIO_CLASS_IDLE: - return &async_idle_bfqq; + return &bfqg->async_idle_bfqq; default: return NULL; } @@ -3348,11 +5060,18 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); struct bfq_queue **async_bfqq = NULL; struct bfq_queue *bfqq; + struct bfq_group *bfqg; rcu_read_lock(); + bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); + if (!bfqg) { + bfqq = &bfqd->oom_bfqq; + goto out; + } + if (!is_sync) { - async_bfqq = bfq_async_queue_prio(bfqd, ioprio_class, + async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ioprio); bfqq = *async_bfqq; if (bfqq) @@ -3366,7 +5085,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, if (bfqq) { bfq_init_bfqq(bfqd, bfqq, bic, current->pid, is_sync); - bfq_init_entity(&bfqq->entity); + bfq_init_entity(&bfqq->entity, bfqg); bfq_log_bfqq(bfqd, bfqq, "allocated"); } else { bfqq = &bfqd->oom_bfqq; @@ -3379,9 +5098,14 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, * prune it. */ if (async_bfqq) { - bfqq->ref++; - bfq_log_bfqq(bfqd, bfqq, - "get_queue, bfqq not in async: %p, %d", + bfqq->ref++; /* + * Extra group reference, w.r.t. sync + * queue. This extra reference is removed + * only if bfqq->bfqg disappears, to + * guarantee that this queue is not freed + * until its group goes away. 
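			 *
			 * (Sketch of the pairing: the matching
			 * bfq_put_queue() happens in
			 * __bfq_put_async_bfqq(), when the group's
			 * async queue pointers are torn down.)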
+ */ + bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", bfqq, bfqq->ref); *async_bfqq = bfqq; } @@ -3516,6 +5240,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ bfq_clear_bfqq_wait_request(bfqq); hrtimer_try_to_cancel(&bfqd->idle_slice_timer); + bfqg_stats_update_idle_time(bfqq_group(bfqq)); /* * The queue is not empty, because a new request just @@ -3657,6 +5382,11 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd = bfqq->bfqd; + if (rq->rq_flags & RQF_STARTED) + bfqg_stats_update_completion(bfqq_group(bfqq), + rq_start_time_ns(rq), + rq_io_start_time_ns(rq), + rq->cmd_flags); if (likely(rq->rq_flags & RQF_STARTED)) { unsigned long flags; @@ -3707,6 +5437,8 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, if (!bic) goto queue_fail; + bfq_bic_update_cgroup(bic, bio); + bfqq = bic_to_bfqq(bic, is_sync); if (!bfqq || bfqq == &bfqd->oom_bfqq) { if (bfqq) @@ -3803,6 +5535,8 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, bfq_log(bfqd, "put_async_bfqq: %p", bfqq); if (bfqq) { + bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); + bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", bfqq, bfqq->ref); bfq_put_queue(bfqq); @@ -3811,18 +5545,20 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, } /* - * Release the extra reference of the async queues as the device - * goes away. + * Release all the bfqg references to its async queues. If we are + * deallocating the group these queues may still contain requests, so + * we reparent them to the root cgroup (i.e., the only one that will + * exist for sure until all the requests on a device are gone). */ -static void bfq_put_async_queues(struct bfq_data *bfqd) +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) { int i, j; for (i = 0; i < 2; i++) for (j = 0; j < IOPRIO_BE_NR; j++) - __bfq_put_async_bfqq(bfqd, &async_bfqq[i][j]); + __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); - __bfq_put_async_bfqq(bfqd, &async_idle_bfqq); + __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); } static void bfq_exit_queue(struct elevator_queue *e) @@ -3834,20 +5570,42 @@ static void bfq_exit_queue(struct elevator_queue *e) spin_lock_irq(&bfqd->lock); list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) - bfq_deactivate_bfqq(bfqd, bfqq, false); - bfq_put_async_queues(bfqd); + bfq_deactivate_bfqq(bfqd, bfqq, false, false); spin_unlock_irq(&bfqd->lock); hrtimer_cancel(&bfqd->idle_slice_timer); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); +#else + spin_lock_irq(&bfqd->lock); + bfq_put_async_queues(bfqd, bfqd->root_group); + kfree(bfqd->root_group); + spin_unlock_irq(&bfqd->lock); +#endif + kfree(bfqd); } +static void bfq_init_root_group(struct bfq_group *root_group, + struct bfq_data *bfqd) +{ + int i; + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + root_group->entity.parent = NULL; + root_group->my_entity = NULL; + root_group->bfqd = bfqd; +#endif + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + root_group->sched_data.bfq_class_idle_last_service = jiffies; +} + static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) { struct bfq_data *bfqd; struct elevator_queue *eq; - int i; eq = elevator_alloc(q, e); if (!eq) @@ -3860,6 +5618,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) } eq->elevator_data 
= bfqd; + spin_lock_irq(q->queue_lock); + q->elevator = eq; + spin_unlock_irq(q->queue_lock); + /* * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. * Grab a permanent reference to it, so that the normal code flow @@ -3880,8 +5642,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->queue = q; - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) - bfqd->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + INIT_LIST_HEAD(&bfqd->dispatch); hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -3899,17 +5660,40 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->bfq_back_max = bfq_back_max; bfqd->bfq_back_penalty = bfq_back_penalty; bfqd->bfq_slice_idle = bfq_slice_idle; - bfqd->bfq_class_idle_last_service = 0; bfqd->bfq_timeout = bfq_timeout; bfqd->bfq_requests_within_timer = 120; spin_lock_init(&bfqd->lock); - INIT_LIST_HEAD(&bfqd->dispatch); - q->elevator = eq; + /* + * The invocation of the next bfq_create_group_hierarchy + * function is the head of a chain of function calls + * (bfq_create_group_hierarchy->blkcg_activate_policy-> + * blk_mq_freeze_queue) that may lead to the invocation of the + * has_work hook function. For this reason, + * bfq_create_group_hierarchy is invoked only after all + * scheduler data has been initialized, apart from the fields + * that can be initialized only after invoking + * bfq_create_group_hierarchy. This, in particular, enables + * has_work to correctly return false. Of course, to avoid + * other inconsistencies, the blk-mq stack must then refrain + * from invoking further scheduler hooks before this init + * function is finished. + */ + bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); + if (!bfqd->root_group) + goto out_free; + bfq_init_root_group(bfqd->root_group, bfqd); + bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); + return 0; + +out_free: + kfree(bfqd); + kobject_put(&eq->kobj); + return -ENOMEM; } static void bfq_slab_kill(void) @@ -4134,10 +5918,34 @@ static struct elevator_type iosched_bfq_mq = { .elevator_owner = THIS_MODULE, }; +#ifdef CONFIG_BFQ_GROUP_IOSCHED +static struct blkcg_policy blkcg_policy_bfq = { + .dfl_cftypes = bfq_blkg_files, + .legacy_cftypes = bfq_blkcg_legacy_files, + + .cpd_alloc_fn = bfq_cpd_alloc, + .cpd_init_fn = bfq_cpd_init, + .cpd_bind_fn = bfq_cpd_init, + .cpd_free_fn = bfq_cpd_free, + + .pd_alloc_fn = bfq_pd_alloc, + .pd_init_fn = bfq_pd_init, + .pd_offline_fn = bfq_pd_offline, + .pd_free_fn = bfq_pd_free, + .pd_reset_stats_fn = bfq_pd_reset_stats, +}; +#endif + static int __init bfq_init(void) { int ret; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + ret = blkcg_policy_register(&blkcg_policy_bfq); + if (ret) + return ret; +#endif + ret = -ENOMEM; if (bfq_slab_setup()) goto err_pol_unreg; @@ -4149,12 +5957,18 @@ static int __init bfq_init(void) return 0; err_pol_unreg: +#ifdef CONFIG_BFQ_GROUP_IOSCHED + blkcg_policy_unregister(&blkcg_policy_bfq); +#endif return ret; } static void __exit bfq_exit(void) { elv_unregister(&iosched_bfq_mq); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + blkcg_policy_unregister(&blkcg_policy_bfq); +#endif bfq_slab_kill(); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ec993573e0a8..fe9c512cc6fa 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -50,7 +50,7 @@ struct blk_stat_callback; * Maximum number of blkcg policies allowed to be registered concurrently. * Defined here to simplify include dependency. 
*/ -#define BLKCG_MAX_POLS 2 +#define BLKCG_MAX_POLS 3 typedef void (rq_end_io_fn)(struct request *, int); -- cgit v1.2.3 From 5f1ae4ebe578319a0cd5dae9591dd426070be106 Mon Sep 17 00:00:00 2001 From: Fu Wei Date: Sat, 1 Apr 2017 01:51:01 +0800 Subject: acpi/arm64: Add GTDT table parse driver This patch adds support for parsing arch timer info in GTDT, provides some kernel APIs to parse all the PPIs and always-on info in GTDT and export them. By this driver, we can simplify arm_arch_timer drivers, and separate the ACPI GTDT knowledge from it. Signed-off-by: Fu Wei Signed-off-by: Hanjun Guo Acked-by: Rafael J. Wysocki Tested-by: Xiongfeng Wang Reviewed-by: Hanjun Guo Tested-by: Hanjun Guo Acked-by: Lorenzo Pieralisi Signed-off-by: Mark Rutland --- arch/arm64/Kconfig | 1 + drivers/acpi/arm64/Kconfig | 3 + drivers/acpi/arm64/Makefile | 1 + drivers/acpi/arm64/gtdt.c | 157 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/acpi.h | 6 ++ 5 files changed, 168 insertions(+) create mode 100644 drivers/acpi/arm64/gtdt.c (limited to 'include/linux') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3741859765cf..7e2baec6f23a 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2,6 +2,7 @@ config ARM64 def_bool y select ACPI_CCA_REQUIRED if ACPI select ACPI_GENERIC_GSI if ACPI + select ACPI_GTDT if ACPI select ACPI_REDUCED_HARDWARE_ONLY if ACPI select ACPI_MCFG if ACPI select ACPI_SPCR_TABLE if ACPI diff --git a/drivers/acpi/arm64/Kconfig b/drivers/acpi/arm64/Kconfig index 4616da4c15be..5a6f80fce0d6 100644 --- a/drivers/acpi/arm64/Kconfig +++ b/drivers/acpi/arm64/Kconfig @@ -4,3 +4,6 @@ config ACPI_IORT bool + +config ACPI_GTDT + bool diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile index 72331f2ce0e9..1017def2ea12 100644 --- a/drivers/acpi/arm64/Makefile +++ b/drivers/acpi/arm64/Makefile @@ -1 +1,2 @@ obj-$(CONFIG_ACPI_IORT) += iort.o +obj-$(CONFIG_ACPI_GTDT) += gtdt.o diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c new file mode 100644 index 000000000000..3d95af8f73c6 --- /dev/null +++ b/drivers/acpi/arm64/gtdt.c @@ -0,0 +1,157 @@ +/* + * ARM Specific GTDT table Support + * + * Copyright (C) 2016, Linaro Ltd. + * Author: Daniel Lezcano + * Fu Wei + * Hanjun Guo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +#include + +#undef pr_fmt +#define pr_fmt(fmt) "ACPI GTDT: " fmt + +/** + * struct acpi_gtdt_descriptor - Store the key info of GTDT for all functions + * @gtdt: The pointer to the struct acpi_table_gtdt of GTDT table. + * @gtdt_end: The pointer to the end of GTDT table. + * @platform_timer: The pointer to the start of Platform Timer Structure + * + * The struct store the key info of GTDT table, it should be initialized by + * acpi_gtdt_init. + */ +struct acpi_gtdt_descriptor { + struct acpi_table_gtdt *gtdt; + void *gtdt_end; + void *platform_timer; +}; + +static struct acpi_gtdt_descriptor acpi_gtdt_desc __initdata; + +static int __init map_gt_gsi(u32 interrupt, u32 flags) +{ + int trigger, polarity; + + trigger = (flags & ACPI_GTDT_INTERRUPT_MODE) ? ACPI_EDGE_SENSITIVE + : ACPI_LEVEL_SENSITIVE; + + polarity = (flags & ACPI_GTDT_INTERRUPT_POLARITY) ? ACPI_ACTIVE_LOW + : ACPI_ACTIVE_HIGH; + + return acpi_register_gsi(NULL, interrupt, trigger, polarity); +} + +/** + * acpi_gtdt_map_ppi() - Map the PPIs of per-cpu arch_timer. 
+ * @type: the type of PPI.
+ *
+ * Note: Secure state is not managed by the kernel on ARM64 systems.
+ * So we only handle the non-secure timer PPIs;
+ * ARCH_TIMER_PHYS_SECURE_PPI is treated as an invalid type.
+ *
+ * Return: the mapped PPI value, or 0 on error.
+ */
+int __init acpi_gtdt_map_ppi(int type)
+{
+	struct acpi_table_gtdt *gtdt = acpi_gtdt_desc.gtdt;
+
+	switch (type) {
+	case ARCH_TIMER_PHYS_NONSECURE_PPI:
+		return map_gt_gsi(gtdt->non_secure_el1_interrupt,
+				  gtdt->non_secure_el1_flags);
+	case ARCH_TIMER_VIRT_PPI:
+		return map_gt_gsi(gtdt->virtual_timer_interrupt,
+				  gtdt->virtual_timer_flags);
+
+	case ARCH_TIMER_HYP_PPI:
+		return map_gt_gsi(gtdt->non_secure_el2_interrupt,
+				  gtdt->non_secure_el2_flags);
+	default:
+		pr_err("Failed to map timer interrupt: invalid type.\n");
+	}
+
+	return 0;
+}
+
+/**
+ * acpi_gtdt_c3stop() - Get the c3stop info from GTDT according to the
+ * type of PPI.
+ * @type: the type of PPI.
+ *
+ * Return: true if the timer HW state is lost when a CPU enters an idle state,
+ * false otherwise.
+ */
+bool __init acpi_gtdt_c3stop(int type)
+{
+	struct acpi_table_gtdt *gtdt = acpi_gtdt_desc.gtdt;
+
+	switch (type) {
+	case ARCH_TIMER_PHYS_NONSECURE_PPI:
+		return !(gtdt->non_secure_el1_flags & ACPI_GTDT_ALWAYS_ON);
+
+	case ARCH_TIMER_VIRT_PPI:
+		return !(gtdt->virtual_timer_flags & ACPI_GTDT_ALWAYS_ON);
+
+	case ARCH_TIMER_HYP_PPI:
+		return !(gtdt->non_secure_el2_flags & ACPI_GTDT_ALWAYS_ON);
+
+	default:
+		pr_err("Failed to get c3stop info: invalid type.\n");
+	}
+
+	return false;
+}
+
+/**
+ * acpi_gtdt_init() - Get the info of the GTDT table to prepare for
+ * further init.
+ * @table: The pointer to the GTDT table.
+ * @platform_timer_count: Points to an integer variable used to store
+ *                        the number of platform timers. May be NULL
+ *                        if the caller does not need this info.
+ *
+ * Return: 0 on success, -EINVAL on error.
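+ *
+ * A hypothetical caller (sketch only; the wrapper name is
+ * illustrative) would look like:
+ *
+ *	static int __init gtdt_probe(struct acpi_table_header *table)
+ *	{
+ *		return acpi_gtdt_init(table, NULL);
+ *	}
+ *	...
+ *	acpi_table_parse(ACPI_SIG_GTDT, gtdt_probe);
+ *	ppi = acpi_gtdt_map_ppi(ARCH_TIMER_VIRT_PPI);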
+ */ +int __init acpi_gtdt_init(struct acpi_table_header *table, + int *platform_timer_count) +{ + void *platform_timer; + struct acpi_table_gtdt *gtdt; + + gtdt = container_of(table, struct acpi_table_gtdt, header); + acpi_gtdt_desc.gtdt = gtdt; + acpi_gtdt_desc.gtdt_end = (void *)table + table->length; + acpi_gtdt_desc.platform_timer = NULL; + if (platform_timer_count) + *platform_timer_count = 0; + + if (table->revision < 2) { + pr_warn("Revision:%d doesn't support Platform Timers.\n", + table->revision); + return 0; + } + + if (!gtdt->platform_timer_count) { + pr_debug("No Platform Timer.\n"); + return 0; + } + + platform_timer = (void *)gtdt + gtdt->platform_timer_offset; + if (platform_timer < (void *)table + sizeof(struct acpi_table_gtdt)) { + pr_err(FW_BUG "invalid timer data.\n"); + return -EINVAL; + } + acpi_gtdt_desc.platform_timer = platform_timer; + if (platform_timer_count) + *platform_timer_count = gtdt->platform_timer_count; + + return 0; +} diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 9b05886f9773..4b5c146fba97 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -595,6 +595,12 @@ enum acpi_reconfig_event { int acpi_reconfig_notifier_register(struct notifier_block *nb); int acpi_reconfig_notifier_unregister(struct notifier_block *nb); +#ifdef CONFIG_ACPI_GTDT +int acpi_gtdt_init(struct acpi_table_header *table, int *platform_timer_count); +int acpi_gtdt_map_ppi(int type); +bool acpi_gtdt_c3stop(int type); +#endif + #else /* !CONFIG_ACPI */ #define acpi_disabled 1 -- cgit v1.2.3 From a712c3ed9b8a4565a200f3e8e09c42079b1666b3 Mon Sep 17 00:00:00 2001 From: Fu Wei Date: Sat, 1 Apr 2017 01:51:03 +0800 Subject: acpi/arm64: Add memory-mapped timer support in GTDT driver On platforms booting with ACPI, architected memory-mapped timers' configuration data is provided by firmware through the ACPI GTDT static table. The clocksource architected timer kernel driver requires a firmware interface to collect timer configuration and configure the driver. This infrastructure is present for device tree systems, but it is missing on systems booting with ACPI. Implement the kernel infrastructure required to parse the static ACPI GTDT table so that the architected timer clocksource driver can make use of it on systems booting with ACPI, therefore enabling the corresponding timer configuration.
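As a rough illustration of how these APIs fit together (this sketch is not part of the series; the function name, allocation strategy and error handling are hypothetical, and cleanup is elided):

static int __init gtdt_timer_probe_sketch(struct acpi_table_header *table)
{
	struct arch_timer_mem *timers;
	int timer_count, block_count, ppi;

	/* Cache the GTDT table and learn how many platform timers it lists. */
	if (acpi_gtdt_init(table, &timer_count))
		return -EINVAL;

	/* Map the per-cpu virtual timer PPI; 0 indicates failure. */
	ppi = acpi_gtdt_map_ppi(ARCH_TIMER_VIRT_PPI);
	if (!ppi)
		return -EINVAL;

	/* Parse the GT blocks: at most one arch_timer_mem per platform timer. */
	timers = kcalloc(timer_count, sizeof(*timers), GFP_KERNEL);
	if (!timers)
		return -ENOMEM;
	return acpi_arch_timer_mem_init(timers, &block_count);
}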
Signed-off-by: Fu Wei Signed-off-by: Hanjun Guo Acked-by: Lorenzo Pieralisi [Mark: restructure error handling] Signed-off-by: Mark Rutland --- drivers/acpi/arm64/gtdt.c | 157 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/acpi.h | 1 + 2 files changed, 158 insertions(+) (limited to 'include/linux') diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c index 3d95af8f73c6..b9046678fb43 100644 --- a/drivers/acpi/arm64/gtdt.c +++ b/drivers/acpi/arm64/gtdt.c @@ -13,6 +13,7 @@ #include #include +#include #include #include @@ -37,6 +38,28 @@ struct acpi_gtdt_descriptor { static struct acpi_gtdt_descriptor acpi_gtdt_desc __initdata; +static inline void *next_platform_timer(void *platform_timer) +{ + struct acpi_gtdt_header *gh = platform_timer; + + platform_timer += gh->length; + if (platform_timer < acpi_gtdt_desc.gtdt_end) + return platform_timer; + + return NULL; +} + +#define for_each_platform_timer(_g) \ + for (_g = acpi_gtdt_desc.platform_timer; _g; \ + _g = next_platform_timer(_g)) + +static inline bool is_timer_block(void *platform_timer) +{ + struct acpi_gtdt_header *gh = platform_timer; + + return gh->type == ACPI_GTDT_TYPE_TIMER_BLOCK; +} + static int __init map_gt_gsi(u32 interrupt, u32 flags) { int trigger, polarity; @@ -155,3 +178,137 @@ int __init acpi_gtdt_init(struct acpi_table_header *table, return 0; } + +static int __init gtdt_parse_timer_block(struct acpi_gtdt_timer_block *block, + struct arch_timer_mem *timer_mem) +{ + int i; + struct arch_timer_mem_frame *frame; + struct acpi_gtdt_timer_entry *gtdt_frame; + + if (!block->timer_count) { + pr_err(FW_BUG "GT block present, but frame count is zero."); + return -ENODEV; + } + + if (block->timer_count > ARCH_TIMER_MEM_MAX_FRAMES) { + pr_err(FW_BUG "GT block lists %d frames, ACPI spec only allows 8\n", + block->timer_count); + return -EINVAL; + } + + timer_mem->cntctlbase = (phys_addr_t)block->block_address; + /* + * The CNTCTLBase frame is 4KB (register offsets 0x000 - 0xFFC). + * See ARM DDI 0487A.k_iss10775, page I1-5129, Table I1-3 + * "CNTCTLBase memory map". + */ + timer_mem->size = SZ_4K; + + gtdt_frame = (void *)block + block->timer_offset; + if (gtdt_frame + block->timer_count != (void *)block + block->header.length) + return -EINVAL; + + /* + * Get the GT timer Frame data for every GT Block Timer + */ + for (i = 0; i < block->timer_count; i++, gtdt_frame++) { + if (gtdt_frame->common_flags & ACPI_GTDT_GT_IS_SECURE_TIMER) + continue; + if (gtdt_frame->frame_number >= ARCH_TIMER_MEM_MAX_FRAMES || + !gtdt_frame->base_address || !gtdt_frame->timer_interrupt) + goto error; + + frame = &timer_mem->frame[gtdt_frame->frame_number]; + + /* duplicate frame */ + if (frame->valid) + goto error; + + frame->phys_irq = map_gt_gsi(gtdt_frame->timer_interrupt, + gtdt_frame->timer_flags); + if (frame->phys_irq <= 0) { + pr_warn("failed to map physical timer irq in frame %d.\n", + gtdt_frame->frame_number); + goto error; + } + + if (gtdt_frame->virtual_timer_interrupt) { + frame->virt_irq = + map_gt_gsi(gtdt_frame->virtual_timer_interrupt, + gtdt_frame->virtual_timer_flags); + if (frame->virt_irq <= 0) { + pr_warn("failed to map virtual timer irq in frame %d.\n", + gtdt_frame->frame_number); + goto error; + } + } else { + pr_debug("virtual timer in frame %d not implemented.\n", + gtdt_frame->frame_number); + } + + frame->cntbase = gtdt_frame->base_address; + /* + * The CNTBaseN frame is 4KB (register offsets 0x000 - 0xFFC). + * See ARM DDI 0487A.k_iss10775, page I1-5130, Table I1-4 + * "CNTBaseN memory map". 
+ */ + frame->size = SZ_4K; + frame->valid = true; + } + + return 0; + +error: + do { + if (gtdt_frame->common_flags & ACPI_GTDT_GT_IS_SECURE_TIMER || + gtdt_frame->frame_number >= ARCH_TIMER_MEM_MAX_FRAMES) + continue; + + frame = &timer_mem->frame[gtdt_frame->frame_number]; + + if (frame->phys_irq > 0) + acpi_unregister_gsi(gtdt_frame->timer_interrupt); + frame->phys_irq = 0; + + if (frame->virt_irq > 0) + acpi_unregister_gsi(gtdt_frame->virtual_timer_interrupt); + frame->virt_irq = 0; + } while (i-- >= 0 && gtdt_frame--); + + return -EINVAL; +} + +/** + * acpi_arch_timer_mem_init() - Get the info of all GT blocks in GTDT table. + * @timer_mem: The pointer to the array of struct arch_timer_mem for returning + * the result of parsing. The element number of this array should + * be platform_timer_count(the total number of platform timers). + * @timer_count: It points to a integer variable which is used for storing the + * number of GT blocks we have parsed. + * + * Return: 0 if success, -EINVAL/-ENODEV if error. + */ +int __init acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, + int *timer_count) +{ + int ret; + void *platform_timer; + + *timer_count = 0; + for_each_platform_timer(platform_timer) { + if (is_timer_block(platform_timer)) { + ret = gtdt_parse_timer_block(platform_timer, timer_mem); + if (ret) + return ret; + timer_mem++; + (*timer_count)++; + } + } + + if (*timer_count) + pr_info("found %d memory-mapped timer block(s).\n", + *timer_count); + + return 0; +} diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4b5c146fba97..31937249f8cc 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -599,6 +599,7 @@ int acpi_reconfig_notifier_unregister(struct notifier_block *nb); int acpi_gtdt_init(struct acpi_table_header *table, int *platform_timer_count); int acpi_gtdt_map_ppi(int type); bool acpi_gtdt_c3stop(int type); +int acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, int *timer_count); #endif #else /* !CONFIG_ACPI */ -- cgit v1.2.3 From 314fe91b4a99949bb720501ba74d2228093bbf47 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 12 Apr 2017 12:13:57 +0200 Subject: block: remove blk_end_request_err and __blk_end_request_err Both functions are entirely unused. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 39 --------------------------------------- include/linux/blkdev.h | 2 -- 2 files changed, 41 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 8654aa0cef6d..f6e18ab551e7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2875,25 +2875,6 @@ bool blk_end_request_cur(struct request *rq, int error) } EXPORT_SYMBOL(blk_end_request_cur); -/** - * blk_end_request_err - Finish a request till the next failure boundary. - * @rq: the request to finish till the next failure boundary for - * @error: must be negative errno - * - * Description: - * Complete @rq till the next failure boundary. - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - */ -bool blk_end_request_err(struct request *rq, int error) -{ - WARN_ON(error >= 0); - return blk_end_request(rq, error, blk_rq_err_bytes(rq)); -} -EXPORT_SYMBOL_GPL(blk_end_request_err); - /** * __blk_end_request - Helper function for drivers to complete the request. 
* @rq: the request being processed @@ -2953,26 +2934,6 @@ bool __blk_end_request_cur(struct request *rq, int error) } EXPORT_SYMBOL(__blk_end_request_cur); -/** - * __blk_end_request_err - Finish a request till the next failure boundary. - * @rq: the request to finish till the next failure boundary for - * @error: must be negative errno - * - * Description: - * Complete @rq till the next failure boundary. Must be called - * with queue lock held. - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - */ -bool __blk_end_request_err(struct request *rq, int error) -{ - WARN_ON(error >= 0); - return __blk_end_request(rq, error, blk_rq_err_bytes(rq)); -} -EXPORT_SYMBOL_GPL(__blk_end_request_err); - void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fe9c512cc6fa..cca704c80b01 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1127,12 +1127,10 @@ extern bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes); extern void blk_end_request_all(struct request *rq, int error); extern bool blk_end_request_cur(struct request *rq, int error); -extern bool blk_end_request_err(struct request *rq, int error); extern bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes); extern void __blk_end_request_all(struct request *rq, int error); extern bool __blk_end_request_cur(struct request *rq, int error); -extern bool __blk_end_request_err(struct request *rq, int error); extern void blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); -- cgit v1.2.3 From fa1a15c08e23cb89c5837915b1989909bce47456 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 12 Apr 2017 12:13:58 +0200 Subject: block: remove blk_end_request_cur This function is not used anywhere in the kernel. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 18 ------------------ include/linux/blkdev.h | 1 - 2 files changed, 19 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index f6e18ab551e7..728299323f65 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2857,24 +2857,6 @@ void blk_end_request_all(struct request *rq, int error) } EXPORT_SYMBOL(blk_end_request_all); -/** - * blk_end_request_cur - Helper function to finish the current request chunk. - * @rq: the request to finish the current chunk for - * @error: %0 for success, < %0 for error - * - * Description: - * Complete the current consecutively mapped chunk from @rq. - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - */ -bool blk_end_request_cur(struct request *rq, int error) -{ - return blk_end_request(rq, error, blk_rq_cur_bytes(rq)); -} -EXPORT_SYMBOL(blk_end_request_cur); - /** * __blk_end_request - Helper function for drivers to complete the request. 
* @rq: the request being processed diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cca704c80b01..5b52b3d7818c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1126,7 +1126,6 @@ extern void blk_finish_request(struct request *rq, int error); extern bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes); extern void blk_end_request_all(struct request *rq, int error); -extern bool blk_end_request_cur(struct request *rq, int error); extern bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes); extern void __blk_end_request_all(struct request *rq, int error); -- cgit v1.2.3 From 468d01bec544286bb5283f012b95b5b84636565b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Feb 2017 11:40:15 -0800 Subject: types: Update obsolete callback_head comment The comment header for callback_head (and thus for rcu_head) states that the bottom two bits of a pointer to these structures must be zero. This is obsolete: the new requirement is that only the bottom bit need be zero. This commit therefore updates the comment. Signed-off-by: Paul E. McKenney --- include/linux/types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/types.h b/include/linux/types.h index 1e7bd24848fc..258099a4ed82 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -209,7 +209,7 @@ struct ustat { * naturally due ABI requirements, but some architectures (like CRIS) have * weird ABI and we need to ask it explicitly. * - * The alignment is required to guarantee that bits 0 and 1 of @next will be + * The alignment is required to guarantee that bit 0 of @next will be * clear under normal conditions -- as long as we use call_rcu(), * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback. * -- cgit v1.2.3 From 48ac34666ff76843d8743db1cc78b303759916f1 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 27 Feb 2017 21:14:19 +0200 Subject: hlist_add_tail_rcu disable sparse warning sparse is unhappy about this code in hlist_add_tail_rcu: struct hlist_node *i, *last = NULL; for (i = hlist_first_rcu(h); i; i = hlist_next_rcu(i)) last = i; This is because hlist_first_rcu and hlist_next_rcu return __rcu pointers. It's a false positive - it's a write side primitive and so does not need to be called in a read side critical section. The following trivial patch disables the warning without changing the behaviour in any way. Note: __hlist_for_each_rcu would also remove the warning but it would be confusing since it calls rcu_dereference and is designed to run in the rcu read side critical section. Signed-off-by: Michael S. Tsirkin Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 4f7a9561b8c4..b1fd8bf85fdc 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -509,7 +509,8 @@ static inline void hlist_add_tail_rcu(struct hlist_node *n, { struct hlist_node *i, *last = NULL; - for (i = hlist_first_rcu(h); i; i = hlist_next_rcu(i)) + /* Note: write side code, so rcu accessors are not needed.
*/ + for (i = h->first; i; i = i->next) last = i; if (last) { -- cgit v1.2.3 From 28c5fe99016d28f15d1b825df8acb1558a3a63a1 Mon Sep 17 00:00:00 2001 From: Felix Brack Date: Thu, 13 Apr 2017 09:51:38 +0200 Subject: leds: pca9532: Extend pca9532 device tree support This patch extends the device tree support for the pca9532 by adding the leds 'default-state' property. Signed-off-by: Felix Brack Signed-off-by: Jacek Anaszewski --- .../devicetree/bindings/leds/leds-pca9532.txt | 10 ++++++++ drivers/leds/leds-pca9532.c | 27 +++++++++++++++++++++- include/linux/leds-pca9532.h | 4 ++-- 3 files changed, 38 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/leds/leds-pca9532.txt b/Documentation/devicetree/bindings/leds/leds-pca9532.txt index 198f3ba0e01f..f769c52e3643 100644 --- a/Documentation/devicetree/bindings/leds/leds-pca9532.txt +++ b/Documentation/devicetree/bindings/leds/leds-pca9532.txt @@ -17,6 +17,8 @@ Optional sub-node properties: - label: see Documentation/devicetree/bindings/leds/common.txt - type: Output configuration, see dt-bindings/leds/leds-pca9532.h (default NONE) - linux,default-trigger: see Documentation/devicetree/bindings/leds/common.txt + - default-state: see Documentation/devicetree/bindings/leds/common.txt + This property is only valid for sub-nodes of type . Example: #include @@ -33,6 +35,14 @@ Example: label = "pca:green:power"; type = ; }; + kernel-booting { + type = ; + default-state = "on"; + }; + sys-stat { + type = ; + default-state = "keep"; // don't touch, was set by U-Boot + }; }; For more product information please see the link below: diff --git a/drivers/leds/leds-pca9532.c b/drivers/leds/leds-pca9532.c index 06e63106ae1e..7fea18b0c15d 100644 --- a/drivers/leds/leds-pca9532.c +++ b/drivers/leds/leds-pca9532.c @@ -254,6 +254,21 @@ static void pca9532_input_work(struct work_struct *work) mutex_unlock(&data->update_lock); } +static enum pca9532_state pca9532_getled(struct pca9532_led *led) +{ + struct i2c_client *client = led->client; + struct pca9532_data *data = i2c_get_clientdata(client); + u8 maxleds = data->chip_info->num_leds; + char reg; + enum pca9532_state ret; + + mutex_lock(&data->update_lock); + reg = i2c_smbus_read_byte_data(client, LED_REG(maxleds, led->id)); + ret = reg >> LED_NUM(led->id)/2; + mutex_unlock(&data->update_lock); + return ret; +} + #ifdef CONFIG_LEDS_PCA9532_GPIO static int pca9532_gpio_request_pin(struct gpio_chip *gc, unsigned offset) { @@ -366,7 +381,10 @@ static int pca9532_configure(struct i2c_client *client, gpios++; break; case PCA9532_TYPE_LED: - led->state = pled->state; + if (pled->state == PCA9532_KEEP) + led->state = pca9532_getled(led); + else + led->state = pled->state; led->name = pled->name; led->ldev.name = led->name; led->ldev.default_trigger = pled->default_trigger; @@ -456,6 +474,7 @@ pca9532_of_populate_pdata(struct device *dev, struct device_node *np) const struct of_device_id *match; int devid, maxleds; int i = 0; + const char *state; match = of_match_device(of_pca9532_leds_match, dev); if (!match) @@ -475,6 +494,12 @@ pca9532_of_populate_pdata(struct device *dev, struct device_node *np) of_property_read_u32(child, "type", &pdata->leds[i].type); of_property_read_string(child, "linux,default-trigger", &pdata->leds[i].default_trigger); + if (!of_property_read_string(child, "default-state", &state)) { + if (!strcmp(state, "on")) + pdata->leds[i].state = PCA9532_ON; + else if (!strcmp(state, "keep")) + pdata->leds[i].state = PCA9532_KEEP; + } if (++i >= maxleds) 
{ of_node_put(child); break; diff --git a/include/linux/leds-pca9532.h b/include/linux/leds-pca9532.h index d215b4561180..5e240b2b4d58 100644 --- a/include/linux/leds-pca9532.h +++ b/include/linux/leds-pca9532.h @@ -22,7 +22,8 @@ enum pca9532_state { PCA9532_OFF = 0x0, PCA9532_ON = 0x1, PCA9532_PWM0 = 0x2, - PCA9532_PWM1 = 0x3 + PCA9532_PWM1 = 0x3, + PCA9532_KEEP = 0xff, }; struct pca9532_led { @@ -44,4 +45,3 @@ struct pca9532_platform_data { }; #endif /* __LINUX_PCA9532_H */ - -- cgit v1.2.3 From 4a67c9fde04fc1b6752fa68c495310ca3ed29eeb Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Fri, 31 Mar 2017 11:11:48 +0200 Subject: mtd: use dev_of_node helper in mtd_get_of_node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows better compile-time optimizations with CONFIG_OF disabled. Signed-off-by: Rafał Miłecki Acked-by: Boris Brezillon Signed-off-by: Brian Norris --- include/linux/mtd/mtd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index eebdc63cf6af..f8db5b2e4028 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -393,7 +393,7 @@ static inline void mtd_set_of_node(struct mtd_info *mtd, static inline struct device_node *mtd_get_of_node(struct mtd_info *mtd) { - return mtd->dev.of_node; + return dev_of_node(&mtd->dev); } static inline int mtd_oobavail(struct mtd_info *mtd, struct mtd_oob_ops *ops) -- cgit v1.2.3 From cf9ea8ca4a0bea7eda12f8fb04dc34146839a215 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Wed, 19 Apr 2017 17:48:51 +0100 Subject: linux/io.h: Add pci_remap_cfgspace() interface The PCI specifications (Rev 3.0, 3.2.5 "Transaction Ordering and Posting") mandate non-posted configuration transactions. As further highlighted in the PCIe specifications (4.0 - Rev0.3, "Ordering Considerations for the Enhanced Configuration Access Mechanism"), with ECAM and ECAM-derivative configuration mechanisms, memory-mapped transactions from the host CPU that become Configuration Requests on the PCI express fabric may create ordering problems for software, because writes to memory addresses are typically posted transactions (unless the architecture can enforce non-posted write behaviour through the virtual address mapping) but writes to Configuration Space are not posted on the PCI express fabric. Current DT and ACPI host bridge controllers map PCI configuration space (ECAM and ECAM-derivative) into the virtual address space through ioremap() calls, which are non-cacheable device accesses on most architectures, but may provide "bufferable" or "posted" write semantics on architectures, e.g. ARM/ARM64, that allow writes to ioremap'ed regions to be buffered in the bus connecting the host CPU to the PCI fabric; this behaviour, as underlined in the PCIe specifications, may violate the transaction ordering rules and must be prevented. Introduce a new generic and explicit API to create a memory mapping for ECAM and ECAM-derivative config space areas. It defaults to ioremap_nocache() (which should provide a sane default behaviour), while still allowing architectures on which ioremap_nocache() results in posted write transactions to override the function call with an arch-specific implementation that complies with the PCI specifications for configuration transactions.
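For illustration only (the "res" resource variable below is hypothetical), a host bridge driver would map its ECAM window with the new interface instead of a plain ioremap() call:

	/* Map the ECAM region, with non-posted write semantics where the arch provides them. */
	void __iomem *ecam_base = pci_remap_cfgspace(res->start, resource_size(res));

	if (!ecam_base)
		return -ENOMEM;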
[bhelgaas: fold in #ifdef CONFIG_PCI wrapper] Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Cc: Arnd Bergmann Cc: Will Deacon Cc: Russell King Cc: Catalin Marinas --- include/linux/io.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io.h b/include/linux/io.h index 82ef36eac8a1..2195d9ea4aaa 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -90,6 +90,27 @@ void devm_memunmap(struct device *dev, void *addr); void *__devm_memremap_pages(struct device *dev, struct resource *res); +#ifdef CONFIG_PCI +/* + * The PCI specifications (Rev 3.0, 3.2.5 "Transaction Ordering and + * Posting") mandate non-posted configuration transactions. There is + * no ioremap API in the kernel that can guarantee non-posted write + * semantics across arches so provide a default implementation for + * mapping PCI config space that defaults to ioremap_nocache(); arches + * should override it if they have memory mapping implementations that + * guarantee non-posted write semantics to make the memory mapping + * compliant with the PCI specification. + */ +#ifndef pci_remap_cfgspace +#define pci_remap_cfgspace pci_remap_cfgspace +static inline void __iomem *pci_remap_cfgspace(phys_addr_t offset, + size_t size) +{ + return ioremap_nocache(offset, size); +} +#endif +#endif + /* * Some systems do not have legacy ISA devices. * /dev/port is not a valid interface on these systems. -- cgit v1.2.3 From 8661423eea1a1b58417014716e3f1ba286072379 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 19 Apr 2017 14:02:08 +0200 Subject: ACPI / utils: Add new acpi_dev_present helper acpi_dev_found just iterates over all ACPI ids and sees if one matches. This means that it will return true for devices which are in the DSDT but disabled (their _STA method returns 0). For some drivers it is useful to be able to check whether a certain HID is not only present in the namespace, but also actually present, i.e. whether acpi_device_is_present() would return true for the device. For example, if a certain device is present, the driver may want to use an extcon or IIO ADC channel provided by that device. This commit adds a new acpi_dev_present helper which drivers can use to this end. Like acpi_dev_found, acpi_dev_present takes a HID as its argument, but it also has two extra optional arguments to only check for an ACPI device with a specific UID and/or HRV value. This makes it more generic and allows it to replace custom code doing similar checks in several places. Arguably acpi_dev_present is what acpi_dev_found should have been, but there are too many users to just change acpi_dev_found without the risk of breaking something. Signed-off-by: Hans de Goede Reviewed-by: Lukas Wunner Reviewed-by: Mika Westerberg Signed-off-by: Rafael J.
Wysocki --- drivers/acpi/utils.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++ include/acpi/acpi_bus.h | 1 + include/linux/acpi.h | 5 ++++ 3 files changed, 72 insertions(+) (limited to 'include/linux') diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index 22c09952e177..27d0dcfcf47d 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -736,6 +736,72 @@ bool acpi_dev_found(const char *hid) } EXPORT_SYMBOL(acpi_dev_found); +struct acpi_dev_present_info { + struct acpi_device_id hid[2]; + const char *uid; + s64 hrv; +}; + +static int acpi_dev_present_cb(struct device *dev, void *data) +{ + struct acpi_device *adev = to_acpi_device(dev); + struct acpi_dev_present_info *match = data; + unsigned long long hrv; + acpi_status status; + + if (acpi_match_device_ids(adev, match->hid)) + return 0; + + if (match->uid && (!adev->pnp.unique_id || + strcmp(adev->pnp.unique_id, match->uid))) + return 0; + + if (match->hrv == -1) + return 1; + + status = acpi_evaluate_integer(adev->handle, "_HRV", NULL, &hrv); + if (ACPI_FAILURE(status)) + return 0; + + return hrv == match->hrv; +} + +/** + * acpi_dev_present - Detect that a given ACPI device is present + * @hid: Hardware ID of the device. + * @uid: Unique ID of the device, pass NULL to not check _UID + * @hrv: Hardware Revision of the device, pass -1 to not check _HRV + * + * Return %true if a matching device was present at the moment of invocation. + * Note that if the device is pluggable, it may since have disappeared. + * + * Note that unlike acpi_dev_found() this function checks the status + * of the device. So for devices which are present in the dsdt, but + * which are disabled (their _STA callback returns 0) this function + * will return false. + * + * For this function to work, acpi_bus_scan() must have been executed + * which happens in the subsys_initcall() subsection. Hence, do not + * call from a subsys_initcall() or earlier (use acpi_get_devices() + * instead). Calling from module_init() is fine (which is synonymous + * with device_initcall()). + */ +bool acpi_dev_present(const char *hid, const char *uid, s64 hrv) +{ + struct acpi_dev_present_info match = {}; + struct device *dev; + + strlcpy(match.hid[0].id, hid, sizeof(match.hid[0].id)); + match.uid = uid; + match.hrv = hrv; + + dev = bus_find_device(&acpi_bus_type, NULL, &match, + acpi_dev_present_cb); + + return !!dev; +} +EXPORT_SYMBOL(acpi_dev_present); + /* * acpi_backlight= handling, this is done here rather then in video_detect.c * because __setup cannot be used in modules. 
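A usage illustration (the HID, UID and _HRV values here are made up, and setup_companion() is a hypothetical helper):

	/* Bail out unless an enabled device with this HID, _UID "1" and _HRV == 2 exists. */
	if (!acpi_dev_present("ABCD0001", "1", 2))
		return -ENODEV;

	/* The UID and HRV checks are optional: pass NULL / -1 to skip them. */
	if (acpi_dev_present("ABCD0001", NULL, -1))
		setup_companion();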
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index ef0ae8aaa567..b53c058e7009 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -88,6 +88,7 @@ acpi_evaluate_dsm_typed(acpi_handle handle, const u8 *uuid, u64 rev, u64 func, } bool acpi_dev_found(const char *hid); +bool acpi_dev_present(const char *hid, const char *uid, s64 hrv); #ifdef CONFIG_ACPI diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 9b05886f9773..841a8dc55ade 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -611,6 +611,11 @@ static inline bool acpi_dev_found(const char *hid) return false; } +static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv) +{ + return false; +} + static inline bool is_acpi_node(struct fwnode_handle *fwnode) { return false; -- cgit v1.2.3 From 72058005411ffddcae6c06f7b691d635489132af Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 19 Apr 2017 15:14:31 -0700 Subject: dax: add a facility to lookup a dax device by 'host' device name For the current block_device based filesystem-dax path, we need a way for it to lookup the dax_device associated with a block_device. Add a 'host' property of a dax_device that can be used for this purpose. It is a free form string, but for a dax_device associated with a block device it is the bdev name. This is a stop-gap until filesystems are able to mount on a dax-inode directly. Signed-off-by: Dan Williams --- drivers/dax/dax.h | 2 +- drivers/dax/device.c | 2 +- drivers/dax/super.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++--- include/linux/dax.h | 1 + 4 files changed, 86 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h index 2472d9da96db..246a24d68d4c 100644 --- a/drivers/dax/dax.h +++ b/drivers/dax/dax.h @@ -13,7 +13,7 @@ #ifndef __DAX_H__ #define __DAX_H__ struct dax_device; -struct dax_device *alloc_dax(void *private); +struct dax_device *alloc_dax(void *private, const char *host); void put_dax(struct dax_device *dax_dev); bool dax_alive(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 19a42edbfa03..db68f4fa8ce0 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -645,7 +645,7 @@ struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, goto err_id; } - dax_dev = alloc_dax(dev_dax); + dax_dev = alloc_dax(dev_dax, NULL); if (!dax_dev) goto err_dax; diff --git a/drivers/dax/super.c b/drivers/dax/super.c index c9f85f1c086e..8d446674c1da 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -30,6 +30,10 @@ static DEFINE_IDA(dax_minor_ida); static struct kmem_cache *dax_cache __read_mostly; static struct super_block *dax_superblock __read_mostly; +#define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head)) +static struct hlist_head dax_host_list[DAX_HASH_SIZE]; +static DEFINE_SPINLOCK(dax_host_lock); + int dax_read_lock(void) { return srcu_read_lock(&dax_srcu); @@ -46,12 +50,15 @@ EXPORT_SYMBOL_GPL(dax_read_unlock); * struct dax_device - anchor object for dax services * @inode: core vfs * @cdev: optional character interface for "device dax" + * @host: optional name for lookups where the device path is not available * @private: dax driver private data * @alive: !alive + rcu grace period == no new operations / mappings */ struct dax_device { + struct hlist_node list; struct inode inode; struct cdev cdev; + const char *host; void *private; bool alive; }; @@ -63,6 +70,11 @@ bool dax_alive(struct 
dax_device *dax_dev) } EXPORT_SYMBOL_GPL(dax_alive); +static int dax_host_hash(const char *host) +{ + return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE; +} + /* * Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring * that any fault handlers or operations that might have seen @@ -75,7 +87,13 @@ void kill_dax(struct dax_device *dax_dev) return; dax_dev->alive = false; + synchronize_srcu(&dax_srcu); + + spin_lock(&dax_host_lock); + hlist_del_init(&dax_dev->list); + spin_unlock(&dax_host_lock); + dax_dev->private = NULL; } EXPORT_SYMBOL_GPL(kill_dax); @@ -98,6 +116,8 @@ static void dax_i_callback(struct rcu_head *head) struct inode *inode = container_of(head, struct inode, i_rcu); struct dax_device *dax_dev = to_dax_dev(inode); + kfree(dax_dev->host); + dax_dev->host = NULL; ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev)); kmem_cache_free(dax_cache, dax_dev); } @@ -169,26 +189,53 @@ static struct dax_device *dax_dev_get(dev_t devt) return dax_dev; } -struct dax_device *alloc_dax(void *private) +static void dax_add_host(struct dax_device *dax_dev, const char *host) +{ + int hash; + + /* + * Unconditionally init dax_dev since it's coming from a + * non-zeroed slab cache + */ + INIT_HLIST_NODE(&dax_dev->list); + dax_dev->host = host; + if (!host) + return; + + hash = dax_host_hash(host); + spin_lock(&dax_host_lock); + hlist_add_head(&dax_dev->list, &dax_host_list[hash]); + spin_unlock(&dax_host_lock); +} + +struct dax_device *alloc_dax(void *private, const char *__host) { struct dax_device *dax_dev; + const char *host; dev_t devt; int minor; + host = kstrdup(__host, GFP_KERNEL); + if (__host && !host) + return NULL; + minor = ida_simple_get(&dax_minor_ida, 0, nr_dax, GFP_KERNEL); if (minor < 0) - return NULL; + goto err_minor; devt = MKDEV(MAJOR(dax_devt), minor); dax_dev = dax_dev_get(devt); if (!dax_dev) - goto err_inode; + goto err_dev; + dax_add_host(dax_dev, host); dax_dev->private = private; return dax_dev; - err_inode: + err_dev: ida_simple_remove(&dax_minor_ida, minor); + err_minor: + kfree(host); return NULL; } EXPORT_SYMBOL_GPL(alloc_dax); @@ -201,6 +248,38 @@ void put_dax(struct dax_device *dax_dev) } EXPORT_SYMBOL_GPL(put_dax); +/** + * dax_get_by_host() - temporary lookup mechanism for filesystem-dax + * @host: alternate name for the device registered by a dax driver + */ +struct dax_device *dax_get_by_host(const char *host) +{ + struct dax_device *dax_dev, *found = NULL; + int hash, id; + + if (!host) + return NULL; + + hash = dax_host_hash(host); + + id = dax_read_lock(); + spin_lock(&dax_host_lock); + hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) { + if (!dax_alive(dax_dev) + || strcmp(host, dax_dev->host) != 0) + continue; + + if (igrab(&dax_dev->inode)) + found = dax_dev; + break; + } + spin_unlock(&dax_host_lock); + dax_read_unlock(id); + + return found; +} +EXPORT_SYMBOL_GPL(dax_get_by_host); + /** * inode_dax: convert a public inode into its dax_dev * @inode: An inode with i_cdev pointing to a dax_dev diff --git a/include/linux/dax.h b/include/linux/dax.h index 5b62f5d19aea..9b2d5ba10d7d 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -10,6 +10,7 @@ struct iomap_ops; int dax_read_lock(void); void dax_read_unlock(int id); +struct dax_device *dax_get_by_host(const char *host); /* * We use lowest available bit in exceptional entry for locking, one bit for -- cgit v1.2.3 From 6568b08b77816cda2a95919c7494108d983d5941 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 24 Jan 2017 18:44:18 -0800 Subject: dax: 
introduce dax_operations Track a set of dax_operations per dax_device that can be set at alloc_dax() time. These operations will be used to stop the abuse of block_device_operations for communicating dax capabilities to filesystems. It will also be used to replace the "pmem api" and move pmem-specific cache maintenance, and other dax-driver-specific filesystem-dax operations, to dax device methods. In particular this allows us to stop abusing __copy_user_nocache(), via memcpy_to_pmem(), with a driver specific replacement. This is a standalone introduction of the operations. Follow on patches convert each dax-driver and teach fs/dax.c to use ->direct_access() from dax_operations instead of block_device_operations. Suggested-by: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/dax/dax.h | 4 +++- drivers/dax/device.c | 6 +++++- drivers/dax/super.c | 6 +++++- include/linux/dax.h | 10 ++++++++++ 4 files changed, 23 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h index 246a24d68d4c..617bbc24be2b 100644 --- a/drivers/dax/dax.h +++ b/drivers/dax/dax.h @@ -13,7 +13,9 @@ #ifndef __DAX_H__ #define __DAX_H__ struct dax_device; -struct dax_device *alloc_dax(void *private, const char *host); +struct dax_operations; +struct dax_device *alloc_dax(void *private, const char *host, + const struct dax_operations *ops); void put_dax(struct dax_device *dax_dev); bool dax_alive(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); diff --git a/drivers/dax/device.c b/drivers/dax/device.c index db68f4fa8ce0..a0db055054a4 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -645,7 +645,11 @@ struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region, goto err_id; } - dax_dev = alloc_dax(dev_dax, NULL); + /* + * No 'host' or dax_operations since there is no access to this + * device outside of mmap of the resulting character device. + */ + dax_dev = alloc_dax(dev_dax, NULL, NULL); if (!dax_dev) goto err_dax; diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 8d446674c1da..1a58542ee8fd 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -17,6 +17,7 @@ #include #include #include +#include #include static int nr_dax = CONFIG_NR_DEV_DAX; @@ -61,6 +62,7 @@ struct dax_device { const char *host; void *private; bool alive; + const struct dax_operations *ops; }; bool dax_alive(struct dax_device *dax_dev) @@ -208,7 +210,8 @@ static void dax_add_host(struct dax_device *dax_dev, const char *host) spin_unlock(&dax_host_lock); } -struct dax_device *alloc_dax(void *private, const char *__host) +struct dax_device *alloc_dax(void *private, const char *__host, + const struct dax_operations *ops) { struct dax_device *dax_dev; const char *host; @@ -229,6 +232,7 @@ struct dax_device *alloc_dax(void *private, const char *__host) goto err_dev; dax_add_host(dax_dev, host); + dax_dev->ops = ops; dax_dev->private = private; return dax_dev; diff --git a/include/linux/dax.h b/include/linux/dax.h index 9b2d5ba10d7d..74ebb92b625a 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -7,6 +7,16 @@ #include struct iomap_ops; +struct dax_device; +struct dax_operations { + /* + * direct_access: translate a device-relative + * logical-page-offset into an absolute physical pfn. Return the + * number of pages available for DAX at that pfn. 
+ */ + long (*direct_access)(struct dax_device *, pgoff_t, long, + void **, pfn_t *); +}; int dax_read_lock(void); void dax_read_unlock(int id); -- cgit v1.2.3 From c1d6e828a35df524df2af277eedd1471d05e4f4c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 24 Jan 2017 23:02:09 -0800 Subject: pmem: add dax_operations support Setup a dax_device to have the same lifetime as the pmem block device and add a ->direct_access() method that is equivalent to pmem_direct_access(). Once fs/dax.c has been converted to use dax_operations the old pmem_direct_access() will be removed. Signed-off-by: Dan Williams --- drivers/dax/dax.h | 7 ----- drivers/nvdimm/Kconfig | 1 + drivers/nvdimm/pmem.c | 61 ++++++++++++++++++++++++++++++++--------- drivers/nvdimm/pmem.h | 7 +++-- include/linux/dax.h | 6 ++++ tools/testing/nvdimm/pmem-dax.c | 21 +++++++------- 6 files changed, 70 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h index 617bbc24be2b..f9e5feea742c 100644 --- a/drivers/dax/dax.h +++ b/drivers/dax/dax.h @@ -13,13 +13,6 @@ #ifndef __DAX_H__ #define __DAX_H__ struct dax_device; -struct dax_operations; -struct dax_device *alloc_dax(void *private, const char *host, - const struct dax_operations *ops); -void put_dax(struct dax_device *dax_dev); -bool dax_alive(struct dax_device *dax_dev); -void kill_dax(struct dax_device *dax_dev); struct dax_device *inode_dax(struct inode *inode); struct inode *dax_inode(struct dax_device *dax_dev); -void *dax_get_private(struct dax_device *dax_dev); #endif /* __DAX_H__ */ diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index 59e750183b7f..5bdd499b5f4f 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig @@ -20,6 +20,7 @@ if LIBNVDIMM config BLK_DEV_PMEM tristate "PMEM: Persistent memory block device support" default LIBNVDIMM + select DAX select ND_BTT if BTT select ND_PFN if NVDIMM_PFN help diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 5b536be5a12e..fbbcf8154eec 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "pmem.h" #include "pfn.h" @@ -199,13 +200,13 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, } /* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */ -__weak long pmem_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, pfn_t *pfn, long size) +__weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) { - struct pmem_device *pmem = bdev->bd_queue->queuedata; - resource_size_t offset = sector * 512 + pmem->data_offset; + resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; - if (unlikely(is_bad_pmem(&pmem->bb, sector, size))) + if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, + PFN_PHYS(nr_pages)))) return -EIO; *kaddr = pmem->virt_addr + offset; *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags); @@ -215,26 +216,51 @@ __weak long pmem_direct_access(struct block_device *bdev, sector_t sector, * requested range. 
*/ if (unlikely(pmem->bb.count)) - return size; - return pmem->size - pmem->pfn_pad - offset; + return nr_pages; + return PHYS_PFN(pmem->size - pmem->pfn_pad - offset); +} + +static long pmem_blk_direct_access(struct block_device *bdev, sector_t sector, + void **kaddr, pfn_t *pfn, long size) +{ + struct pmem_device *pmem = bdev->bd_queue->queuedata; + + return __pmem_direct_access(pmem, PHYS_PFN(sector * 512), + PHYS_PFN(size), kaddr, pfn); } static const struct block_device_operations pmem_fops = { .owner = THIS_MODULE, .rw_page = pmem_rw_page, - .direct_access = pmem_direct_access, + .direct_access = pmem_blk_direct_access, .revalidate_disk = nvdimm_revalidate_disk, }; +static long pmem_dax_direct_access(struct dax_device *dax_dev, + pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) +{ + struct pmem_device *pmem = dax_get_private(dax_dev); + + return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn); +} + +static const struct dax_operations pmem_dax_ops = { + .direct_access = pmem_dax_direct_access, +}; + static void pmem_release_queue(void *q) { blk_cleanup_queue(q); } -static void pmem_release_disk(void *disk) +static void pmem_release_disk(void *__pmem) { - del_gendisk(disk); - put_disk(disk); + struct pmem_device *pmem = __pmem; + + kill_dax(pmem->dax_dev); + put_dax(pmem->dax_dev); + del_gendisk(pmem->disk); + put_disk(pmem->disk); } static int pmem_attach_disk(struct device *dev, @@ -245,6 +271,7 @@ static int pmem_attach_disk(struct device *dev, struct vmem_altmap __altmap, *altmap = NULL; struct resource *res = &nsio->res; struct nd_pfn *nd_pfn = NULL; + struct dax_device *dax_dev; int nid = dev_to_node(dev); struct nd_pfn_sb *pfn_sb; struct pmem_device *pmem; @@ -325,6 +352,7 @@ static int pmem_attach_disk(struct device *dev, disk = alloc_disk_node(0, nid); if (!disk) return -ENOMEM; + pmem->disk = disk; disk->fops = &pmem_fops; disk->queue = q; @@ -336,9 +364,16 @@ static int pmem_attach_disk(struct device *dev, return -ENOMEM; nvdimm_badblocks_populate(nd_region, &pmem->bb, res); disk->bb = &pmem->bb; - device_add_disk(dev, disk); - if (devm_add_action_or_reset(dev, pmem_release_disk, disk)) + dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops); + if (!dax_dev) { + put_disk(disk); + return -ENOMEM; + } + pmem->dax_dev = dax_dev; + + device_add_disk(dev, disk); + if (devm_add_action_or_reset(dev, pmem_release_disk, pmem)) return -ENOMEM; revalidate_disk(disk); diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h index b4ee4f71b4a1..7f4dbd72a90a 100644 --- a/drivers/nvdimm/pmem.h +++ b/drivers/nvdimm/pmem.h @@ -5,8 +5,6 @@ #include #include -long pmem_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, pfn_t *pfn, long size); /* this definition is in it's own header for tools/testing/nvdimm to consume */ struct pmem_device { /* One contiguous memory region per device */ @@ -20,5 +18,10 @@ struct pmem_device { /* trim size when namespace capacity has been section aligned */ u32 pfn_pad; struct badblocks bb; + struct dax_device *dax_dev; + struct gendisk *disk; }; + +long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn); #endif /* __NVDIMM_PMEM_H__ */ diff --git a/include/linux/dax.h b/include/linux/dax.h index 74ebb92b625a..39a0312c45c3 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -21,6 +21,12 @@ struct dax_operations { int dax_read_lock(void); void dax_read_unlock(int id); struct dax_device *dax_get_by_host(const char *host); +struct dax_device *alloc_dax(void 
*private, const char *host, + const struct dax_operations *ops); +void put_dax(struct dax_device *dax_dev); +bool dax_alive(struct dax_device *dax_dev); +void kill_dax(struct dax_device *dax_dev); +void *dax_get_private(struct dax_device *dax_dev); /* * We use lowest available bit in exceptional entry for locking, one bit for diff --git a/tools/testing/nvdimm/pmem-dax.c b/tools/testing/nvdimm/pmem-dax.c index c9b8c48f85fc..b53596ad601b 100644 --- a/tools/testing/nvdimm/pmem-dax.c +++ b/tools/testing/nvdimm/pmem-dax.c @@ -15,13 +15,13 @@ #include #include -long pmem_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, pfn_t *pfn, long size) +long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) { - struct pmem_device *pmem = bdev->bd_queue->queuedata; - resource_size_t offset = sector * 512 + pmem->data_offset; + resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; - if (unlikely(is_bad_pmem(&pmem->bb, sector, size))) + if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, + PFN_PHYS(nr_pages)))) return -EIO; /* @@ -34,11 +34,10 @@ long pmem_direct_access(struct block_device *bdev, sector_t sector, *kaddr = pmem->virt_addr + offset; page = vmalloc_to_page(pmem->virt_addr + offset); *pfn = page_to_pfn_t(page); - dev_dbg_ratelimited(disk_to_dev(bdev->bd_disk)->parent, - "%s: sector: %#llx pfn: %#lx\n", __func__, - (unsigned long long) sector, page_to_pfn(page)); + pr_debug_ratelimited("%s: pmem: %p pgoff: %#lx pfn: %#lx\n", + __func__, pmem, pgoff, page_to_pfn(page)); - return PAGE_SIZE; + return 1; } *kaddr = pmem->virt_addr + offset; @@ -49,6 +48,6 @@ long pmem_direct_access(struct block_device *bdev, sector_t sector, * requested range. */ if (unlikely(pmem->bb.count)) - return size; - return pmem->size - pmem->pfn_pad - offset; + return nr_pages; + return PHYS_PFN(pmem->size - pmem->pfn_pad - offset); } -- cgit v1.2.3 From da8d7f079b868ceab830309f80efc69d350576f3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 19 Apr 2017 14:01:24 -0700 Subject: block: Export blk_init_request_from_bio() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export this function such that it becomes available to block drivers. Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Cc: Matias Bjørling Cc: Adam Manzanares Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +++-- block/blk-mq.c | 2 +- block/blk.h | 1 - include/linux/blkdev.h | 1 + 4 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 9697b789408f..72544b462657 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1628,7 +1628,7 @@ out: return ret; } -void init_request_from_bio(struct request *req, struct bio *bio) +void blk_init_request_from_bio(struct request *req, struct bio *bio) { if (bio->bi_opf & REQ_RAHEAD) req->cmd_flags |= REQ_FAILFAST_MASK; @@ -1640,6 +1640,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) req->ioprio = bio_prio(bio); blk_rq_bio_prep(req->q, req, bio); } +EXPORT_SYMBOL_GPL(blk_init_request_from_bio); static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) { @@ -1730,7 +1731,7 @@ get_rq: * We don't worry about that case for efficiency. It won't happen * often, and the elevators are able to handle it. 
*/ - init_request_from_bio(req, bio); + blk_init_request_from_bio(req, bio); if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) req->cpu = raw_smp_processor_id(); diff --git a/block/blk-mq.c b/block/blk-mq.c index e2ef7b460924..c496692ecc5b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1424,7 +1424,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) { - init_request_from_bio(rq, bio); + blk_init_request_from_bio(rq, bio); blk_account_io_start(rq, true); } diff --git a/block/blk.h b/block/blk.h index 35b3041eec1a..2ed70228e44f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -60,7 +60,6 @@ void blk_free_flush_queue(struct blk_flush_queue *q); int blk_init_rl(struct request_list *rl, struct request_queue *q, gfp_t gfp_mask); void blk_exit_rl(struct request_list *rl); -void init_request_from_bio(struct request *req, struct bio *bio); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); void blk_queue_bypass_start(struct request_queue *q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5b52b3d7818c..3470375952a1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -924,6 +924,7 @@ extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); extern blk_qc_t generic_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); +extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request(struct request_queue *, int, gfp_t); -- cgit v1.2.3 From 0be0dee64eacd950f8e4b6c45adb5a92392eaaaf Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 19 Apr 2017 14:01:27 -0700 Subject: block: Inline blk_rq_set_prio() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since only a single caller remains, inline blk_rq_set_prio(). Initialize req->ioprio even if no I/O priority has been set in the bio nor in the I/O context. 
Signed-off-by: Bart Van Assche Reviewed-by: Adam Manzanares Tested-by: Adam Manzanares Reviewed-by: Christoph Hellwig Cc: Matias Bjørling Signed-off-by: Jens Axboe --- block/blk-core.c | 7 ++++++- include/linux/blkdev.h | 14 -------------- 2 files changed, 6 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 72544b462657..25aea293ee98 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1630,14 +1630,19 @@ out: void blk_init_request_from_bio(struct request *req, struct bio *bio) { + struct io_context *ioc = rq_ioc(bio); + if (bio->bi_opf & REQ_RAHEAD) req->cmd_flags |= REQ_FAILFAST_MASK; req->errors = 0; req->__sector = bio->bi_iter.bi_sector; - blk_rq_set_prio(req, rq_ioc(bio)); if (ioprio_valid(bio_prio(bio))) req->ioprio = bio_prio(bio); + else if (ioc) + req->ioprio = ioc->ioprio; + else + req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); blk_rq_bio_prep(req->q, req, bio); } EXPORT_SYMBOL_GPL(blk_init_request_from_bio); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3470375952a1..51c9e391798e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1087,20 +1087,6 @@ static inline unsigned int blk_rq_count_bios(struct request *rq) return nr_bios; } -/* - * blk_rq_set_prio - associate a request with prio from ioc - * @rq: request of interest - * @ioc: target iocontext - * - * Assocate request prio with ioc prio so request based drivers - * can leverage priority information. - */ -static inline void blk_rq_set_prio(struct request *rq, struct io_context *ioc) -{ - if (ioc) - rq->ioprio = ioc->ioprio; -} - /* * Request issue related functions. */ -- cgit v1.2.3 From f9b67f0014cba18f1aabb6fa9272335a043eb6fd Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 19 Apr 2017 13:36:10 -0600 Subject: dma-buf: Rename dma-ops to prevent conflict with kunmap_atomic macro Seeing the kunmap_atomic dma_buf_ops share the same name with a macro in highmem.h, the former can be aliased if any dma-buf user includes that header. I'm personally trying to include highmem.h inside scatterlist.h and this breaks the dma-buf code proper. Christoph Hellwig suggested [1] renaming it and pushing this patch ASAP. To maintain consistency I've renamed all four of kmap* and kunmap* to be map* and unmap*. (Even though only kmap_atomic presently conflicts.) 
[1] https://www.spinics.net/lists/target-devel/msg15070.html Signed-off-by: Logan Gunthorpe Reviewed-by: Sinclair Yeh Acked-by: Daniel Vetter Acked-by: Sumit Semwal Signed-off-by: Sumit Semwal Link: http://patchwork.freedesktop.org/patch/msgid/1492630570-879-1-git-send-email-logang@deltatee.com --- drivers/dma-buf/dma-buf.c | 16 ++++++++-------- drivers/gpu/drm/armada/armada_gem.c | 8 ++++---- drivers/gpu/drm/drm_prime.c | 8 ++++---- drivers/gpu/drm/i915/i915_gem_dmabuf.c | 8 ++++---- drivers/gpu/drm/i915/selftests/mock_dmabuf.c | 8 ++++---- drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c | 8 ++++---- drivers/gpu/drm/tegra/gem.c | 8 ++++---- drivers/gpu/drm/udl/udl_dmabuf.c | 8 ++++---- drivers/gpu/drm/vmwgfx/vmwgfx_prime.c | 8 ++++---- drivers/media/v4l2-core/videobuf2-dma-contig.c | 4 ++-- drivers/media/v4l2-core/videobuf2-dma-sg.c | 4 ++-- drivers/media/v4l2-core/videobuf2-vmalloc.c | 4 ++-- drivers/staging/android/ion/ion.c | 8 ++++---- include/linux/dma-buf.h | 22 +++++++++++----------- 14 files changed, 61 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index f72aaacbe023..512bdbc23bbb 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -405,8 +405,8 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info) || !exp_info->ops->map_dma_buf || !exp_info->ops->unmap_dma_buf || !exp_info->ops->release - || !exp_info->ops->kmap_atomic - || !exp_info->ops->kmap + || !exp_info->ops->map_atomic + || !exp_info->ops->map || !exp_info->ops->mmap)) { return ERR_PTR(-EINVAL); } @@ -872,7 +872,7 @@ void *dma_buf_kmap_atomic(struct dma_buf *dmabuf, unsigned long page_num) { WARN_ON(!dmabuf); - return dmabuf->ops->kmap_atomic(dmabuf, page_num); + return dmabuf->ops->map_atomic(dmabuf, page_num); } EXPORT_SYMBOL_GPL(dma_buf_kmap_atomic); @@ -889,8 +889,8 @@ void dma_buf_kunmap_atomic(struct dma_buf *dmabuf, unsigned long page_num, { WARN_ON(!dmabuf); - if (dmabuf->ops->kunmap_atomic) - dmabuf->ops->kunmap_atomic(dmabuf, page_num, vaddr); + if (dmabuf->ops->unmap_atomic) + dmabuf->ops->unmap_atomic(dmabuf, page_num, vaddr); } EXPORT_SYMBOL_GPL(dma_buf_kunmap_atomic); @@ -907,7 +907,7 @@ void *dma_buf_kmap(struct dma_buf *dmabuf, unsigned long page_num) { WARN_ON(!dmabuf); - return dmabuf->ops->kmap(dmabuf, page_num); + return dmabuf->ops->map(dmabuf, page_num); } EXPORT_SYMBOL_GPL(dma_buf_kmap); @@ -924,8 +924,8 @@ void dma_buf_kunmap(struct dma_buf *dmabuf, unsigned long page_num, { WARN_ON(!dmabuf); - if (dmabuf->ops->kunmap) - dmabuf->ops->kunmap(dmabuf, page_num, vaddr); + if (dmabuf->ops->unmap) + dmabuf->ops->unmap(dmabuf, page_num, vaddr); } EXPORT_SYMBOL_GPL(dma_buf_kunmap); diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c index 1597458d884e..d6c2a5d190eb 100644 --- a/drivers/gpu/drm/armada/armada_gem.c +++ b/drivers/gpu/drm/armada/armada_gem.c @@ -529,10 +529,10 @@ static const struct dma_buf_ops armada_gem_prime_dmabuf_ops = { .map_dma_buf = armada_gem_prime_map_dma_buf, .unmap_dma_buf = armada_gem_prime_unmap_dma_buf, .release = drm_gem_dmabuf_release, - .kmap_atomic = armada_gem_dmabuf_no_kmap, - .kunmap_atomic = armada_gem_dmabuf_no_kunmap, - .kmap = armada_gem_dmabuf_no_kmap, - .kunmap = armada_gem_dmabuf_no_kunmap, + .map_atomic = armada_gem_dmabuf_no_kmap, + .unmap_atomic = armada_gem_dmabuf_no_kunmap, + .map = armada_gem_dmabuf_no_kmap, + .unmap = armada_gem_dmabuf_no_kunmap, .mmap = armada_gem_dmabuf_mmap, }; diff --git 
a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c index 9fb65b736a90..954eb848b5e2 100644 --- a/drivers/gpu/drm/drm_prime.c +++ b/drivers/gpu/drm/drm_prime.c @@ -403,10 +403,10 @@ static const struct dma_buf_ops drm_gem_prime_dmabuf_ops = { .map_dma_buf = drm_gem_map_dma_buf, .unmap_dma_buf = drm_gem_unmap_dma_buf, .release = drm_gem_dmabuf_release, - .kmap = drm_gem_dmabuf_kmap, - .kmap_atomic = drm_gem_dmabuf_kmap_atomic, - .kunmap = drm_gem_dmabuf_kunmap, - .kunmap_atomic = drm_gem_dmabuf_kunmap_atomic, + .map = drm_gem_dmabuf_kmap, + .map_atomic = drm_gem_dmabuf_kmap_atomic, + .unmap = drm_gem_dmabuf_kunmap, + .unmap_atomic = drm_gem_dmabuf_kunmap_atomic, .mmap = drm_gem_dmabuf_mmap, .vmap = drm_gem_dmabuf_vmap, .vunmap = drm_gem_dmabuf_vunmap, diff --git a/drivers/gpu/drm/i915/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/i915_gem_dmabuf.c index 11898cd97596..f225bf680b6d 100644 --- a/drivers/gpu/drm/i915/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/i915_gem_dmabuf.c @@ -200,10 +200,10 @@ static const struct dma_buf_ops i915_dmabuf_ops = { .map_dma_buf = i915_gem_map_dma_buf, .unmap_dma_buf = i915_gem_unmap_dma_buf, .release = drm_gem_dmabuf_release, - .kmap = i915_gem_dmabuf_kmap, - .kmap_atomic = i915_gem_dmabuf_kmap_atomic, - .kunmap = i915_gem_dmabuf_kunmap, - .kunmap_atomic = i915_gem_dmabuf_kunmap_atomic, + .map = i915_gem_dmabuf_kmap, + .map_atomic = i915_gem_dmabuf_kmap_atomic, + .unmap = i915_gem_dmabuf_kunmap, + .unmap_atomic = i915_gem_dmabuf_kunmap_atomic, .mmap = i915_gem_dmabuf_mmap, .vmap = i915_gem_dmabuf_vmap, .vunmap = i915_gem_dmabuf_vunmap, diff --git a/drivers/gpu/drm/i915/selftests/mock_dmabuf.c b/drivers/gpu/drm/i915/selftests/mock_dmabuf.c index 99da8f4ef497..302f7d103635 100644 --- a/drivers/gpu/drm/i915/selftests/mock_dmabuf.c +++ b/drivers/gpu/drm/i915/selftests/mock_dmabuf.c @@ -129,10 +129,10 @@ static const struct dma_buf_ops mock_dmabuf_ops = { .map_dma_buf = mock_map_dma_buf, .unmap_dma_buf = mock_unmap_dma_buf, .release = mock_dmabuf_release, - .kmap = mock_dmabuf_kmap, - .kmap_atomic = mock_dmabuf_kmap_atomic, - .kunmap = mock_dmabuf_kunmap, - .kunmap_atomic = mock_dmabuf_kunmap_atomic, + .map = mock_dmabuf_kmap, + .map_atomic = mock_dmabuf_kmap_atomic, + .unmap = mock_dmabuf_kunmap, + .unmap_atomic = mock_dmabuf_kunmap_atomic, .mmap = mock_dmabuf_mmap, .vmap = mock_dmabuf_vmap, .vunmap = mock_dmabuf_vunmap, diff --git a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c index ee5883f59be5..0dbe0306953d 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c +++ b/drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c @@ -160,10 +160,10 @@ static struct dma_buf_ops omap_dmabuf_ops = { .release = omap_gem_dmabuf_release, .begin_cpu_access = omap_gem_dmabuf_begin_cpu_access, .end_cpu_access = omap_gem_dmabuf_end_cpu_access, - .kmap_atomic = omap_gem_dmabuf_kmap_atomic, - .kunmap_atomic = omap_gem_dmabuf_kunmap_atomic, - .kmap = omap_gem_dmabuf_kmap, - .kunmap = omap_gem_dmabuf_kunmap, + .map_atomic = omap_gem_dmabuf_kmap_atomic, + .unmap_atomic = omap_gem_dmabuf_kunmap_atomic, + .map = omap_gem_dmabuf_kmap, + .unmap = omap_gem_dmabuf_kunmap, .mmap = omap_gem_dmabuf_mmap, }; diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c index 17e62ecb5d4d..8672f5d2f237 100644 --- a/drivers/gpu/drm/tegra/gem.c +++ b/drivers/gpu/drm/tegra/gem.c @@ -619,10 +619,10 @@ static const struct dma_buf_ops tegra_gem_prime_dmabuf_ops = { .map_dma_buf = tegra_gem_prime_map_dma_buf, .unmap_dma_buf = tegra_gem_prime_unmap_dma_buf, 
.release = tegra_gem_prime_release, - .kmap_atomic = tegra_gem_prime_kmap_atomic, - .kunmap_atomic = tegra_gem_prime_kunmap_atomic, - .kmap = tegra_gem_prime_kmap, - .kunmap = tegra_gem_prime_kunmap, + .map_atomic = tegra_gem_prime_kmap_atomic, + .unmap_atomic = tegra_gem_prime_kunmap_atomic, + .map = tegra_gem_prime_kmap, + .unmap = tegra_gem_prime_kunmap, .mmap = tegra_gem_prime_mmap, .vmap = tegra_gem_prime_vmap, .vunmap = tegra_gem_prime_vunmap, diff --git a/drivers/gpu/drm/udl/udl_dmabuf.c b/drivers/gpu/drm/udl/udl_dmabuf.c index ac90ffdb5912..ed0e636243b2 100644 --- a/drivers/gpu/drm/udl/udl_dmabuf.c +++ b/drivers/gpu/drm/udl/udl_dmabuf.c @@ -191,10 +191,10 @@ static struct dma_buf_ops udl_dmabuf_ops = { .detach = udl_detach_dma_buf, .map_dma_buf = udl_map_dma_buf, .unmap_dma_buf = udl_unmap_dma_buf, - .kmap = udl_dmabuf_kmap, - .kmap_atomic = udl_dmabuf_kmap_atomic, - .kunmap = udl_dmabuf_kunmap, - .kunmap_atomic = udl_dmabuf_kunmap_atomic, + .map = udl_dmabuf_kmap, + .map_atomic = udl_dmabuf_kmap_atomic, + .unmap = udl_dmabuf_kunmap, + .unmap_atomic = udl_dmabuf_kunmap_atomic, .mmap = udl_dmabuf_mmap, .release = drm_gem_dmabuf_release, }; diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c b/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c index 31fe32d8d65a..0d42a46521fc 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_prime.c @@ -108,10 +108,10 @@ const struct dma_buf_ops vmw_prime_dmabuf_ops = { .map_dma_buf = vmw_prime_map_dma_buf, .unmap_dma_buf = vmw_prime_unmap_dma_buf, .release = NULL, - .kmap = vmw_prime_dmabuf_kmap, - .kmap_atomic = vmw_prime_dmabuf_kmap_atomic, - .kunmap = vmw_prime_dmabuf_kunmap, - .kunmap_atomic = vmw_prime_dmabuf_kunmap_atomic, + .map = vmw_prime_dmabuf_kmap, + .map_atomic = vmw_prime_dmabuf_kmap_atomic, + .unmap = vmw_prime_dmabuf_kunmap, + .unmap_atomic = vmw_prime_dmabuf_kunmap_atomic, .mmap = vmw_prime_dmabuf_mmap, .vmap = vmw_prime_dmabuf_vmap, .vunmap = vmw_prime_dmabuf_vunmap, diff --git a/drivers/media/v4l2-core/videobuf2-dma-contig.c b/drivers/media/v4l2-core/videobuf2-dma-contig.c index fb6a177be461..2db0413f5d57 100644 --- a/drivers/media/v4l2-core/videobuf2-dma-contig.c +++ b/drivers/media/v4l2-core/videobuf2-dma-contig.c @@ -356,8 +356,8 @@ static struct dma_buf_ops vb2_dc_dmabuf_ops = { .detach = vb2_dc_dmabuf_ops_detach, .map_dma_buf = vb2_dc_dmabuf_ops_map, .unmap_dma_buf = vb2_dc_dmabuf_ops_unmap, - .kmap = vb2_dc_dmabuf_ops_kmap, - .kmap_atomic = vb2_dc_dmabuf_ops_kmap, + .map = vb2_dc_dmabuf_ops_kmap, + .map_atomic = vb2_dc_dmabuf_ops_kmap, .vmap = vb2_dc_dmabuf_ops_vmap, .mmap = vb2_dc_dmabuf_ops_mmap, .release = vb2_dc_dmabuf_ops_release, diff --git a/drivers/media/v4l2-core/videobuf2-dma-sg.c b/drivers/media/v4l2-core/videobuf2-dma-sg.c index ecff8f492c4f..6fd1343b7c13 100644 --- a/drivers/media/v4l2-core/videobuf2-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf2-dma-sg.c @@ -504,8 +504,8 @@ static struct dma_buf_ops vb2_dma_sg_dmabuf_ops = { .detach = vb2_dma_sg_dmabuf_ops_detach, .map_dma_buf = vb2_dma_sg_dmabuf_ops_map, .unmap_dma_buf = vb2_dma_sg_dmabuf_ops_unmap, - .kmap = vb2_dma_sg_dmabuf_ops_kmap, - .kmap_atomic = vb2_dma_sg_dmabuf_ops_kmap, + .map = vb2_dma_sg_dmabuf_ops_kmap, + .map_atomic = vb2_dma_sg_dmabuf_ops_kmap, .vmap = vb2_dma_sg_dmabuf_ops_vmap, .mmap = vb2_dma_sg_dmabuf_ops_mmap, .release = vb2_dma_sg_dmabuf_ops_release, diff --git a/drivers/media/v4l2-core/videobuf2-vmalloc.c b/drivers/media/v4l2-core/videobuf2-vmalloc.c index 3f778147cdef..27d1db3bb8cf 100644 --- 
a/drivers/media/v4l2-core/videobuf2-vmalloc.c +++ b/drivers/media/v4l2-core/videobuf2-vmalloc.c @@ -342,8 +342,8 @@ static struct dma_buf_ops vb2_vmalloc_dmabuf_ops = { .detach = vb2_vmalloc_dmabuf_ops_detach, .map_dma_buf = vb2_vmalloc_dmabuf_ops_map, .unmap_dma_buf = vb2_vmalloc_dmabuf_ops_unmap, - .kmap = vb2_vmalloc_dmabuf_ops_kmap, - .kmap_atomic = vb2_vmalloc_dmabuf_ops_kmap, + .map = vb2_vmalloc_dmabuf_ops_kmap, + .map_atomic = vb2_vmalloc_dmabuf_ops_kmap, .vmap = vb2_vmalloc_dmabuf_ops_vmap, .mmap = vb2_vmalloc_dmabuf_ops_mmap, .release = vb2_vmalloc_dmabuf_ops_release, diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index f45115fce4eb..95a7f1648c00 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -1020,10 +1020,10 @@ static const struct dma_buf_ops dma_buf_ops = { .release = ion_dma_buf_release, .begin_cpu_access = ion_dma_buf_begin_cpu_access, .end_cpu_access = ion_dma_buf_end_cpu_access, - .kmap_atomic = ion_dma_buf_kmap, - .kunmap_atomic = ion_dma_buf_kunmap, - .kmap = ion_dma_buf_kmap, - .kunmap = ion_dma_buf_kunmap, + .map_atomic = ion_dma_buf_kmap, + .unmap_atomic = ion_dma_buf_kunmap, + .map = ion_dma_buf_kmap, + .unmap = ion_dma_buf_kunmap, }; struct dma_buf *ion_share_dma_buf(struct ion_client *client, diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index bfb3704fc6fc..79f27d60ec66 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -39,13 +39,13 @@ struct dma_buf_attachment; /** * struct dma_buf_ops - operations possible on struct dma_buf - * @kmap_atomic: maps a page from the buffer into kernel address - * space, users may not block until the subsequent unmap call. - * This callback must not sleep. - * @kunmap_atomic: [optional] unmaps a atomically mapped page from the buffer. - * This Callback must not sleep. - * @kmap: maps a page from the buffer into kernel address space. - * @kunmap: [optional] unmaps a page from the buffer. + * @map_atomic: maps a page from the buffer into kernel address + * space, users may not block until the subsequent unmap call. + * This callback must not sleep. + * @unmap_atomic: [optional] unmaps an atomically mapped page from the buffer. + * This callback must not sleep. + * @map: maps a page from the buffer into kernel address space. + * @unmap: [optional] unmaps a page from the buffer. * @vmap: [optional] creates a virtual mapping for the buffer into kernel * address space. Same restrictions as for vmap and friends apply. * @vunmap: [optional] unmaps a vmap from the buffer @@ -206,10 +206,10 @@ struct dma_buf_ops { * to be restarted.
*/ int (*end_cpu_access)(struct dma_buf *, enum dma_data_direction); - void *(*kmap_atomic)(struct dma_buf *, unsigned long); - void (*kunmap_atomic)(struct dma_buf *, unsigned long, void *); - void *(*kmap)(struct dma_buf *, unsigned long); - void (*kunmap)(struct dma_buf *, unsigned long, void *); + void *(*map_atomic)(struct dma_buf *, unsigned long); + void (*unmap_atomic)(struct dma_buf *, unsigned long, void *); + void *(*map)(struct dma_buf *, unsigned long); + void (*unmap)(struct dma_buf *, unsigned long, void *); /** * @mmap: -- cgit v1.2.3 From 0773cea37470f8e080c510fe720fc356cf35df3a Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Tue, 18 Apr 2017 16:30:37 -0700 Subject: clocksource: Use GENMASK_ULL in definition of CLOCKSOURCE_MASK Besides reusing existing code this removes the special case handling for 64-bit masks, which causes clang to raise a shift count overflow warning due to https://bugs.llvm.org//show_bug.cgi?id=10030. Suggested-by: Dmitry Torokhov Signed-off-by: Matthias Kaehlcke Cc: Grant Grundler Cc: Greg Hackmann Cc: Michael Davidson Cc: John Stultz Link: http://lkml.kernel.org/r/20170418233037.70990-1-mka@chromium.org Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index cfc75848a35d..f2b10d9ebd04 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -120,7 +120,7 @@ struct clocksource { #define CLOCK_SOURCE_RESELECT 0x100 /* simplify initialization of mask field */ -#define CLOCKSOURCE_MASK(bits) (u64)((bits) < 64 ? ((1ULL<<(bits))-1) : -1) +#define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0) static inline u32 clocksource_freq2mult(u32 freq, u32 shift_constant, u64 from) { -- cgit v1.2.3 From 58bb100a9de10329ca0d63484e76f27c257e9a2e Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Wed, 19 Apr 2017 13:26:46 +0100 Subject: Clocksource/mips-gic: Remove redundant non devicetree init Malta was the only platform probing this driver from platform code without using device tree. With that code removed, gic_clocksource_init is redundant so remove it. 
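For context, the device-tree probe path that remains is registered from the driver itself rather than from platform code; a minimal sketch, assuming the in-tree "mti,gic-timer" compatible string (illustrative, mirroring what the driver's OF init already does):

    /*
     * Sketch: the DT entry point that replaces the removed platform-code
     * call. gic_clocksource_of_init() obtains the timer frequency from the
     * node's clock (or "clock-frequency" property) instead of taking it as
     * a function argument, so gic_clocksource_init() has no remaining role.
     */
    CLOCKSOURCE_OF_DECLARE(mips_gic_timer, "mti,gic-timer",
                           gic_clocksource_of_init);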
Signed-off-by: Matt Redfearn Cc: linux-mips@linux-mips.org Cc: Jason Cooper Cc: Paul Burton Cc: Daniel Lezcano Cc: Ralf Baechle Link: http://lkml.kernel.org/r/1492604806-23420-2-git-send-email-matt.redfearn@imgtec.com Signed-off-by: Thomas Gleixner --- drivers/clocksource/mips-gic-timer.c | 13 ------------- include/linux/irqchip/mips-gic.h | 1 - 2 files changed, 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/clocksource/mips-gic-timer.c b/drivers/clocksource/mips-gic-timer.c index b6ad9c0a7c55..3f52ee219923 100644 --- a/drivers/clocksource/mips-gic-timer.c +++ b/drivers/clocksource/mips-gic-timer.c @@ -154,19 +154,6 @@ static int __init __gic_clocksource_init(void) return ret; } -void __init gic_clocksource_init(unsigned int frequency) -{ - gic_frequency = frequency; - gic_timer_irq = MIPS_GIC_IRQ_BASE + - GIC_LOCAL_TO_HWIRQ(GIC_LOCAL_INT_COMPARE); - - __gic_clocksource_init(); - gic_clockevent_init(); - - /* And finally start the counter */ - gic_start_count(); -} - static int __init gic_clocksource_of_init(struct device_node *node) { struct clk *clk; diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index 7b49c71c968b..2b0e56619e53 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -258,7 +258,6 @@ extern unsigned int gic_present; extern void gic_init(unsigned long gic_base_addr, unsigned long gic_addrspace_size, unsigned int cpu_vec, unsigned int irqbase); -extern void gic_clocksource_init(unsigned int); extern u64 gic_read_count(void); extern unsigned int gic_get_count_width(void); extern u64 gic_read_compare(void); -- cgit v1.2.3 From 49e0b4658fe6aab5bf6bfe0738a86c1895930ad1 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Wed, 19 Apr 2017 18:21:00 +0530 Subject: kprobes: Convert kprobe_lookup_name() to a function The macro is now pretty long and ugly on powerpc. In the light of further changes needed here, convert it to a __weak variant to be over-ridden with a nicer looking function. Suggested-by: Masami Hiramatsu Acked-by: Masami Hiramatsu Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kprobes.h | 53 ---------------------------------- arch/powerpc/kernel/kprobes.c | 58 ++++++++++++++++++++++++++++++++++++++ arch/powerpc/kernel/optprobes.c | 4 +-- include/linux/kprobes.h | 1 + kernel/kprobes.c | 20 ++++++------- 5 files changed, 69 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h index 0503c98b2117..a843884aafaf 100644 --- a/arch/powerpc/include/asm/kprobes.h +++ b/arch/powerpc/include/asm/kprobes.h @@ -61,59 +61,6 @@ extern kprobe_opcode_t optprobe_template_end[]; #define MAX_OPTINSN_SIZE (optprobe_template_end - optprobe_template_entry) #define RELATIVEJUMP_SIZE sizeof(kprobe_opcode_t) /* 4 bytes */ -#ifdef PPC64_ELF_ABI_v2 -/* PPC64 ABIv2 needs local entry point */ -#define kprobe_lookup_name(name, addr) \ -{ \ - addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); \ - if (addr) \ - addr = (kprobe_opcode_t *)ppc_function_entry(addr); \ -} -#elif defined(PPC64_ELF_ABI_v1) -/* - * 64bit powerpc ABIv1 uses function descriptors: - * - Check for the dot variant of the symbol first. - * - If that fails, try looking up the symbol provided. - * - * This ensures we always get to the actual symbol and not the descriptor. - * Also handle format. 
- */ -#define kprobe_lookup_name(name, addr) \ -{ \ - char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN]; \ - const char *modsym; \ - bool dot_appended = false; \ - if ((modsym = strchr(name, ':')) != NULL) { \ - modsym++; \ - if (*modsym != '\0' && *modsym != '.') { \ - /* Convert to */ \ - strncpy(dot_name, name, modsym - name); \ - dot_name[modsym - name] = '.'; \ - dot_name[modsym - name + 1] = '\0'; \ - strncat(dot_name, modsym, \ - sizeof(dot_name) - (modsym - name) - 2);\ - dot_appended = true; \ - } else { \ - dot_name[0] = '\0'; \ - strncat(dot_name, name, sizeof(dot_name) - 1); \ - } \ - } else if (name[0] != '.') { \ - dot_name[0] = '.'; \ - dot_name[1] = '\0'; \ - strncat(dot_name, name, KSYM_NAME_LEN - 2); \ - dot_appended = true; \ - } else { \ - dot_name[0] = '\0'; \ - strncat(dot_name, name, KSYM_NAME_LEN - 1); \ - } \ - addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name); \ - if (!addr && dot_appended) { \ - /* Let's try the original non-dot symbol lookup */ \ - addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); \ - } \ -} -#endif - #define flush_insn_slot(p) do { } while (0) #define kretprobe_blacklist_size 0 diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index fce05a38851c..e5f518a962c6 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -42,6 +42,64 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}}; +kprobe_opcode_t *kprobe_lookup_name(const char *name) +{ + kprobe_opcode_t *addr; + +#ifdef PPC64_ELF_ABI_v2 + /* PPC64 ABIv2 needs local entry point */ + addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); + if (addr) + addr = (kprobe_opcode_t *)ppc_function_entry(addr); +#elif defined(PPC64_ELF_ABI_v1) + /* + * 64bit powerpc ABIv1 uses function descriptors: + * - Check for the dot variant of the symbol first. + * - If that fails, try looking up the symbol provided. + * + * This ensures we always get to the actual symbol and not + * the descriptor. + * + * Also handle format. + */ + char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN]; + const char *modsym; + bool dot_appended = false; + if ((modsym = strchr(name, ':')) != NULL) { + modsym++; + if (*modsym != '\0' && *modsym != '.') { + /* Convert to */ + strncpy(dot_name, name, modsym - name); + dot_name[modsym - name] = '.'; + dot_name[modsym - name + 1] = '\0'; + strncat(dot_name, modsym, + sizeof(dot_name) - (modsym - name) - 2); + dot_appended = true; + } else { + dot_name[0] = '\0'; + strncat(dot_name, name, sizeof(dot_name) - 1); + } + } else if (name[0] != '.') { + dot_name[0] = '.'; + dot_name[1] = '\0'; + strncat(dot_name, name, KSYM_NAME_LEN - 2); + dot_appended = true; + } else { + dot_name[0] = '\0'; + strncat(dot_name, name, KSYM_NAME_LEN - 1); + } + addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name); + if (!addr && dot_appended) { + /* Let's try the original non-dot symbol lookup */ + addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); + } +#else + addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); +#endif + + return addr; +} + int __kprobes arch_prepare_kprobe(struct kprobe *p) { int ret = 0; diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index 2282bf4e63cd..aefe076d00e0 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -243,8 +243,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) /* * 2. 
branch to optimized_callback() and emulate_step() */ - kprobe_lookup_name("optimized_callback", op_callback_addr); - kprobe_lookup_name("emulate_step", emulate_step_addr); + op_callback_addr = kprobe_lookup_name("optimized_callback"); + emulate_step_addr = kprobe_lookup_name("emulate_step"); if (!op_callback_addr || !emulate_step_addr) { WARN(1, "kprobe_lookup_name() failed\n"); goto error; diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index c328e4f7dcad..16f153c84646 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -379,6 +379,7 @@ static inline struct kprobe_ctlblk *get_kprobe_ctlblk(void) return this_cpu_ptr(&kprobe_ctlblk); } +kprobe_opcode_t *kprobe_lookup_name(const char *name); int register_kprobe(struct kprobe *p); void unregister_kprobe(struct kprobe *p); int register_kprobes(struct kprobe **kps, int num); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d179839e1b70..e5087e692b42 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -58,15 +58,6 @@ #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) -/* - * Some oddball architectures like 64bit powerpc have function descriptors - * so this must be overridable. - */ -#ifndef kprobe_lookup_name -#define kprobe_lookup_name(name, addr) \ - addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name))) -#endif - static int kprobes_initialized; static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; @@ -81,6 +72,11 @@ static struct { raw_spinlock_t lock ____cacheline_aligned_in_smp; } kretprobe_table_locks[KPROBE_TABLE_SIZE]; +kprobe_opcode_t * __weak kprobe_lookup_name(const char *name) +{ + return ((kprobe_opcode_t *)(kallsyms_lookup_name(name))); +} + static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) { return &(kretprobe_table_locks[hash].lock); @@ -1407,7 +1403,7 @@ static kprobe_opcode_t *kprobe_addr(struct kprobe *p) goto invalid; if (p->symbol_name) { - kprobe_lookup_name(p->symbol_name, addr); + addr = kprobe_lookup_name(p->symbol_name); if (!addr) return ERR_PTR(-ENOENT); } @@ -2199,8 +2195,8 @@ static int __init init_kprobes(void) if (kretprobe_blacklist_size) { /* lookup the function address from its name */ for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { - kprobe_lookup_name(kretprobe_blacklist[i].name, - kretprobe_blacklist[i].addr); + kretprobe_blacklist[i].addr = + kprobe_lookup_name(kretprobe_blacklist[i].name); if (!kretprobe_blacklist[i].addr) printk("kretprobe: lookup failed: %s\n", kretprobe_blacklist[i].name); -- cgit v1.2.3 From 290e3070762ac80e5fc4087d8c4de7e3f1d90aca Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Wed, 19 Apr 2017 18:21:01 +0530 Subject: powerpc/kprobes: Fix handling of function offsets on ABIv2 commit 239aeba76409 ("perf powerpc: Fix kprobe and kretprobe handling with kallsyms on ppc64le") changed how we use the offset field in struct kprobe on ABIv2. perf now offsets from the global entry point if an offset is specified and otherwise chooses the local entry point. Fix the same in kernel for kprobe API users. We do this by extending kprobe_lookup_name() to accept an additional parameter to indicate the offset specified with the kprobe registration. If offset is 0, we return the local function entry and return the global entry point otherwise. 
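The same distinction matters for in-kernel users of the kprobe API, not just for perf; a hedged sketch of such a user follows (the symbol, offset, and empty handler are illustrative only):

    #include <linux/kprobes.h>
    #include <linux/module.h>

    static int sample_pre(struct kprobe *p, struct pt_regs *regs)
    {
            return 0;       /* do nothing, let execution continue */
    }

    static struct kprobe kp = {
            .symbol_name = "_do_fork",
            .offset      = 0x10,    /* non-zero: now applied from the global entry point */
            .pre_handler = sample_pre,
    };

    static int __init sample_init(void)
    {
            return register_kprobe(&kp);    /* 0 on success */
    }

    static void __exit sample_exit(void)
    {
            unregister_kprobe(&kp);
    }

    module_init(sample_init);
    module_exit(sample_exit);
    MODULE_LICENSE("GPL");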
With: # cd /sys/kernel/debug/tracing/ # echo "p _do_fork" >> kprobe_events # echo "p _do_fork+0x10" >> kprobe_events before this patch: # cat ../kprobes/list c0000000000d0748 k _do_fork+0x8 [DISABLED] c0000000000d0758 k _do_fork+0x18 [DISABLED] c0000000000412b0 k kretprobe_trampoline+0x0 [OPTIMIZED] and after: # cat ../kprobes/list c0000000000d04c8 k _do_fork+0x8 [DISABLED] c0000000000d04d0 k _do_fork+0x10 [DISABLED] c0000000000412b0 k kretprobe_trampoline+0x0 [OPTIMIZED] Acked-by: Ananth N Mavinakayanahalli Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/kprobes.c | 4 ++-- arch/powerpc/kernel/optprobes.c | 4 ++-- include/linux/kprobes.h | 2 +- kernel/kprobes.c | 7 ++++--- 4 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index e5f518a962c6..65828fbc410d 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -42,14 +42,14 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}}; -kprobe_opcode_t *kprobe_lookup_name(const char *name) +kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset) { kprobe_opcode_t *addr; #ifdef PPC64_ELF_ABI_v2 /* PPC64 ABIv2 needs local entry point */ addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); - if (addr) + if (addr && !offset) addr = (kprobe_opcode_t *)ppc_function_entry(addr); #elif defined(PPC64_ELF_ABI_v1) /* diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index aefe076d00e0..ce81a322251c 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -243,8 +243,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) /* * 2. 
branch to optimized_callback() and emulate_step() */ - op_callback_addr = kprobe_lookup_name("optimized_callback"); - emulate_step_addr = kprobe_lookup_name("emulate_step"); + op_callback_addr = kprobe_lookup_name("optimized_callback", 0); + emulate_step_addr = kprobe_lookup_name("emulate_step", 0); if (!op_callback_addr || !emulate_step_addr) { WARN(1, "kprobe_lookup_name() failed\n"); goto error; diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 16f153c84646..1f82a3db00b1 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -379,7 +379,7 @@ static inline struct kprobe_ctlblk *get_kprobe_ctlblk(void) return this_cpu_ptr(&kprobe_ctlblk); } -kprobe_opcode_t *kprobe_lookup_name(const char *name); +kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset); int register_kprobe(struct kprobe *p); void unregister_kprobe(struct kprobe *p); int register_kprobes(struct kprobe **kps, int num); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e5087e692b42..406889889ce5 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -72,7 +72,8 @@ static struct { raw_spinlock_t lock ____cacheline_aligned_in_smp; } kretprobe_table_locks[KPROBE_TABLE_SIZE]; -kprobe_opcode_t * __weak kprobe_lookup_name(const char *name) +kprobe_opcode_t * __weak kprobe_lookup_name(const char *name, + unsigned int __unused) { return ((kprobe_opcode_t *)(kallsyms_lookup_name(name))); } @@ -1403,7 +1404,7 @@ static kprobe_opcode_t *kprobe_addr(struct kprobe *p) goto invalid; if (p->symbol_name) { - addr = kprobe_lookup_name(p->symbol_name); + addr = kprobe_lookup_name(p->symbol_name, p->offset); if (!addr) return ERR_PTR(-ENOENT); } @@ -2196,7 +2197,7 @@ static int __init init_kprobes(void) /* lookup the function address from its name */ for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { kretprobe_blacklist[i].addr = - kprobe_lookup_name(kretprobe_blacklist[i].name); + kprobe_lookup_name(kretprobe_blacklist[i].name, 0); if (!kretprobe_blacklist[i].addr) printk("kretprobe: lookup failed: %s\n", kretprobe_blacklist[i].name); -- cgit v1.2.3 From f66e225828c1b046c7db1db65b0dd2d135f6a2da Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 12 Apr 2017 13:25:58 +0100 Subject: PCI: Add BAR index argument to pci_mmap_page_range() In all cases we know which BAR it is. Passing it in means that arch code (or generic code; watch this space) won't have to go looking for it again. 
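As a hedged illustration of what the new argument buys (not lifted from any in-tree port), an architecture's implementation can now reach the right resource directly instead of searching dev->resource[] for it:

    /* Sketch only: the shape of a port's implementation after this change. */
    int pci_mmap_page_range(struct pci_dev *dev, int bar,
                            struct vm_area_struct *vma,
                            enum pci_mmap_state mmap_state, int write_combine)
    {
            struct resource *res = &dev->resource[bar];     /* known up front */

            /*
             * ... validate the vma against resource_size(res), choose
             * vma->vm_page_prot, then remap exactly as the port did before ...
             */
            return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
                                      vma->vm_end - vma->vm_start,
                                      vma->vm_page_prot);
    }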
Signed-off-by: David Woodhouse Signed-off-by: Bjorn Helgaas --- arch/arm/kernel/bios32.c | 3 ++- arch/cris/arch-v32/drivers/pci/bios.c | 3 ++- arch/ia64/pci/pci.c | 3 ++- arch/microblaze/pci/pci-common.c | 2 +- arch/mips/pci/pci.c | 3 ++- arch/mn10300/unit-asb2305/pci-asb2305.c | 3 ++- arch/parisc/kernel/pci.c | 3 ++- arch/powerpc/kernel/pci-common.c | 3 ++- arch/sh/drivers/pci/pci.c | 3 ++- arch/sparc/kernel/pci.c | 6 +++--- arch/unicore32/kernel/pci.c | 3 ++- arch/x86/pci/i386.c | 3 ++- arch/xtensa/kernel/pci.c | 3 ++- drivers/pci/pci-sysfs.c | 2 +- drivers/pci/proc.c | 2 +- include/linux/pci.h | 3 ++- 16 files changed, 30 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/bios32.c b/arch/arm/kernel/bios32.c index 2f0e07735d1d..a4fc3f46eeae 100644 --- a/arch/arm/kernel/bios32.c +++ b/arch/arm/kernel/bios32.c @@ -597,7 +597,8 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res, return start; } -int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, +int pci_mmap_page_range(struct pci_dev *dev, int bar, + struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { if (mmap_state == pci_mmap_io) diff --git a/arch/cris/arch-v32/drivers/pci/bios.c b/arch/cris/arch-v32/drivers/pci/bios.c index 212266a2c5d9..a589686d5448 100644 --- a/arch/cris/arch-v32/drivers/pci/bios.c +++ b/arch/cris/arch-v32/drivers/pci/bios.c @@ -14,7 +14,8 @@ void pcibios_set_master(struct pci_dev *dev) pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); } -int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, +int pci_mmap_page_range(struct pci_dev *dev, int bar, + struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { unsigned long prot; diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c index 8f6ac2f8ae4c..053c688b15a5 100644 --- a/arch/ia64/pci/pci.c +++ b/arch/ia64/pci/pci.c @@ -419,7 +419,8 @@ pcibios_align_resource (void *data, const struct resource *res, } int -pci_mmap_page_range (struct pci_dev *dev, struct vm_area_struct *vma, +pci_mmap_page_range (struct pci_dev *dev, int bar, + struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { unsigned long size = vma->vm_end - vma->vm_start; diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c index 13bc93242c0c..404fb38d06b7 100644 --- a/arch/microblaze/pci/pci-common.c +++ b/arch/microblaze/pci/pci-common.c @@ -278,7 +278,7 @@ pgprot_t pci_phys_mem_access_prot(struct file *file, * * Returns a negative error code on failure, zero on success. 
*/ -int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, +int pci_mmap_page_range(struct pci_dev *dev, int bar, struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { resource_size_t offset = diff --git a/arch/mips/pci/pci.c b/arch/mips/pci/pci.c index f6325fa657fb..f189502041a6 100644 --- a/arch/mips/pci/pci.c +++ b/arch/mips/pci/pci.c @@ -58,7 +58,8 @@ void pci_resource_to_user(const struct pci_dev *dev, int bar, *end = rsrc->start + size; } -int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, +int pci_mmap_page_range(struct pci_dev *dev, int bar, + struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { unsigned long prot; diff --git a/arch/mn10300/unit-asb2305/pci-asb2305.c b/arch/mn10300/unit-asb2305/pci-asb2305.c index b7ab8378964c..4abbbd54ac7d 100644 --- a/arch/mn10300/unit-asb2305/pci-asb2305.c +++ b/arch/mn10300/unit-asb2305/pci-asb2305.c @@ -211,7 +211,8 @@ void __init pcibios_resource_survey(void) pcibios_allocate_resources(1); } -int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, +int pci_mmap_page_range(struct pci_dev *dev, int bar, + struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { unsigned long prot; diff --git a/arch/parisc/kernel/pci.c b/arch/parisc/kernel/pci.c index 0903c6abd7a4..653877553acd 100644 --- a/arch/parisc/kernel/pci.c +++ b/arch/parisc/kernel/pci.c @@ -228,7 +228,8 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res, } -int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, +int pci_mmap_page_range(struct pci_dev *dev, int bar, + struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { unsigned long prot; diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index ffda24a38dda..6dda4a20de6e 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -513,7 +513,8 @@ pgprot_t pci_phys_mem_access_prot(struct file *file, * * Returns a negative error code on failure, zero on success. */ -int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, +int pci_mmap_page_range(struct pci_dev *dev, int bar, + struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { resource_size_t offset = diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c index 84563e39a5b8..c8b36b7ceb98 100644 --- a/arch/sh/drivers/pci/pci.c +++ b/arch/sh/drivers/pci/pci.c @@ -269,7 +269,8 @@ void __ref pcibios_report_status(unsigned int status_mask, int warn) } } -int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, +int pci_mmap_page_range(struct pci_dev *dev, int bar, + struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { /* diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c index 015e55a7495d..7eceaa10836f 100644 --- a/arch/sparc/kernel/pci.c +++ b/arch/sparc/kernel/pci.c @@ -862,9 +862,9 @@ static void __pci_mmap_set_pgprot(struct pci_dev *dev, struct vm_area_struct *vm * * Returns a negative error code on failure, zero on success. 
*/ -int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, - int write_combine) +int pci_mmap_page_range(struct pci_dev *dev, int bar, + struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, int write_combine) { int ret; diff --git a/arch/unicore32/kernel/pci.c b/arch/unicore32/kernel/pci.c index 62137d13c6f9..1b438857e91f 100644 --- a/arch/unicore32/kernel/pci.c +++ b/arch/unicore32/kernel/pci.c @@ -357,7 +357,8 @@ int pcibios_enable_device(struct pci_dev *dev, int mask) return 0; } -int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, +int pci_mmap_page_range(struct pci_dev *dev, int bar, + struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { unsigned long phys; diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 0a9f2caf358f..8ca5e5d9f251 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -411,7 +411,8 @@ static const struct vm_operations_struct pci_mmap_ops = { .access = generic_access_phys, }; -int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, +int pci_mmap_page_range(struct pci_dev *dev, int bar, + struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { unsigned long prot; diff --git a/arch/xtensa/kernel/pci.c b/arch/xtensa/kernel/pci.c index 5b73fc2f076c..903963ee495d 100644 --- a/arch/xtensa/kernel/pci.c +++ b/arch/xtensa/kernel/pci.c @@ -343,7 +343,8 @@ __pci_mmap_make_offset(struct pci_dev *dev, struct vm_area_struct *vma, * * Returns a negative error code on failure, zero on success. */ -int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, +int pci_mmap_page_range(struct pci_dev *dev, int bar, + struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 534844df8bf5..bfd9efe637c4 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1041,7 +1041,7 @@ static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, pci_resource_to_user(pdev, bar, res, &start, &end); vma->vm_pgoff += start >> PAGE_SHIFT; mmap_type = res->flags & IORESOURCE_MEM ? pci_mmap_mem : pci_mmap_io; - return pci_mmap_page_range(pdev, vma, mmap_type, write_combine); + return pci_mmap_page_range(pdev, bar, vma, mmap_type, write_combine); } static int pci_mmap_resource_uc(struct file *filp, struct kobject *kobj, diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c index 45e5cf7e9193..098360d7ff81 100644 --- a/drivers/pci/proc.c +++ b/drivers/pci/proc.c @@ -262,7 +262,7 @@ static int proc_bus_pci_mmap(struct file *file, struct vm_area_struct *vma) else return -EINVAL; } - ret = pci_mmap_page_range(dev, vma, + ret = pci_mmap_page_range(dev, i, vma, fpriv->mmap_state, write_combine); if (ret < 0) return ret; diff --git a/include/linux/pci.h b/include/linux/pci.h index 590cfcf6acf5..7173a677d6dd 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1630,7 +1630,8 @@ static inline int pci_get_new_domain_nr(void) { return -ENOSYS; } * Architectures provide this function if they set HAVE_PCI_MMAP, and * it accepts the 'write_combine' argument when arch_can_pci_mmap_wc() * evaluates to nonzero. 
*/ -int pci_mmap_page_range(struct pci_dev *pdev, struct vm_area_struct *vma, +int pci_mmap_page_range(struct pci_dev *pdev, int bar, + struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine); #ifndef arch_can_pci_mmap_wc -- cgit v1.2.3 From f719582435afe9c7985206e42d804ea6aa315d33 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 12 Apr 2017 13:25:59 +0100 Subject: PCI: Add pci_mmap_resource_range() and use it for ARM64 Starting to leave behind the legacy of the pci_mmap_page_range() interface which takes "user-visible" BAR addresses. This takes just the resource and offset. For now, both APIs coexist and depending on the platform, one is implemented as a wrapper around the other. Signed-off-by: David Woodhouse Signed-off-by: Bjorn Helgaas --- Documentation/filesystems/sysfs-pci.txt | 10 ++-- arch/arm64/include/asm/pci.h | 2 + drivers/pci/Makefile | 2 +- drivers/pci/mmap.c | 95 +++++++++++++++++++++++++++++++++ drivers/pci/pci-sysfs.c | 13 ++--- drivers/pci/pci.h | 4 +- include/linux/pci.h | 19 +++++-- 7 files changed, 125 insertions(+), 20 deletions(-) create mode 100644 drivers/pci/mmap.c (limited to 'include/linux') diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt index 46b95d82c4fd..06f1d64c6f70 100644 --- a/Documentation/filesystems/sysfs-pci.txt +++ b/Documentation/filesystems/sysfs-pci.txt @@ -113,9 +113,13 @@ Supporting PCI access on new platforms -------------------------------------- In order to support PCI resource mapping as described above, Linux platform -code must define HAVE_PCI_MMAP and provide a pci_mmap_page_range function. -Platforms are free to only support subsets of the mmap functionality, but -useful return codes should be provided. +code should ideally define ARCH_GENERIC_PCI_MMAP_RESOURCE and use the generic +implementation of that functionality. To support the historical interface of +mmap() through files in /proc/bus/pci, platforms may also set HAVE_PCI_MMAP. + +Alternatively, platforms which set HAVE_PCI_MMAP may provide their own +implementation of pci_mmap_page_range() instead of defining +ARCH_GENERIC_PCI_MMAP_RESOURCE. Platforms which support write-combining maps of PCI resources must define arch_can_pci_mmap_wc() which shall evaluate to non-zero at runtime when diff --git a/arch/arm64/include/asm/pci.h b/arch/arm64/include/asm/pci.h index b9a7ba9ca44c..1fc19744ffe9 100644 --- a/arch/arm64/include/asm/pci.h +++ b/arch/arm64/include/asm/pci.h @@ -22,6 +22,8 @@ */ #define PCI_DMA_BUS_IS_PHYS (0) +#define ARCH_GENERIC_PCI_MMAP_RESOURCE 1 + extern int isa_dma_bridge_buggy; #ifdef CONFIG_PCI diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile index 8db5079f09a7..3d40e415a171 100644 --- a/drivers/pci/Makefile +++ b/drivers/pci/Makefile @@ -4,7 +4,7 @@ obj-y += access.o bus.o probe.o host-bridge.o remove.o pci.o \ pci-driver.o search.o pci-sysfs.o rom.o setup-res.o \ - irq.o vpd.o setup-bus.o vc.o + irq.o vpd.o setup-bus.o vc.o mmap.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_SYSFS) += slot.o diff --git a/drivers/pci/mmap.c b/drivers/pci/mmap.c new file mode 100644 index 000000000000..b7aca9c89c31 --- /dev/null +++ b/drivers/pci/mmap.c @@ -0,0 +1,95 @@ +/* + * mmap.c — generic PCI resource mmap helper + * + * Copyright © 2017 Amazon.com, Inc. or its affiliates. 
+ * + * Author: David Woodhouse + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/pci.h> + +#ifdef ARCH_GENERIC_PCI_MMAP_RESOURCE + +/* + * Modern setup: generic pci_mmap_resource_range(), and implement the legacy + * pci_mmap_page_range() (if needed) as a wrapper round it. + */ + +#ifdef HAVE_PCI_MMAP +int pci_mmap_page_range(struct pci_dev *pdev, int bar, + struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, int write_combine) +{ + resource_size_t start, end; + + pci_resource_to_user(pdev, bar, &pdev->resource[bar], &start, &end); + + /* Adjust vm_pgoff to be the offset within the resource */ + vma->vm_pgoff -= start >> PAGE_SHIFT; + return pci_mmap_resource_range(pdev, bar, vma, mmap_state, + write_combine); +} +#endif + +static const struct vm_operations_struct pci_phys_vm_ops = { +#ifdef CONFIG_HAVE_IOREMAP_PROT + .access = generic_access_phys, +#endif +}; + +int pci_mmap_resource_range(struct pci_dev *pdev, int bar, + struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, int write_combine) +{ + unsigned long size; + + if (mmap_state == pci_mmap_io) + return -EINVAL; + + size = ((pci_resource_len(pdev, bar) - 1) >> PAGE_SHIFT) + 1; + if (vma->vm_pgoff + vma_pages(vma) > size) + return -EINVAL; + + if (write_combine) + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + else + vma->vm_page_prot = pgprot_device(vma->vm_page_prot); + + vma->vm_pgoff += (pci_resource_start(pdev, bar) >> PAGE_SHIFT); + vma->vm_ops = &pci_phys_vm_ops; + + return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); +} + +#elif defined(HAVE_PCI_MMAP) /* && !ARCH_GENERIC_PCI_MMAP_RESOURCE */ + +/* + * Legacy setup: Implement pci_mmap_resource_range() as a wrapper around + * the architecture's pci_mmap_page_range(), converting to "user visible" + * addresses as necessary. + */ + +int pci_mmap_resource_range(struct pci_dev *pdev, int bar, + struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, int write_combine) +{ + resource_size_t start, end; + + /* + * pci_mmap_page_range() expects the same kind of entry as coming + * from /proc/bus/pci/ which is a "user visible" value. If this is + * different from the resource itself, arch will do necessary fixup.
+ */ + pci_resource_to_user(pdev, bar, &pdev->resource[bar], &start, &end); + vma->vm_pgoff += start >> PAGE_SHIFT; + return pci_mmap_page_range(pdev, bar, vma, mmap_state, write_combine); +} +#endif diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index bfd9efe637c4..10feb98a2b1d 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -980,7 +980,7 @@ void pci_remove_legacy_files(struct pci_bus *b) } #endif /* HAVE_PCI_LEGACY */ -#ifdef HAVE_PCI_MMAP +#if defined(HAVE_PCI_MMAP) || defined(ARCH_GENERIC_PCI_MMAP_RESOURCE) int pci_mmap_fits(struct pci_dev *pdev, int resno, struct vm_area_struct *vma, enum pci_mmap_api mmap_api) @@ -1019,7 +1019,6 @@ static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); int bar = (unsigned long)attr->private; enum pci_mmap_state mmap_type; - resource_size_t start, end; struct resource *res = &pdev->resource[bar]; if (res->flags & IORESOURCE_MEM && iomem_is_exclusive(res->start)) @@ -1033,15 +1032,9 @@ static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, (u64)pci_resource_len(pdev, bar)); return -EINVAL; } - - /* pci_mmap_page_range() expects the same kind of entry as coming - * from /proc/bus/pci/ which is a "user visible" value. If this is - * different from the resource itself, arch will do necessary fixup. - */ - pci_resource_to_user(pdev, bar, res, &start, &end); - vma->vm_pgoff += start >> PAGE_SHIFT; mmap_type = res->flags & IORESOURCE_MEM ? pci_mmap_mem : pci_mmap_io; - return pci_mmap_page_range(pdev, bar, vma, mmap_type, write_combine); + + return pci_mmap_resource_range(pdev, bar, vma, mmap_type, write_combine); } static int pci_mmap_resource_uc(struct file *filp, struct kobject *kobj, diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 8dd38e69d6f2..8e5ca2dec7e7 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -21,14 +21,14 @@ void pci_create_firmware_label_files(struct pci_dev *pdev); void pci_remove_firmware_label_files(struct pci_dev *pdev); #endif void pci_cleanup_rom(struct pci_dev *dev); -#ifdef HAVE_PCI_MMAP + enum pci_mmap_api { PCI_MMAP_SYSFS, /* mmap on /sys/bus/pci/devices/<BDF>/resource<N> */ PCI_MMAP_PROCFS /* mmap on /proc/bus/pci/<BDF> */ }; int pci_mmap_fits(struct pci_dev *pdev, int resno, struct vm_area_struct *vmai, enum pci_mmap_api mmap_api); -#endif + int pci_probe_reset_function(struct pci_dev *dev); /** diff --git a/include/linux/pci.h b/include/linux/pci.h index 7173a677d6dd..98a72abcf361 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1626,10 +1626,21 @@ static inline int pci_get_new_domain_nr(void) { return -ENOSYS; } #include <asm/pci.h> -/* Map a range of PCI memory or I/O space for a device into user space. - * Architectures provide this function if they set HAVE_PCI_MMAP, and - * it accepts the 'write_combine' argument when arch_can_pci_mmap_wc() - * evaluates to nonzero. */ +/* These two functions provide almost identical functionality. Depending + * on the architecture, one will be implemented as a wrapper around the + * other (in drivers/pci/mmap.c). + * + * pci_mmap_resource_range() maps a specific BAR, and vm->vm_pgoff + * is expected to be an offset within that region. + * + * pci_mmap_page_range() is the legacy architecture-specific interface, + * which accepts a "user visible" resource address converted by + * pci_resource_to_user(), as used in the legacy mmap() interface in + * /proc/bus/pci/.
*/ +int pci_mmap_resource_range(struct pci_dev *dev, int bar, + struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, int write_combine); int pci_mmap_page_range(struct pci_dev *pdev, int bar, struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine); -- cgit v1.2.3 From 2bea36fd1af440e1853e431459a0bf929266cd52 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 12 Apr 2017 13:26:08 +0100 Subject: PCI: Add I/O BAR support to generic pci_mmap_resource_range() This will need to call into an arch-provided pci_iobar_pfn() function. Signed-off-by: David Woodhouse Signed-off-by: Bjorn Helgaas --- drivers/pci/mmap.c | 12 ++++++++---- include/linux/pci.h | 4 ++++ 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/mmap.c b/drivers/pci/mmap.c index b7aca9c89c31..9a5e5a9055eb 100644 --- a/drivers/pci/mmap.c +++ b/drivers/pci/mmap.c @@ -48,9 +48,7 @@ int pci_mmap_resource_range(struct pci_dev *pdev, int bar, enum pci_mmap_state mmap_state, int write_combine) { unsigned long size; - - if (mmap_state == pci_mmap_io) - return -EINVAL; + int ret; size = ((pci_resource_len(pdev, bar) - 1) >> PAGE_SHIFT) + 1; if (vma->vm_pgoff + vma_pages(vma) > size) @@ -61,7 +59,13 @@ int pci_mmap_resource_range(struct pci_dev *pdev, int bar, else vma->vm_page_prot = pgprot_device(vma->vm_page_prot); - vma->vm_pgoff += (pci_resource_start(pdev, bar) >> PAGE_SHIFT); + if (mmap_state == pci_mmap_io) { + ret = pci_iobar_pfn(pdev, bar, vma); + if (ret) + return ret; + } else + vma->vm_pgoff += (pci_resource_start(pdev, bar) >> PAGE_SHIFT); + vma->vm_ops = &pci_phys_vm_ops; return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, diff --git a/include/linux/pci.h b/include/linux/pci.h index 98a72abcf361..9f302444d4ac 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1648,8 +1648,12 @@ int pci_mmap_page_range(struct pci_dev *pdev, int bar, #ifndef arch_can_pci_mmap_wc #define arch_can_pci_mmap_wc() 0 #endif + #ifndef arch_can_pci_mmap_io #define arch_can_pci_mmap_io() 0 +#define pci_iobar_pfn(pdev, bar, vma) (-EINVAL) +#else +int pci_iobar_pfn(struct pci_dev *pdev, int bar, struct vm_area_struct *vma); #endif #ifndef pci_root_bus_fwnode -- cgit v1.2.3 From a60a2b73ba69abca26653fff157b0fd8947bc498 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Apr 2017 21:11:25 +0200 Subject: PCI: Export pcie_flr() Currently we opencode the FLR sequence in lots of places; export a core helper instead. We split out the probing for FLR support as all the non-core callers already know their hardware. Note that in the new pcie_has_flr() function the quirk check has been moved before the capability check as there is no point in reading the capability in this case. Signed-off-by: Christoph Hellwig Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 39 ++++++++++++++++++++++++++++----------- include/linux/pci.h | 1 + 2 files changed, 29 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index bef14777bb30..957a11a6a840 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -3773,27 +3773,41 @@ static void pci_flr_wait(struct pci_dev *dev) (i - 1) * 100); } -static int pcie_flr(struct pci_dev *dev, int probe) +/** + * pcie_has_flr - check if a device supports function level resets + * @dev: device to check + * + * Returns true if the device advertises support for PCIe function level + * resets.
+ */ +static bool pcie_has_flr(struct pci_dev *dev) { u32 cap; - pcie_capability_read_dword(dev, PCI_EXP_DEVCAP, &cap); - if (!(cap & PCI_EXP_DEVCAP_FLR)) - return -ENOTTY; - if (dev->dev_flags & PCI_DEV_FLAGS_NO_FLR_RESET) - return -ENOTTY; + return false; - if (probe) - return 0; + pcie_capability_read_dword(dev, PCI_EXP_DEVCAP, &cap); + return cap & PCI_EXP_DEVCAP_FLR; +} +/** + * pcie_flr - initiate a PCIe function level reset + * @dev: device to reset + * + * Initiate a function level reset on @dev. The caller should ensure the + * device supports FLR before calling this function, e.g. by using the + * pcie_has_flr() helper. + */ +void pcie_flr(struct pci_dev *dev) +{ if (!pci_wait_for_pending_transaction(dev)) dev_err(&dev->dev, "timed out waiting for pending transaction; performing function level reset anyway\n"); pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_BCR_FLR); pci_flr_wait(dev); - return 0; } +EXPORT_SYMBOL_GPL(pcie_flr); static int pci_af_flr(struct pci_dev *dev, int probe) { @@ -3977,9 +3991,12 @@ static int __pci_dev_reset(struct pci_dev *dev, int probe) if (rc != -ENOTTY) goto done; - rc = pcie_flr(dev, probe); - if (rc != -ENOTTY) + if (pcie_has_flr(dev)) { + if (!probe) + pcie_flr(dev); + rc = 0; goto done; + } rc = pci_af_flr(dev, probe); if (rc != -ENOTTY) diff --git a/include/linux/pci.h b/include/linux/pci.h index 22cad2c66d59..a9ff99c91601 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1054,6 +1054,7 @@ int pcie_get_mps(struct pci_dev *dev); int pcie_set_mps(struct pci_dev *dev, int mps); int pcie_get_minimum_link(struct pci_dev *dev, enum pci_bus_speed *speed, enum pcie_link_width *width); +void pcie_flr(struct pci_dev *dev); int __pci_reset_function(struct pci_dev *dev); int __pci_reset_function_locked(struct pci_dev *dev); int pci_reset_function(struct pci_dev *dev); -- cgit v1.2.3 From 3f1866779cf8338e1c8bd32e5f6f5424795ef191 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Mon, 10 Apr 2017 16:50:58 +0530 Subject: of: dma: Make of_dma_deconfigure() public As part of moving DMA initializing to probe time the of_dma_deconfigure() function will need to be called from different source files. Make it public and move it to drivers/of/device.c where the of_dma_configure() function is. Tested-by: Marek Szyprowski Reviewed-by: Robin Murphy Acked-by: Rob Herring Signed-off-by: Laurent Pinchart Signed-off-by: Joerg Roedel --- drivers/of/device.c | 12 ++++++++++++ drivers/of/platform.c | 5 ----- include/linux/of_device.h | 3 +++ 3 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/device.c b/drivers/of/device.c index b1e6bebda3f3..0d378c03e1a4 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -151,6 +151,18 @@ void of_dma_configure(struct device *dev, struct device_node *np) } EXPORT_SYMBOL_GPL(of_dma_configure); +/** + * of_dma_deconfigure - Clean up DMA configuration + * @dev: Device for which to clean up DMA configuration + * + * Clean up all configuration performed by of_dma_configure_ops() and free all + * resources that have been allocated. 
+ */ +void of_dma_deconfigure(struct device *dev) +{ + arch_teardown_dma_ops(dev); +} + int of_device_register(struct platform_device *pdev) { device_initialize(&pdev->dev); diff --git a/drivers/of/platform.c b/drivers/of/platform.c index 5dfcc967dd05..5344db50aa65 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -158,11 +158,6 @@ struct platform_device *of_device_alloc(struct device_node *np, } EXPORT_SYMBOL(of_device_alloc); -static void of_dma_deconfigure(struct device *dev) -{ - arch_teardown_dma_ops(dev); -} - /** * of_platform_device_create_pdata - Alloc, initialize and register an of_device * @np: pointer to node to create device for diff --git a/include/linux/of_device.h b/include/linux/of_device.h index c12dace043f3..af984551cc2b 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -56,6 +56,7 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) } void of_dma_configure(struct device *dev, struct device_node *np); +void of_dma_deconfigure(struct device *dev); #else /* CONFIG_OF */ static inline int of_driver_match_device(struct device *dev, @@ -105,6 +106,8 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) } static inline void of_dma_configure(struct device *dev, struct device_node *np) {} +static inline void of_dma_deconfigure(struct device *dev) +{} #endif /* CONFIG_OF */ #endif /* _LINUX_OF_DEVICE_H */ -- cgit v1.2.3 From 09515ef5ddad71c7820e5e428da418b709feeb26 Mon Sep 17 00:00:00 2001 From: Sricharan R Date: Mon, 10 Apr 2017 16:51:01 +0530 Subject: of/acpi: Configure dma operations at probe time for platform/amba/pci bus devices Configuring DMA ops at probe time will allow deferring device probe when the IOMMU isn't available yet. The dma_configure for the device is now called from the generic device_attach callback just before the bus/driver probe is called. This way, configuring the DMA ops for the device would be called at the same place for all bus_types, hence the deferred probing mechanism should work for all buses as well. pci_bus_add_devices (platform/amba)(_device_create/driver_register) | | pci_bus_add_device (device_add/driver_register) | | device_attach device_initial_probe | | __device_attach_driver __device_attach_driver | driver_probe_device | really_probe | dma_configure Similarly on the device/driver_unregister path __device_release_driver is called which inturn calls dma_deconfigure. This patch changes the dma ops configuration to probe time for both OF and ACPI based platform/amba/pci bus devices. Tested-by: Marek Szyprowski Tested-by: Hanjun Guo Reviewed-by: Robin Murphy Acked-by: Rob Herring Acked-by: Bjorn Helgaas (drivers/pci part) Acked-by: Rafael J. 
Wysocki Signed-off-by: Sricharan R Signed-off-by: Joerg Roedel --- drivers/acpi/glue.c | 5 ----- drivers/base/dd.c | 9 +++++++++ drivers/base/dma-mapping.c | 40 ++++++++++++++++++++++++++++++++++++++++ drivers/of/platform.c | 5 +---- drivers/pci/probe.c | 28 ---------------------------- include/linux/dma-mapping.h | 12 ++++++++++++ 6 files changed, 62 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/glue.c b/drivers/acpi/glue.c index fb19e1cdb641..c05f24107bfc 100644 --- a/drivers/acpi/glue.c +++ b/drivers/acpi/glue.c @@ -176,7 +176,6 @@ int acpi_bind_one(struct device *dev, struct acpi_device *acpi_dev) struct list_head *physnode_list; unsigned int node_id; int retval = -EINVAL; - enum dev_dma_attr attr; if (has_acpi_companion(dev)) { if (acpi_dev) { @@ -233,10 +232,6 @@ int acpi_bind_one(struct device *dev, struct acpi_device *acpi_dev) if (!has_acpi_companion(dev)) ACPI_COMPANION_SET(dev, acpi_dev); - attr = acpi_get_dma_attr(acpi_dev); - if (attr != DEV_DMA_NOT_SUPPORTED) - acpi_dma_configure(dev, attr); - acpi_physnode_link_name(physical_node_name, node_id); retval = sysfs_create_link(&acpi_dev->dev.kobj, &dev->kobj, physical_node_name); diff --git a/drivers/base/dd.c b/drivers/base/dd.c index a1fbf55c4d3a..4882f06d12df 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -356,6 +357,10 @@ re_probe: if (ret) goto pinctrl_bind_failed; + ret = dma_configure(dev); + if (ret) + goto dma_failed; + if (driver_sysfs_add(dev)) { printk(KERN_ERR "%s: driver_sysfs_add(%s) failed\n", __func__, dev_name(dev)); @@ -417,6 +422,8 @@ re_probe: goto done; probe_failed: + dma_deconfigure(dev); +dma_failed: if (dev->bus) blocking_notifier_call_chain(&dev->bus->p->bus_notifier, BUS_NOTIFY_DRIVER_NOT_BOUND, dev); @@ -826,6 +833,8 @@ static void __device_release_driver(struct device *dev, struct device *parent) drv->remove(dev); device_links_driver_cleanup(dev); + dma_deconfigure(dev); + devres_release_all(dev); dev->driver = NULL; dev_set_drvdata(dev, NULL); diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c index efd71cf4fdea..449b948c7427 100644 --- a/drivers/base/dma-mapping.c +++ b/drivers/base/dma-mapping.c @@ -7,9 +7,11 @@ * This file is released under the GPLv2. 
*/ +#include #include #include #include +#include #include #include @@ -341,3 +343,41 @@ void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags) vunmap(cpu_addr); } #endif + +/* + * Common configuration to enable DMA API use for a device + */ +#include + +int dma_configure(struct device *dev) +{ + struct device *bridge = NULL, *dma_dev = dev; + enum dev_dma_attr attr; + + if (dev_is_pci(dev)) { + bridge = pci_get_host_bridge_device(to_pci_dev(dev)); + dma_dev = bridge; + if (IS_ENABLED(CONFIG_OF) && dma_dev->parent && + dma_dev->parent->of_node) + dma_dev = dma_dev->parent; + } + + if (dma_dev->of_node) { + of_dma_configure(dev, dma_dev->of_node); + } else if (has_acpi_companion(dma_dev)) { + attr = acpi_get_dma_attr(to_acpi_device_node(dma_dev->fwnode)); + if (attr != DEV_DMA_NOT_SUPPORTED) + acpi_dma_configure(dev, attr); + } + + if (bridge) + pci_put_host_bridge_device(bridge); + + return 0; +} + +void dma_deconfigure(struct device *dev) +{ + of_dma_deconfigure(dev); + acpi_dma_deconfigure(dev); +} diff --git a/drivers/of/platform.c b/drivers/of/platform.c index 5344db50aa65..2aa4ebbde9cd 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -186,11 +187,9 @@ static struct platform_device *of_platform_device_create_pdata( dev->dev.bus = &platform_bus_type; dev->dev.platform_data = platform_data; - of_dma_configure(&dev->dev, dev->dev.of_node); of_msi_configure(&dev->dev, dev->dev.of_node); if (of_device_add(dev) != 0) { - of_dma_deconfigure(&dev->dev); platform_device_put(dev); goto err_clear_flag; } @@ -248,7 +247,6 @@ static struct amba_device *of_amba_device_create(struct device_node *node, dev_set_name(&dev->dev, "%s", bus_id); else of_device_make_bus_id(&dev->dev); - of_dma_configure(&dev->dev, dev->dev.of_node); /* Allow the HW Peripheral ID to be overridden */ prop = of_get_property(node, "arm,primecell-periphid", NULL); @@ -542,7 +540,6 @@ static int of_platform_device_destroy(struct device *dev, void *data) amba_device_unregister(to_amba_device(dev)); #endif - of_dma_deconfigure(dev); of_node_clear_flag(dev->of_node, OF_POPULATED); of_node_clear_flag(dev->of_node, OF_POPULATED_BUS); return 0; diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index dfc9a2794141..5a8dd43db336 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1893,33 +1893,6 @@ static void pci_set_msi_domain(struct pci_dev *dev) dev_set_msi_domain(&dev->dev, d); } -/** - * pci_dma_configure - Setup DMA configuration - * @dev: ptr to pci_dev struct of the PCI device - * - * Function to update PCI devices's DMA configuration using the same - * info from the OF node or ACPI node of host bridge's parent (if any). 
- */ -static void pci_dma_configure(struct pci_dev *dev) -{ - struct device *bridge = pci_get_host_bridge_device(dev); - - if (IS_ENABLED(CONFIG_OF) && - bridge->parent && bridge->parent->of_node) { - of_dma_configure(&dev->dev, bridge->parent->of_node); - } else if (has_acpi_companion(bridge)) { - struct acpi_device *adev = to_acpi_device_node(bridge->fwnode); - enum dev_dma_attr attr = acpi_get_dma_attr(adev); - - if (attr == DEV_DMA_NOT_SUPPORTED) - dev_warn(&dev->dev, "DMA not supported.\n"); - else - acpi_dma_configure(&dev->dev, attr); - } - - pci_put_host_bridge_device(bridge); -} - void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) { int ret; @@ -1933,7 +1906,6 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) dev->dev.dma_mask = &dev->dma_mask; dev->dev.dma_parms = &dev->dma_parms; dev->dev.coherent_dma_mask = 0xffffffffull; - pci_dma_configure(dev); pci_set_dma_max_seg_size(dev, 65536); pci_set_dma_seg_boundary(dev, 0xffffffff); diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 0977317c6835..4f3eecedca2d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -728,6 +728,18 @@ dma_mark_declared_memory_occupied(struct device *dev, } #endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ +#ifdef CONFIG_HAS_DMA +int dma_configure(struct device *dev); +void dma_deconfigure(struct device *dev); +#else +static inline int dma_configure(struct device *dev) +{ + return 0; +} + +static inline void dma_deconfigure(struct device *dev) {} +#endif + /* * Managed DMA API */ -- cgit v1.2.3 From 7b07cbefb68d486febf47e13b570fed53d9296b4 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Mon, 10 Apr 2017 16:51:02 +0530 Subject: iommu: of: Handle IOMMU lookup failure with deferred probing or error Failures to look up an IOMMU when parsing the DT iommus property need to be handled separately from the .of_xlate() failures to support deferred probing. The lack of a registered IOMMU can be caused by the lack of a driver for the IOMMU, the IOMMU device probe not having been performed yet, having been deferred, or having failed. The first case occurs when the device tree describes the bus master and IOMMU topology correctly but no device driver exists for the IOMMU yet or the device driver has not been compiled in. Return NULL, the caller will configure the device without an IOMMU. The second and third cases are handled by deferring the probe of the bus master device which will eventually get reprobed after the IOMMU. The last case is currently handled by deferring the probe of the bus master device as well. A mechanism to either configure the bus master device without an IOMMU or to fail the bus master device probe depending on whether the IOMMU is optional or mandatory would be a good enhancement. 
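The resulting contract for callers can be condensed into a short sketch (distilled from the of_dma_configure() hunk below; not a complete caller):

    const struct iommu_ops *iommu = of_iommu_configure(dev, np);

    if (IS_ERR(iommu))              /* IOMMU present but not ready: typically
                                       -EPROBE_DEFER, propagated so the bus
                                       master's probe is retried later */
            return PTR_ERR(iommu);

    /* iommu == NULL: no IOMMU (or no driver for it) - configure without one */
    arch_setup_dma_ops(dev, dma_addr, size, iommu, coherent);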
Tested-by: Marek Szyprowski Reviewed-by: Robin Murphy Acked-by: Rob Herring Signed-off-by: Laurent Pinchart Signed-off-by: Sricharan R Signed-off-by: Joerg Roedel --- drivers/base/dma-mapping.c | 5 +++-- drivers/iommu/of_iommu.c | 4 ++-- drivers/of/device.c | 9 +++++++-- include/linux/of_device.h | 9 ++++++--- 4 files changed, 18 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c index 449b948c7427..82bd45ced7ff 100644 --- a/drivers/base/dma-mapping.c +++ b/drivers/base/dma-mapping.c @@ -353,6 +353,7 @@ int dma_configure(struct device *dev) { struct device *bridge = NULL, *dma_dev = dev; enum dev_dma_attr attr; + int ret = 0; if (dev_is_pci(dev)) { bridge = pci_get_host_bridge_device(to_pci_dev(dev)); @@ -363,7 +364,7 @@ int dma_configure(struct device *dev) } if (dma_dev->of_node) { - of_dma_configure(dev, dma_dev->of_node); + ret = of_dma_configure(dev, dma_dev->of_node); } else if (has_acpi_companion(dma_dev)) { attr = acpi_get_dma_attr(to_acpi_device_node(dma_dev->fwnode)); if (attr != DEV_DMA_NOT_SUPPORTED) @@ -373,7 +374,7 @@ int dma_configure(struct device *dev) if (bridge) pci_put_host_bridge_device(bridge); - return 0; + return ret; } void dma_deconfigure(struct device *dev) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index c8be889f5206..9f44ee8ea1bc 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -236,7 +236,7 @@ const struct iommu_ops *of_iommu_configure(struct device *dev, ops = ERR_PTR(err); } - return IS_ERR(ops) ? NULL : ops; + return ops; } static int __init of_iommu_init(void) @@ -247,7 +247,7 @@ static int __init of_iommu_init(void) for_each_matching_node_and_match(np, matches, &match) { const of_iommu_init_fn init_fn = match->data; - if (init_fn(np)) + if (init_fn && init_fn(np)) pr_err("Failed to initialise IOMMU %s\n", of_node_full_name(np)); } diff --git a/drivers/of/device.c b/drivers/of/device.c index e1ae9e7104a1..8bd3d8c09435 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -82,7 +82,7 @@ int of_device_add(struct platform_device *ofdev) * can use a platform bus notifier and handle BUS_NOTIFY_ADD_DEVICE events * to fix up DMA configuration. */ -void of_dma_configure(struct device *dev, struct device_node *np) +int of_dma_configure(struct device *dev, struct device_node *np) { u64 dma_addr, paddr, size; int ret; @@ -123,7 +123,7 @@ void of_dma_configure(struct device *dev, struct device_node *np) if (!size) { dev_err(dev, "Adjusted size 0x%llx invalid\n", size); - return; + return -EINVAL; } dev_dbg(dev, "dma_pfn_offset(%#08lx)\n", offset); } @@ -144,10 +144,15 @@ void of_dma_configure(struct device *dev, struct device_node *np) coherent ? " " : " not "); iommu = of_iommu_configure(dev, np); + if (IS_ERR(iommu)) + return PTR_ERR(iommu); + dev_dbg(dev, "device is%sbehind an iommu\n", iommu ?
" " : " not "); arch_setup_dma_ops(dev, dma_addr, size, iommu, coherent); + + return 0; } EXPORT_SYMBOL_GPL(of_dma_configure); diff --git a/include/linux/of_device.h b/include/linux/of_device.h index af984551cc2b..2cacdd81062e 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -55,7 +55,7 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) return of_node_get(cpu_dev->of_node); } -void of_dma_configure(struct device *dev, struct device_node *np); +int of_dma_configure(struct device *dev, struct device_node *np); void of_dma_deconfigure(struct device *dev); #else /* CONFIG_OF */ @@ -104,8 +104,11 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) { return NULL; } -static inline void of_dma_configure(struct device *dev, struct device_node *np) -{} + +static inline int of_dma_configure(struct device *dev, struct device_node *np) +{ + return 0; +} static inline void of_dma_deconfigure(struct device *dev) {} #endif /* CONFIG_OF */ -- cgit v1.2.3 From 5a1bb638d5677053c7addcb228b56da6fccb5d68 Mon Sep 17 00:00:00 2001 From: Sricharan R Date: Mon, 10 Apr 2017 16:51:03 +0530 Subject: drivers: acpi: Handle IOMMU lookup failure with deferred probing or error This is an equivalent to the DT's handling of the iommu master's probe with deferred probing when the corrsponding iommu is not probed yet. The lack of a registered IOMMU can be caused by the lack of a driver for the IOMMU, the IOMMU device probe not having been performed yet, having been deferred, or having failed. The first case occurs when the firmware describes the bus master and IOMMU topology correctly but no device driver exists for the IOMMU yet or the device driver has not been compiled in. Return NULL, the caller will configure the device without an IOMMU. The second and third cases are handled by deferring the probe of the bus master device which will eventually get reprobed after the IOMMU. The last case is currently handled by deferring the probe of the bus master device as well. A mechanism to either configure the bus master device without an IOMMU or to fail the bus master device probe depending on whether the IOMMU is optional or mandatory would be a good enhancement. Tested-by: Hanjun Guo Reviewed-by: Robin Murphy [Lorenzo: Added fixes for dma_coherent_mask overflow, acpi_dma_configure called multiple times for same device] Signed-off-by: Lorenzo Pieralisi Signed-off-by: Sricharan R Signed-off-by: Joerg Roedel --- drivers/acpi/arm64/iort.c | 33 ++++++++++++++++++++++++++++++++- drivers/acpi/scan.c | 11 ++++++++--- drivers/base/dma-mapping.c | 2 +- include/acpi/acpi_bus.h | 2 +- include/linux/acpi.h | 7 +++++-- 5 files changed, 47 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 3dd9ec372dae..e323ece0314d 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -543,6 +543,14 @@ static const struct iommu_ops *iort_iommu_xlate(struct device *dev, const struct iommu_ops *ops = NULL; int ret = -ENODEV; struct fwnode_handle *iort_fwnode; + struct iommu_fwspec *fwspec = dev->iommu_fwspec; + + /* + * If we already translated the fwspec there + * is nothing left to do, return the iommu_ops. 
+ */ + if (fwspec && fwspec->ops) + return fwspec->ops; if (node) { iort_fwnode = iort_get_fwnode(node); @@ -550,8 +558,17 @@ static const struct iommu_ops *iort_iommu_xlate(struct device *dev, return NULL; ops = iommu_ops_from_fwnode(iort_fwnode); + /* + * If the ops look-up fails, this means that either + * the SMMU drivers have not been probed yet or that + * the SMMU drivers are not built in the kernel; + * Depending on whether the SMMU drivers are built-in + * in the kernel or not, defer the IOMMU configuration + * or just abort it. + */ if (!ops) - return NULL; + return iort_iommu_driver_enabled(node->type) ? + ERR_PTR(-EPROBE_DEFER) : NULL; ret = arm_smmu_iort_xlate(dev, streamid, iort_fwnode, ops); } @@ -625,12 +642,26 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev) while (parent) { ops = iort_iommu_xlate(dev, parent, streamid); + if (IS_ERR_OR_NULL(ops)) + return ops; parent = iort_node_get_id(node, &streamid, IORT_IOMMU_TYPE, i++); } } + /* + * If we have reason to believe the IOMMU driver missed the initial + * add_device callback for dev, replay it to get things in order. + */ + if (!IS_ERR_OR_NULL(ops) && ops->add_device && + dev->bus && !dev->iommu_group) { + int err = ops->add_device(dev); + + if (err) + ops = ERR_PTR(err); + } + return ops; } diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 192691880d55..2a513cce332e 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1373,20 +1373,25 @@ enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev) * @dev: The pointer to the device * @attr: device dma attributes */ -void acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) +int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) { const struct iommu_ops *iommu; + u64 size; iort_set_dma_mask(dev); iommu = iort_iommu_configure(dev); + if (IS_ERR(iommu)) + return PTR_ERR(iommu); + size = max(dev->coherent_dma_mask, dev->coherent_dma_mask + 1); /* * Assume dma valid range starts at 0 and covers the whole * coherent_dma_mask. 
*/ - arch_setup_dma_ops(dev, 0, dev->coherent_dma_mask + 1, iommu, - attr == DEV_DMA_COHERENT); + arch_setup_dma_ops(dev, 0, size, iommu, attr == DEV_DMA_COHERENT); + + return 0; } EXPORT_SYMBOL_GPL(acpi_dma_configure); diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c index 82bd45ced7ff..755a2b5354c5 100644 --- a/drivers/base/dma-mapping.c +++ b/drivers/base/dma-mapping.c @@ -368,7 +368,7 @@ int dma_configure(struct device *dev) } else if (has_acpi_companion(dma_dev)) { attr = acpi_get_dma_attr(to_acpi_device_node(dma_dev->fwnode)); if (attr != DEV_DMA_NOT_SUPPORTED) - acpi_dma_configure(dev, attr); + ret = acpi_dma_configure(dev, attr); } if (bridge) diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index ef0ae8aaa567..2a9a5de0fb00 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -575,7 +575,7 @@ struct acpi_pci_root { bool acpi_dma_supported(struct acpi_device *adev); enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev); -void acpi_dma_configure(struct device *dev, enum dev_dma_attr attr); +int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr); void acpi_dma_deconfigure(struct device *dev); struct acpi_device *acpi_find_child_device(struct acpi_device *parent, diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 9b05886f9773..79d06ef654c9 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -762,8 +762,11 @@ static inline enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev) return DEV_DMA_NOT_SUPPORTED; } -static inline void acpi_dma_configure(struct device *dev, - enum dev_dma_attr attr) { } +static inline int acpi_dma_configure(struct device *dev, + enum dev_dma_attr attr) +{ + return 0; +} static inline void acpi_dma_deconfigure(struct device *dev) { } -- cgit v1.2.3 From 316ca8804ea84a782d5ba2163711ebb22116ff5a Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Mon, 10 Apr 2017 16:51:06 +0530 Subject: ACPI/IORT: Remove linker section for IORT entries probing The IORT linker section introduced by commit 34ceea275f62 ("ACPI/IORT: Introduce linker section for IORT entries probing") was needed to make sure SMMU drivers are registered (and therefore probed) in the kernel before devices using the SMMU have a chance to probe in turn. Through the introduction of deferred IOMMU configuration the linker section based IORT probing infrastructure is not needed any longer, in that device/SMMU probe dependencies are managed through the probe deferral mechanism, making the IORT linker section infrastructure unused, so that it can be removed. Remove the unused IORT linker section probing infrastructure from the kernel to complete the ACPI IORT IOMMU configure probe deferral mechanism implementation. 
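For context, a hedged sketch of the pattern being retired (driver name made up; IORT_ACPI_DECLARE is the wrapper deleted from acpi_iort.h below). The linker section collected such entries into an ACPI probe table that acpi_iort_init() walked, guaranteeing SMMU initialisation ahead of any master:

/* Old style, removed here: early init hook run while the IORT is parsed. */
static int __init example_smmu_iort_init(struct acpi_table_header *table)
{
	/* register the SMMU before any bus master probes */
	return 0;
}
IORT_ACPI_DECLARE(example_smmu, ACPI_SIG_IORT, example_smmu_iort_init);

With probe deferral in place no such ordering guarantee is needed: the SMMU registers as an ordinary driver, and masters that look it up too early simply get -EPROBE_DEFER and are reprobed later.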
Tested-by: Hanjun Guo Reviewed-by: Robin Murphy Signed-off-by: Lorenzo Pieralisi Cc: Sricharan R Signed-off-by: Joerg Roedel --- drivers/acpi/arm64/iort.c | 2 -- include/asm-generic/vmlinux.lds.h | 1 - include/linux/acpi_iort.h | 3 --- 3 files changed, 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index e323ece0314d..e7b1940ff13b 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -1000,6 +1000,4 @@ void __init acpi_iort_init(void) } iort_init_platform_devices(); - - acpi_probe_device_table(iort); } diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0968d13b3885..9faa26c41c14 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -566,7 +566,6 @@ IRQCHIP_OF_MATCH_TABLE() \ ACPI_PROBE_TABLE(irqchip) \ ACPI_PROBE_TABLE(clksrc) \ - ACPI_PROBE_TABLE(iort) \ EARLYCON_TABLE() #define INIT_TEXT \ diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index 77e08099e554..f167e1d045ff 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -52,7 +52,4 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev) { return NULL; } #endif -#define IORT_ACPI_DECLARE(name, table_id, fn) \ - ACPI_DECLARE_PROBE_ENTRY(iort, name, table_id, 0, NULL, 0, fn) - #endif /* __ACPI_IORT_H__ */ -- cgit v1.2.3 From 49a57ef7f8492ef985ee1ecdb927ca78a6b2f308 Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Wed, 12 Apr 2017 00:21:27 -0500 Subject: iommu/omap: Drop legacy-style device support All the supported boards that have OMAP IOMMU devices do support DT boot only now. So, drop the support for the non-DT legacy-style devices from the OMAP IOMMU driver. Couple of the fields from the iommu platform data would no longer be required, so they have also been cleaned up. The IOMMU platform data is still needed though for performing reset management properly in a multi-arch environment. 
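The DT-only pattern the driver adopts is the usual one; a minimal sketch with a hypothetical probe function (the property name comes from the omap-iommu binding handled in the diff below):

#include <linux/of.h>
#include <linux/platform_device.h>

static int example_probe(struct platform_device *pdev)
{
	struct device_node *of = pdev->dev.of_node;
	u32 nr_tlb = 32;	/* default when the property is absent */
	int err;

	if (!of)
		return -ENODEV;	/* legacy (non-DT) devices are rejected */

	err = of_property_read_u32(of, "ti,#tlb-entries", &nr_tlb);
	if (err && err != -EINVAL)
		return err;	/* property present but malformed */
	return 0;
}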
Signed-off-by: Suman Anna Signed-off-by: Joerg Roedel --- drivers/iommu/omap-iommu.c | 30 ++++++++++++++---------------- include/linux/platform_data/iommu-omap.h | 3 --- 2 files changed, 14 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 54556713c8d1..febd4fbe3445 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -928,28 +928,26 @@ static int omap_iommu_probe(struct platform_device *pdev) int irq; struct omap_iommu *obj; struct resource *res; - struct iommu_platform_data *pdata = dev_get_platdata(&pdev->dev); struct device_node *of = pdev->dev.of_node; + if (!of) { + pr_err("%s: only DT-based devices are supported\n", __func__); + return -ENODEV; + } + obj = devm_kzalloc(&pdev->dev, sizeof(*obj) + MMU_REG_SIZE, GFP_KERNEL); if (!obj) return -ENOMEM; - if (of) { - obj->name = dev_name(&pdev->dev); - obj->nr_tlb_entries = 32; - err = of_property_read_u32(of, "ti,#tlb-entries", - &obj->nr_tlb_entries); - if (err && err != -EINVAL) - return err; - if (obj->nr_tlb_entries != 32 && obj->nr_tlb_entries != 8) - return -EINVAL; - if (of_find_property(of, "ti,iommu-bus-err-back", NULL)) - obj->has_bus_err_back = MMU_GP_REG_BUS_ERR_BACK_EN; - } else { - obj->nr_tlb_entries = pdata->nr_tlb_entries; - obj->name = pdata->name; - } + obj->name = dev_name(&pdev->dev); + obj->nr_tlb_entries = 32; + err = of_property_read_u32(of, "ti,#tlb-entries", &obj->nr_tlb_entries); + if (err && err != -EINVAL) + return err; + if (obj->nr_tlb_entries != 32 && obj->nr_tlb_entries != 8) + return -EINVAL; + if (of_find_property(of, "ti,iommu-bus-err-back", NULL)) + obj->has_bus_err_back = MMU_GP_REG_BUS_ERR_BACK_EN; obj->dev = &pdev->dev; obj->ctx = (void *)obj + sizeof(*obj); diff --git a/include/linux/platform_data/iommu-omap.h b/include/linux/platform_data/iommu-omap.h index 0496d171700a..a40fc0f4f9de 100644 --- a/include/linux/platform_data/iommu-omap.h +++ b/include/linux/platform_data/iommu-omap.h @@ -30,10 +30,7 @@ struct omap_iommu_arch_data { }; struct iommu_platform_data { - const char *name; const char *reset_name; - int nr_tlb_entries; - int (*assert_reset)(struct platform_device *pdev, const char *name); int (*deassert_reset)(struct platform_device *pdev, const char *name); }; -- cgit v1.2.3 From e73b7afe4e8ca5ec4304a9e1d5009755a85fff91 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 12 Apr 2017 00:21:28 -0500 Subject: iommu/omap: Move data structures to omap-iommu.h The internal data-structures are scattered over various header and C files. Consolidate them in omap-iommu.h. While at this, add the kerneldoc comment for the missing iommu domain variable and revise the iommu_arch_data name. 
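One payoff of embedding the generic struct iommu_domain inside the consolidated omap_iommu_domain is the standard container_of() idiom for recovering the private state; omap-iommu.c uses an accessor along these lines:

static struct omap_iommu_domain *to_omap_domain(struct iommu_domain *dom)
{
	return container_of(dom, struct omap_iommu_domain, domain);
}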
Signed-off-by: Joerg Roedel [s-anna@ti.com: revise kerneldoc comments] Signed-off-by: Suman Anna Signed-off-by: Joerg Roedel --- drivers/iommu/omap-iommu.c | 16 ---------------- drivers/iommu/omap-iommu.h | 33 ++++++++++++++++++++++++++++++++ include/linux/platform_data/iommu-omap.h | 17 ---------------- 3 files changed, 33 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index febd4fbe3445..c1739a650654 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -42,22 +42,6 @@ /* bitmap of the page sizes currently supported */ #define OMAP_IOMMU_PGSIZES (SZ_4K | SZ_64K | SZ_1M | SZ_16M) -/** - * struct omap_iommu_domain - omap iommu domain - * @pgtable: the page table - * @iommu_dev: an omap iommu device attached to this domain. only a single - * iommu device can be attached for now. - * @dev: Device using this domain. - * @lock: domain lock, should be taken when attaching/detaching - */ -struct omap_iommu_domain { - u32 *pgtable; - struct omap_iommu *iommu_dev; - struct device *dev; - spinlock_t lock; - struct iommu_domain domain; -}; - #define MMU_LOCK_BASE_SHIFT 10 #define MMU_LOCK_BASE_MASK (0x1f << MMU_LOCK_BASE_SHIFT) #define MMU_LOCK_BASE(x) \ diff --git a/drivers/iommu/omap-iommu.h b/drivers/iommu/omap-iommu.h index 59628e5017b4..3c33608f48ca 100644 --- a/drivers/iommu/omap-iommu.h +++ b/drivers/iommu/omap-iommu.h @@ -14,6 +14,7 @@ #define _OMAP_IOMMU_H #include +#include #define for_each_iotlb_cr(obj, n, __i, cr) \ for (__i = 0; \ @@ -27,6 +28,23 @@ struct iotlb_entry { u32 endian, elsz, mixed; }; +/** + * struct omap_iommu_domain - omap iommu domain + * @pgtable: the page table + * @iommu_dev: an omap iommu device attached to this domain. only a single + * iommu device can be attached for now. + * @dev: Device using this domain. + * @lock: domain lock, should be taken when attaching/detaching + * @domain: generic domain handle used by iommu core code + */ +struct omap_iommu_domain { + u32 *pgtable; + struct omap_iommu *iommu_dev; + struct device *dev; + spinlock_t lock; + struct iommu_domain domain; +}; + struct omap_iommu { const char *name; void __iomem *regbase; @@ -52,6 +70,21 @@ struct omap_iommu { u32 id; }; +/** + * struct omap_iommu_arch_data - omap iommu private data + * @name: name of the iommu device + * @iommu_dev: handle of the iommu device + * + * This is an omap iommu private data object, which binds an iommu user + * to its iommu device. This object should be placed at the iommu user's + * dev_archdata so generic IOMMU API can be used without having to + * utilize omap-specific plumbing anymore. + */ +struct omap_iommu_arch_data { + const char *name; + struct omap_iommu *iommu_dev; +}; + struct cr_regs { u32 cam; u32 ram; diff --git a/include/linux/platform_data/iommu-omap.h b/include/linux/platform_data/iommu-omap.h index a40fc0f4f9de..e8b12dbf6170 100644 --- a/include/linux/platform_data/iommu-omap.h +++ b/include/linux/platform_data/iommu-omap.h @@ -12,23 +12,6 @@ #include -#define MMU_REG_SIZE 256 - -/** - * struct iommu_arch_data - omap iommu private data - * @name: name of the iommu device - * @iommu_dev: handle of the iommu device - * - * This is an omap iommu private data object, which binds an iommu user - * to its iommu device. This object should be placed at the iommu user's - * dev_archdata so generic IOMMU API can be used without having to - * utilize omap-specific plumbing anymore. 
- */ -struct omap_iommu_arch_data { - const char *name; - struct omap_iommu *iommu_dev; -}; - struct iommu_platform_data { const char *reset_name; int (*assert_reset)(struct platform_device *pdev, const char *name); -- cgit v1.2.3 From 518662e0fcb9fa241fe90a337b59bc5066b2a930 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2017 12:22:09 +1000 Subject: NFS: fix usage of mempools. When passed GFP flags that allow sleeping (such as GFP_NOIO), mempool_alloc() will never return NULL; it will wait until memory is available. This means that we don't need to handle failure, but that we do need to ensure one thread doesn't call mempool_alloc() twice on the one pool without queuing or freeing the first allocation. If multiple threads did this during times of high memory pressure, the pool could be exhausted and a deadlock could result. pnfs_generic_alloc_ds_commits() attempts to allocate from the nfs_commit_mempool while already holding an allocation from that pool. This is not safe. So change nfs_commitdata_alloc() to take a flag that indicates whether failure is acceptable. In pnfs_generic_alloc_ds_commits(), accept failure and handle it as we currently do. Elsewhere, do not accept failure, and do not handle it. Even when failure is acceptable, we want to succeed if possible. That means both - using an entry from the pool if there is one - waiting for direct reclaim if there isn't. We call mempool_alloc(GFP_NOWAIT) to achieve the first, then kmem_cache_alloc(GFP_NOIO|__GFP_NORETRY) to achieve the second. Each of these can fail, but together they do the best they can without blocking indefinitely. The objects returned by kmem_cache_alloc() will still be freed by mempool_free(). This is safe as mempool_alloc() uses exactly the same function to allocate objects (since the mempool was created with mempool_create_slab_pool()). The objects returned by mempool_alloc() and kmem_cache_alloc() are indistinguishable, so mempool_free() will handle both identically, either adding to the pool or calling kmem_cache_free(). Also, don't test for failure when allocating from nfs_wdata_mempool.
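The deadlock is easiest to see with a pool of size one; a sketch (names are placeholders, and the hang only triggers under memory pressure, when the backing slab allocation fails and mempool_alloc() falls back to waiting on the pool):

/* pool = mempool_create_slab_pool(1, cachep); */
a = mempool_alloc(pool, GFP_NOIO);	/* may take the only reserved element */
b = mempool_alloc(pool, GFP_NOIO);	/* never returns NULL: sleeps waiting
					 * for an element that only this
					 * thread could free back */
mempool_free(a, pool);			/* never reached */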
Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/pnfs_nfs.c | 16 +++++----------- fs/nfs/write.c | 35 +++++++++++++++++++++-------------- include/linux/nfs_fs.h | 2 +- 3 files changed, 27 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 7250b95549ec..1edf5b84aba5 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -217,7 +217,7 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo, for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) { if (list_empty(&bucket->committing)) continue; - data = nfs_commitdata_alloc(); + data = nfs_commitdata_alloc(false); if (!data) break; data->ds_commit_index = i; @@ -283,16 +283,10 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, unsigned int nreq = 0; if (!list_empty(mds_pages)) { - data = nfs_commitdata_alloc(); - if (data != NULL) { - data->ds_commit_index = -1; - list_add(&data->pages, &list); - nreq++; - } else { - nfs_retry_commit(mds_pages, NULL, cinfo, 0); - pnfs_generic_retry_commit(cinfo, 0); - return -ENOMEM; - } + data = nfs_commitdata_alloc(true); + data->ds_commit_index = -1; + list_add(&data->pages, &list); + nreq++; } nreq += pnfs_generic_alloc_ds_commits(cinfo, &list); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index abb2c8a3be42..bdfe5a7c5874 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -60,14 +60,28 @@ static mempool_t *nfs_wdata_mempool; static struct kmem_cache *nfs_cdata_cachep; static mempool_t *nfs_commit_mempool; -struct nfs_commit_data *nfs_commitdata_alloc(void) +struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail) { - struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); + struct nfs_commit_data *p; - if (p) { - memset(p, 0, sizeof(*p)); - INIT_LIST_HEAD(&p->pages); + if (never_fail) + p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); + else { + /* It is OK to do some reclaim, but not safe to wait + * for anything to be returned to the pool. + * mempool_alloc() cannot handle that particular combination, + * so we need two separate attempts.
+ */ + p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT); + if (!p) + p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO | + __GFP_NOWARN | __GFP_NORETRY); + if (!p) + return NULL; } + + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->pages); return p; } EXPORT_SYMBOL_GPL(nfs_commitdata_alloc); @@ -82,8 +96,7 @@ static struct nfs_pgio_header *nfs_writehdr_alloc(void) { struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); - if (p) - memset(p, 0, sizeof(*p)); + memset(p, 0, sizeof(*p)); return p; } @@ -1705,19 +1718,13 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, if (list_empty(head)) return 0; - data = nfs_commitdata_alloc(); - - if (!data) - goto out_bad; + data = nfs_commitdata_alloc(true); /* Set up the argument struct */ nfs_init_commit(data, head, NULL, cinfo); atomic_inc(&cinfo->mds->rpcs_out); return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode), data->mds_ops, how, 0); - out_bad: - nfs_retry_commit(head, NULL, cinfo, 0); - return -ENOMEM; } int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf) diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 287f34161086..1b29915247b2 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -502,7 +502,7 @@ extern int nfs_wb_all(struct inode *inode); extern int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); extern int nfs_commit_inode(struct inode *, int); -extern struct nfs_commit_data *nfs_commitdata_alloc(void); +extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail); extern void nfs_commit_free(struct nfs_commit_data *data); static inline int -- cgit v1.2.3 From fbe77c30e9abcb3429380dec622439991a718e31 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 19 Apr 2017 10:11:35 -0400 Subject: NFS: move rw_mode to nfs_pageio_header Let's try to have it in a cacheline in nfs4_proc_pgio_rpc_prepare(). 
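In other words, the prepare path already touches the header, so reading the mode no longer chases the rw_ops pointer into a separate const structure:

/* before: */ fmode_t mode = hdr->rw_ops->rw_mode;	/* extra pointer chase */
/* after:  */ fmode_t mode = hdr->rw_mode;		/* field in hdr itself */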
Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 2 +- fs/nfs/pagelist.c | 8 +++----- fs/nfs/read.c | 9 ++++++--- fs/nfs/write.c | 8 +++++--- include/linux/nfs_page.h | 4 ++-- include/linux/nfs_xdr.h | 1 + 6 files changed, 18 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index dda19a35ad9e..4e52ac773d1f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4610,7 +4610,7 @@ static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, return 0; if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, hdr->args.lock_context, - hdr->rw_ops->rw_mode) == -EIO) + hdr->rw_mode) == -EIO) return -EIO; if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) return -EIO; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 453255c30fa1..f53610672f03 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -664,11 +664,11 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, const struct nfs_pgio_completion_ops *compl_ops, const struct nfs_rw_ops *rw_ops, size_t bsize, - int io_flags) + int io_flags, + gfp_t gfp_flags) { struct nfs_pgio_mirror *new; int i; - gfp_t gfp_flags = GFP_KERNEL; desc->pg_moreio = 0; desc->pg_inode = inode; @@ -688,8 +688,6 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, if (pg_ops->pg_get_mirror_count) { /* until we have a request, we don't have an lseg and no * idea how many mirrors there will be */ - if (desc->pg_rw_ops->rw_mode == FMODE_WRITE) - gfp_flags = GFP_NOIO; new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX, sizeof(struct nfs_pgio_mirror), gfp_flags); desc->pg_mirrors_dynamic = new; @@ -753,7 +751,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, if (pagecount <= ARRAY_SIZE(pg_array->page_array)) pg_array->pagevec = pg_array->page_array; else { - if (desc->pg_rw_ops->rw_mode == FMODE_WRITE) + if (hdr->rw_mode == FMODE_WRITE) gfp_flags = GFP_NOIO; pg_array->pagevec = kcalloc(pagecount, sizeof(struct page *), gfp_flags); if (!pg_array->pagevec) { diff --git a/fs/nfs/read.c b/fs/nfs/read.c index defc9233e985..a8421d9dab6a 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -35,7 +35,11 @@ static struct kmem_cache *nfs_rdata_cachep; static struct nfs_pgio_header *nfs_readhdr_alloc(void) { - return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); + struct nfs_pgio_header *p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); + + if (p) + p->rw_mode = FMODE_READ; + return p; } static void nfs_readhdr_free(struct nfs_pgio_header *rhdr) @@ -64,7 +68,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, pg_ops = server->pnfs_curr_ld->pg_read_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops, - server->rsize, 0); + server->rsize, 0, GFP_KERNEL); } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); @@ -451,7 +455,6 @@ void nfs_destroy_readpagecache(void) } static const struct nfs_rw_ops nfs_rw_read_ops = { - .rw_mode = FMODE_READ, .rw_alloc_header = nfs_readhdr_alloc, .rw_free_header = nfs_readhdr_free, .rw_done = nfs_readpage_done, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e0bccbefbc9e..95ac001b6fdf 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -96,7 +96,10 @@ static struct nfs_pgio_header *nfs_writehdr_alloc(void) { struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); - memset(p, 0, sizeof(*p)); + if (p) { + memset(p, 0, sizeof(*p)); + p->rw_mode = FMODE_WRITE; + } return p; } @@ -1381,7 +1384,7 @@ void nfs_pageio_init_write(struct nfs_pageio_descriptor 
*pgio, pg_ops = server->pnfs_curr_ld->pg_write_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops, - server->wsize, ioflags); + server->wsize, ioflags, GFP_NOIO); } EXPORT_SYMBOL_GPL(nfs_pageio_init_write); @@ -2115,7 +2118,6 @@ void nfs_destroy_writepagecache(void) } static const struct nfs_rw_ops nfs_rw_write_ops = { - .rw_mode = FMODE_WRITE, .rw_alloc_header = nfs_writehdr_alloc, .rw_free_header = nfs_writehdr_free, .rw_done = nfs_writeback_done, diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 957049f72290..6f01e28bba27 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -64,7 +64,6 @@ struct nfs_pageio_ops { }; struct nfs_rw_ops { - const fmode_t rw_mode; struct nfs_pgio_header *(*rw_alloc_header)(void); void (*rw_free_header)(struct nfs_pgio_header *); int (*rw_done)(struct rpc_task *, struct nfs_pgio_header *, @@ -124,7 +123,8 @@ extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, const struct nfs_pgio_completion_ops *compl_ops, const struct nfs_rw_ops *rw_ops, size_t bsize, - int how); + int how, + gfp_t gfp_flags); extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *, struct nfs_page *); extern int nfs_pageio_resend(struct nfs_pageio_descriptor *, diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 348f7c158084..51e27f9746ee 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1427,6 +1427,7 @@ struct nfs_pgio_header { struct list_head pages; struct nfs_page *req; struct nfs_writeverf verf; /* Used for writes */ + fmode_t rw_mode; struct pnfs_layout_segment *lseg; loff_t io_start; const struct rpc_call_ops *mds_ops; -- cgit v1.2.3 From baf7a616d537f577d33b7d9986f40532e2bd9f66 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 12 Apr 2017 12:24:25 +0200 Subject: bdi: Provide bdi_register_va() and bdi_alloc() Add function that registers bdi and takes va_list instead of variable number of arguments. Add bdi_alloc() as simple wrapper for NUMA-unaware users allocating BDI. 
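This is the standard C idiom for exporting both a variadic and a va_list entry point, so callers that already hold a va_list can forward it; a generic sketch with made-up names:

#include <stdarg.h>

int thing_register_va(struct thing *t, const char *fmt, va_list args);

int thing_register(struct thing *t, const char *fmt, ...)
{
	va_list args;
	int ret;

	va_start(args, fmt);
	ret = thing_register_va(t, fmt, args);
	va_end(args);
	return ret;
}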
Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 6 ++++++ mm/backing-dev.c | 20 +++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c52a48cb9a66..47a98e6e2a65 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -30,6 +30,8 @@ void bdi_put(struct backing_dev_info *bdi); __printf(3, 4) int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); +int bdi_register_va(struct backing_dev_info *bdi, struct device *parent, + const char *fmt, va_list args); int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner); void bdi_unregister(struct backing_dev_info *bdi); @@ -37,6 +39,10 @@ void bdi_unregister(struct backing_dev_info *bdi); int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); void bdi_destroy(struct backing_dev_info *bdi); struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id); +static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask) +{ + return bdi_alloc_node(gfp_mask, NUMA_NO_NODE); +} void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, bool range_cyclic, enum wb_reason reason); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 3ea3bbd921d6..e5e0972bdd6f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -856,18 +856,15 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) return bdi; } -int bdi_register(struct backing_dev_info *bdi, struct device *parent, - const char *fmt, ...) +int bdi_register_va(struct backing_dev_info *bdi, struct device *parent, + const char *fmt, va_list args) { - va_list args; struct device *dev; if (bdi->dev) /* The driver needs to use separate queues per device */ return 0; - va_start(args, fmt); dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); - va_end(args); if (IS_ERR(dev)) return PTR_ERR(dev); @@ -884,6 +881,19 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, trace_writeback_bdi_register(bdi); return 0; } +EXPORT_SYMBOL(bdi_register_va); + +int bdi_register(struct backing_dev_info *bdi, struct device *parent, + const char *fmt, ...) +{ + va_list args; + int ret; + + va_start(args, fmt); + ret = bdi_register_va(bdi, parent, fmt, args); + va_end(args); + return ret; +} EXPORT_SYMBOL(bdi_register); int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) -- cgit v1.2.3 From fca39346a55bb7196888ffc77d9e3557340d1d0b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 12 Apr 2017 12:24:28 +0200 Subject: fs: Provide infrastructure for dynamic BDIs in filesystems Provide helper functions for setting up dynamically allocated backing_dev_info structures for filesystems and cleaning them up on superblock destruction. 
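Intended use from a filesystem's fill_super, sketched with a hypothetical filesystem (the helpers themselves are added in the fs/super.c hunk below):

static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	int err;

	/* Allocates, names and registers a private bdi; it is dropped
	 * automatically in generic_shutdown_super() via bdi_put().
	 */
	err = super_setup_bdi(sb);
	if (err)
		return err;

	sb->s_bdi->ra_pages = 0;	/* optional per-fs tuning */
	return 0;
}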
CC: linux-mtd@lists.infradead.org CC: linux-nfs@vger.kernel.org CC: Petr Vandrovec CC: linux-nilfs@vger.kernel.org CC: cluster-devel@redhat.com CC: osd-dev@open-osd.org CC: codalist@coda.cs.cmu.edu CC: linux-afs@lists.infradead.org CC: ecryptfs@vger.kernel.org CC: linux-cifs@vger.kernel.org CC: ceph-devel@vger.kernel.org CC: linux-btrfs@vger.kernel.org CC: v9fs-developer@lists.sourceforge.net CC: lustre-devel@lists.lustre.org Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- fs/super.c | 49 ++++++++++++++++++++++++++++++++++++++++ include/linux/backing-dev-defs.h | 2 +- include/linux/fs.h | 6 +++++ 3 files changed, 56 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index b8b6a086c03b..0f51a437c269 100644 --- a/fs/super.c +++ b/fs/super.c @@ -446,6 +446,11 @@ void generic_shutdown_super(struct super_block *sb) hlist_del_init(&sb->s_instances); spin_unlock(&sb_lock); up_write(&sb->s_umount); + if (sb->s_iflags & SB_I_DYNBDI) { + bdi_put(sb->s_bdi); + sb->s_bdi = &noop_backing_dev_info; + sb->s_iflags &= ~SB_I_DYNBDI; + } } EXPORT_SYMBOL(generic_shutdown_super); @@ -1255,6 +1260,50 @@ out: return ERR_PTR(error); } +/* + * Setup private BDI for given superblock. It gets automatically cleaned up + * in generic_shutdown_super(). + */ +int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) +{ + struct backing_dev_info *bdi; + int err; + va_list args; + + bdi = bdi_alloc(GFP_KERNEL); + if (!bdi) + return -ENOMEM; + + bdi->name = sb->s_type->name; + + va_start(args, fmt); + err = bdi_register_va(bdi, NULL, fmt, args); + va_end(args); + if (err) { + bdi_put(bdi); + return err; + } + WARN_ON(sb->s_bdi != &noop_backing_dev_info); + sb->s_bdi = bdi; + sb->s_iflags |= SB_I_DYNBDI; + + return 0; +} +EXPORT_SYMBOL(super_setup_bdi_name); + +/* + * Setup private BDI for given superblock. It gets automatically cleaned up + * in generic_shutdown_super(). + */ +int super_setup_bdi(struct super_block *sb) +{ + static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); + + return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name, + atomic_long_inc_return(&bdi_seq)); +} +EXPORT_SYMBOL(super_setup_bdi); + /* * This is an internal function, please use sb_end_{write,pagefault,intwrite} * instead.
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index e66d4722db8e..866c433e7d32 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -146,7 +146,7 @@ struct backing_dev_info { congested_fn *congested_fn; /* Function pointer if device is md/dm */ void *congested_data; /* Pointer to aux data for congested func */ - char *name; + const char *name; struct kref refcnt; /* Reference counter for the structure */ unsigned int capabilities; /* Device capabilities */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 7251f7bb45e8..98cf14ea78c0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1272,6 +1272,9 @@ struct mm_struct; /* sb->s_iflags to limit user namespace mounts */ #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ +/* Temporary flag until all filesystems are converted to dynamic bdis */ +#define SB_I_DYNBDI 0x00000100 + /* Possible states of 'frozen' field */ enum { SB_UNFROZEN = 0, /* FS is unfrozen */ @@ -2121,6 +2124,9 @@ extern int vfs_ustat(dev_t, struct kstatfs *); extern int freeze_super(struct super_block *super); extern int thaw_super(struct super_block *super); extern bool our_mnt(struct vfsmount *mnt); +extern __printf(2, 3) +int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); +extern int super_setup_bdi(struct super_block *sb); extern int current_umask(void); -- cgit v1.2.3 From fa06052d637bf3a76f18cd2304048b866af4096e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 12 Apr 2017 12:24:37 +0200 Subject: mtd: Convert to dynamically allocated bdi infrastructure MTD already allocates backing_dev_info dynamically. Convert it to use generic infrastructure for this including proper refcounting. We drop mtd->backing_dev_info as its only use was to pass mtd_bdi pointer from one file into another and if we wanted to keep that in a clean way, we'd have to make mtd hold and drop bdi reference as needed which seems pointless for passing one global pointer... CC: David Woodhouse CC: Brian Norris CC: linux-mtd@lists.infradead.org Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/mtd/mtdcore.c | 23 ++++++++++++----------- drivers/mtd/mtdsuper.c | 7 ++++++- include/linux/mtd/mtd.h | 5 ----- 3 files changed, 18 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 66a9dedd1062..23e2e56ca54e 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -46,7 +46,7 @@ #include "mtdcore.h" -static struct backing_dev_info *mtd_bdi; +struct backing_dev_info *mtd_bdi; #ifdef CONFIG_PM_SLEEP @@ -496,11 +496,9 @@ int add_mtd_device(struct mtd_info *mtd) * mtd_device_parse_register() multiple times on the same master MTD, * especially with CONFIG_MTD_PARTITIONED_MASTER=y. */ - if (WARN_ONCE(mtd->backing_dev_info, "MTD already registered\n")) + if (WARN_ONCE(mtd->dev.type, "MTD already registered\n")) return -EEXIST; - mtd->backing_dev_info = mtd_bdi; - BUG_ON(mtd->writesize == 0); mutex_lock(&mtd_table_mutex); @@ -1775,13 +1773,18 @@ static struct backing_dev_info * __init mtd_bdi_init(char *name) struct backing_dev_info *bdi; int ret; - bdi = kzalloc(sizeof(*bdi), GFP_KERNEL); + bdi = bdi_alloc(GFP_KERNEL); if (!bdi) return ERR_PTR(-ENOMEM); - ret = bdi_setup_and_register(bdi, name); + bdi->name = name; + /* + * We put '-0' suffix to the name to get the same name format as we + * used to get. Since this is called only once, we get a unique name. 
+ */ + ret = bdi_register(bdi, NULL, "%.28s-0", name); if (ret) - kfree(bdi); + bdi_put(bdi); return ret ? ERR_PTR(ret) : bdi; } @@ -1813,8 +1816,7 @@ static int __init init_mtd(void) out_procfs: if (proc_mtd) remove_proc_entry("mtd", NULL); - bdi_destroy(mtd_bdi); - kfree(mtd_bdi); + bdi_put(mtd_bdi); err_bdi: class_unregister(&mtd_class); err_reg: @@ -1828,8 +1830,7 @@ static void __exit cleanup_mtd(void) if (proc_mtd) remove_proc_entry("mtd", NULL); class_unregister(&mtd_class); - bdi_destroy(mtd_bdi); - kfree(mtd_bdi); + bdi_put(mtd_bdi); idr_destroy(&mtd_idr); } diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c index 20c02a3b7417..e69e7855e31f 100644 --- a/drivers/mtd/mtdsuper.c +++ b/drivers/mtd/mtdsuper.c @@ -18,6 +18,7 @@ #include #include #include +#include /* * compare superblocks to see if they're equivalent @@ -38,6 +39,8 @@ static int get_sb_mtd_compare(struct super_block *sb, void *_mtd) return 0; } +extern struct backing_dev_info *mtd_bdi; + /* * mark the superblock by the MTD device it is using * - set the device number to be the correct MTD block device for persistence @@ -49,7 +52,9 @@ static int get_sb_mtd_set(struct super_block *sb, void *_mtd) sb->s_mtd = mtd; sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, mtd->index); - sb->s_bdi = mtd->backing_dev_info; + sb->s_bdi = bdi_get(mtd_bdi); + sb->s_iflags |= SB_I_DYNBDI; + return 0; } diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index eebdc63cf6af..79b176eca04a 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -334,11 +334,6 @@ struct mtd_info { int (*_get_device) (struct mtd_info *mtd); void (*_put_device) (struct mtd_info *mtd); - /* Backing device capabilities for this device - * - provides mmap capabilities - */ - struct backing_dev_info *backing_dev_info; - struct notifier_block reboot_notifier; /* default mode before reboot */ /* ECC status information */ -- cgit v1.2.3 From a5695a79088653c73c92ae8d48658cbc49f31884 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 12 Apr 2017 12:24:38 +0200 Subject: coda: Convert to separately allocated bdi Allocate struct backing_dev_info separately instead of embedding it inside the superblock. This unifies handling of bdi among users.
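Two idioms recur in these conversions. The mtd patch above shows the shared-bdi one: a superblock that reuses an existing bdi takes its own reference and lets the generic teardown drop it. In sketch form (shared_bdi is a placeholder; the SB_I_DYNBDI marking is still required at this point in the series):

sb->s_bdi = bdi_get(shared_bdi);	/* superblock holds a reference */
sb->s_iflags |= SB_I_DYNBDI;		/* generic_shutdown_super() will bdi_put() it */

The coda patch below shows the other idiom: replace an embedded bdi with super_setup_bdi() and delete the manual setup and teardown.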
CC: Jan Harkes CC: coda@cs.cmu.edu CC: codalist@coda.cs.cmu.edu Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- fs/coda/inode.c | 11 ++++------- include/linux/coda_psdev.h | 1 - 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 2dea594da199..6058df380cc0 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -183,10 +183,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) goto unlock_out; } - error = bdi_setup_and_register(&vc->bdi, "coda"); - if (error) - goto unlock_out; - vc->vc_sb = sb; mutex_unlock(&vc->vc_mutex); @@ -197,7 +193,10 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = CODA_SUPER_MAGIC; sb->s_op = &coda_super_operations; sb->s_d_op = &coda_dentry_operations; - sb->s_bdi = &vc->bdi; + + error = super_setup_bdi(sb); + if (error) + goto error; /* get root fid from Venus: this needs the root inode */ error = venus_rootfid(sb, &fid); @@ -228,7 +227,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) error: mutex_lock(&vc->vc_mutex); - bdi_destroy(&vc->bdi); vc->vc_sb = NULL; sb->s_fs_info = NULL; unlock_out: @@ -240,7 +238,6 @@ static void coda_put_super(struct super_block *sb) { struct venus_comm *vcp = coda_vcp(sb); mutex_lock(&vcp->vc_mutex); - bdi_destroy(&vcp->bdi); vcp->vc_sb = NULL; sb->s_fs_info = NULL; mutex_unlock(&vcp->vc_mutex); diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h index 5b8721efa948..31e4e1f1547c 100644 --- a/include/linux/coda_psdev.h +++ b/include/linux/coda_psdev.h @@ -15,7 +15,6 @@ struct venus_comm { struct list_head vc_processing; int vc_inuse; struct super_block *vc_sb; - struct backing_dev_info bdi; struct mutex vc_mutex; }; -- cgit v1.2.3 From 0db10944a76ba09f37d43b99d0fe085a18307f22 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 12 Apr 2017 12:24:45 +0200 Subject: nfs: Convert to separately allocated bdi Allocate struct backing_dev_info separately instead of embedding it inside the superblock. This unifies handling of bdi among users. 
CC: Anna Schumaker CC: linux-nfs@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Acked-by: Trond Myklebust Signed-off-by: Jens Axboe --- fs/nfs/client.c | 10 ---------- fs/nfs/internal.h | 6 +++--- fs/nfs/super.c | 34 +++++++++++++++++++--------------- fs/nfs/write.c | 13 ++++++------- include/linux/nfs_fs_sb.h | 1 - 5 files changed, 28 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 390ada8741bc..04d15a0045e3 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -761,9 +761,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT; - server->backing_dev_info.name = "nfs"; - server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; - if (server->wsize > max_rpc_payload) server->wsize = max_rpc_payload; if (server->wsize > NFS_MAX_FILE_IO_SIZE) @@ -917,12 +914,6 @@ struct nfs_server *nfs_alloc_server(void) return NULL; } - if (bdi_init(&server->backing_dev_info)) { - nfs_free_iostats(server->io_stats); - kfree(server); - return NULL; - } - ida_init(&server->openowner_id); ida_init(&server->lockowner_id); pnfs_init_server(server); @@ -953,7 +944,6 @@ void nfs_free_server(struct nfs_server *server) ida_destroy(&server->lockowner_id); ida_destroy(&server->openowner_id); nfs_free_iostats(server->io_stats); - bdi_destroy(&server->backing_dev_info); kfree(server); nfs_release_automount_timer(); dprintk("<-- nfs_free_server()\n"); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7b38fedb7e03..9dc65d7ae754 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -139,7 +139,7 @@ struct nfs_mount_request { }; struct nfs_mount_info { - void (*fill_super)(struct super_block *, struct nfs_mount_info *); + int (*fill_super)(struct super_block *, struct nfs_mount_info *); int (*set_security)(struct super_block *, struct dentry *, struct nfs_mount_info *); struct nfs_parsed_mount_data *parsed; struct nfs_clone_mount *cloned; @@ -407,7 +407,7 @@ struct dentry *nfs_fs_mount(struct file_system_type *, int, const char *, void * struct dentry * nfs_xdev_mount_common(struct file_system_type *, int, const char *, struct nfs_mount_info *); void nfs_kill_super(struct super_block *); -void nfs_fill_super(struct super_block *, struct nfs_mount_info *); +int nfs_fill_super(struct super_block *, struct nfs_mount_info *); extern struct rpc_stat nfs_rpcstat; @@ -458,7 +458,7 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); /* super.c */ -void nfs_clone_super(struct super_block *, struct nfs_mount_info *); +int nfs_clone_super(struct super_block *, struct nfs_mount_info *); void nfs_umount_begin(struct super_block *); int nfs_statfs(struct dentry *, struct kstatfs *); int nfs_show_options(struct seq_file *, struct dentry *); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 54e0f9f2dd94..8d97aa70407e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2315,18 +2315,17 @@ inline void nfs_initialise_sb(struct super_block *sb) sb->s_blocksize = nfs_block_bits(server->wsize, &sb->s_blocksize_bits); - sb->s_bdi = &server->backing_dev_info; - nfs_super_set_maxbytes(sb, server->maxfilesize); } /* * Finish setting up an NFS2/3 superblock */ -void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info) +int nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info) { struct 
nfs_parsed_mount_data *data = mount_info->parsed; struct nfs_server *server = NFS_SB(sb); + int ret; sb->s_blocksize_bits = 0; sb->s_blocksize = 0; @@ -2344,13 +2343,21 @@ void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info) } nfs_initialise_sb(sb); + + ret = super_setup_bdi_name(sb, "%u:%u", MAJOR(server->s_dev), + MINOR(server->s_dev)); + if (ret) + return ret; + sb->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD; + return 0; + } EXPORT_SYMBOL_GPL(nfs_fill_super); /* * Finish setting up a cloned NFS2/3/4 superblock */ -void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info) +int nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info) { const struct super_block *old_sb = mount_info->cloned->sb; struct nfs_server *server = NFS_SB(sb); @@ -2370,6 +2377,11 @@ void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info) } nfs_initialise_sb(sb); + + sb->s_bdi = bdi_get(old_sb->s_bdi); + sb->s_iflags |= SB_I_DYNBDI; + + return 0; } static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags) @@ -2522,11 +2534,6 @@ static void nfs_get_cache_cookie(struct super_block *sb, } #endif -static int nfs_bdi_register(struct nfs_server *server) -{ - return bdi_register_dev(&server->backing_dev_info, server->s_dev); -} - int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot, struct nfs_mount_info *mount_info) { @@ -2594,17 +2601,14 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, nfs_free_server(server); server = NULL; } else { - error = nfs_bdi_register(server); - if (error) { - mntroot = ERR_PTR(error); - goto error_splat_super; - } server->super = s; } if (!s->s_root) { /* initial superblock/root creation */ - mount_info->fill_super(s, mount_info); + error = mount_info->fill_super(s, mount_info); + if (error) + goto error_splat_super; nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index abb2c8a3be42..cc341fc7fd44 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -263,16 +263,15 @@ int nfs_congestion_kb; static void nfs_set_page_writeback(struct page *page) { - struct nfs_server *nfss = NFS_SERVER(page_file_mapping(page)->host); + struct inode *inode = page_file_mapping(page)->host; + struct nfs_server *nfss = NFS_SERVER(inode); int ret = test_set_page_writeback(page); WARN_ON_ONCE(ret != 0); if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) { - set_bdi_congested(&nfss->backing_dev_info, - BLK_RW_ASYNC); - } + NFS_CONGESTION_ON_THRESH) + set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); } static void nfs_end_page_writeback(struct nfs_page *req) @@ -285,7 +284,7 @@ static void nfs_end_page_writeback(struct nfs_page *req) end_page_writeback(req->wb_page); if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); + clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); } @@ -1808,7 +1807,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) } nfss = NFS_SERVER(data->inode); if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); + clear_bdi_congested(inode_to_bdi(data->inode), BLK_RW_ASYNC); nfs_init_cinfo(&cinfo, data->inode, data->dreq); nfs_commit_end(cinfo.mds); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index b34097c67848..e1502c55741e 100644 --- 
a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -133,7 +133,6 @@ struct nfs_server { struct rpc_clnt * client_acl; /* ACL RPC client handle */ struct nlm_host *nlm_host; /* NLM client handle */ struct nfs_iostats __percpu *io_stats; /* I/O statistics */ - struct backing_dev_info backing_dev_info; atomic_long_t writeback; /* number of writeback pages */ int flags; /* various flags */ unsigned int caps; /* server capabilities */ -- cgit v1.2.3 From c1844d536dafa5f2cddf4b4841a3634f80a27666 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 12 Apr 2017 12:24:47 +0200 Subject: fs: Remove SB_I_DYNBDI flag Now that all bdi structures filesystems use are properly refcounted, we can remove the SB_I_DYNBDI flag. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/mtd/mtdsuper.c | 1 - fs/gfs2/ops_fstype.c | 1 - fs/nfs/super.c | 1 - fs/nilfs2/super.c | 1 - fs/super.c | 5 +---- include/linux/fs.h | 3 --- 6 files changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c index e69e7855e31f..e43fea896d1e 100644 --- a/drivers/mtd/mtdsuper.c +++ b/drivers/mtd/mtdsuper.c @@ -53,7 +53,6 @@ static int get_sb_mtd_set(struct super_block *sb, void *_mtd) sb->s_mtd = mtd; sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, mtd->index); sb->s_bdi = bdi_get(mtd_bdi); - sb->s_iflags |= SB_I_DYNBDI; return 0; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e6b6f97d0fc1..ed67548b286c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1224,7 +1224,6 @@ static int set_gfs2_super(struct super_block *s, void *data) s->s_bdev = data; s->s_dev = s->s_bdev->bd_dev; s->s_bdi = bdi_get(s->s_bdev->bd_bdi); - s->s_iflags |= SB_I_DYNBDI; return 0; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 8d97aa70407e..dc69314d455e 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2379,7 +2379,6 @@ int nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info) nfs_initialise_sb(sb); sb->s_bdi = bdi_get(old_sb->s_bdi); - sb->s_iflags |= SB_I_DYNBDI; return 0; } diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index feb796a38b8d..926682981d61 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1069,7 +1069,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_max_links = NILFS_LINK_MAX; sb->s_bdi = bdi_get(sb->s_bdev->bd_bdi); - sb->s_iflags |= SB_I_DYNBDI; err = load_nilfs(nilfs, sb); if (err) diff --git a/fs/super.c b/fs/super.c index e267d3a00144..8444d26926ef 100644 --- a/fs/super.c +++ b/fs/super.c @@ -446,10 +446,9 @@ void generic_shutdown_super(struct super_block *sb) hlist_del_init(&sb->s_instances); spin_unlock(&sb_lock); up_write(&sb->s_umount); - if (sb->s_iflags & SB_I_DYNBDI) { + if (sb->s_bdi != &noop_backing_dev_info) { bdi_put(sb->s_bdi); sb->s_bdi = &noop_backing_dev_info; - sb->s_iflags &= ~SB_I_DYNBDI; } } @@ -1055,7 +1054,6 @@ static int set_bdev_super(struct super_block *s, void *data) s->s_bdev = data; s->s_dev = s->s_bdev->bd_dev; s->s_bdi = bdi_get(s->s_bdev->bd_bdi); - s->s_iflags |= SB_I_DYNBDI; return 0; } @@ -1282,7 +1280,6 @@ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) 
} WARN_ON(sb->s_bdi != &noop_backing_dev_info); sb->s_bdi = bdi; - sb->s_iflags |= SB_I_DYNBDI; return 0; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 98cf14ea78c0..30e5c14bd743 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1272,9 +1272,6 @@ struct mm_struct; /* sb->s_iflags to limit user namespace mounts */ #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ -/* Temporary flag until all filesystems are converted to dynamic bdis */ -#define SB_I_DYNBDI 0x00000100 - /* Possible states of 'frozen' field */ enum { SB_UNFROZEN = 0, /* FS is unfrozen */ -- cgit v1.2.3 From 2e82b84c01d9438d86079980e22e036eee71e754 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 12 Apr 2017 12:24:48 +0200 Subject: block: Remove unused functions Now that all backing_dev_info structure are allocated separately, we can drop some unused functions. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 5 ---- mm/backing-dev.c | 56 +++++---------------------------------------- 2 files changed, 6 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 47a98e6e2a65..aaeb2ec5d33c 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -17,8 +17,6 @@ #include #include -int __must_check bdi_init(struct backing_dev_info *bdi); - static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi) { kref_get(&bdi->refcnt); @@ -32,12 +30,9 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); int bdi_register_va(struct backing_dev_info *bdi, struct device *parent, const char *fmt, va_list args); -int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner); void bdi_unregister(struct backing_dev_info *bdi); -int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); -void bdi_destroy(struct backing_dev_info *bdi); struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id); static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask) { diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 3dd175986390..4dcd56947f2a 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -12,8 +12,6 @@ #include #include -static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); - struct backing_dev_info noop_backing_dev_info = { .name = "noop", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, @@ -242,6 +240,8 @@ static __init int bdi_class_init(void) } postcore_initcall(bdi_class_init); +static int bdi_init(struct backing_dev_info *bdi); + static int __init default_bdi_init(void) { int err; @@ -820,7 +820,7 @@ static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) #endif /* CONFIG_CGROUP_WRITEBACK */ -int bdi_init(struct backing_dev_info *bdi) +static int bdi_init(struct backing_dev_info *bdi) { int ret; @@ -838,7 +838,6 @@ int bdi_init(struct backing_dev_info *bdi) return ret; } -EXPORT_SYMBOL(bdi_init); struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) { @@ -897,12 +896,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, } EXPORT_SYMBOL(bdi_register); -int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) -{ - return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev)); -} -EXPORT_SYMBOL(bdi_register_dev); - int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner) { int rc; @@ -950,13 +943,6 @@ void 
bdi_unregister(struct backing_dev_info *bdi) } } -static void bdi_exit(struct backing_dev_info *bdi) -{ - WARN_ON_ONCE(bdi->dev); - wb_exit(&bdi->wb); - cgwb_bdi_exit(bdi); -} - static void release_bdi(struct kref *ref) { struct backing_dev_info *bdi = @@ -964,7 +950,9 @@ static void release_bdi(struct kref *ref) if (test_bit(WB_registered, &bdi->wb.state)) bdi_unregister(bdi); - bdi_exit(bdi); + WARN_ON_ONCE(bdi->dev); + wb_exit(&bdi->wb); + cgwb_bdi_exit(bdi); kfree(bdi); } @@ -974,38 +962,6 @@ void bdi_put(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_put); -void bdi_destroy(struct backing_dev_info *bdi) -{ - bdi_unregister(bdi); - bdi_exit(bdi); -} -EXPORT_SYMBOL(bdi_destroy); - -/* - * For use from filesystems to quickly init and register a bdi associated - * with dirty writeback - */ -int bdi_setup_and_register(struct backing_dev_info *bdi, char *name) -{ - int err; - - bdi->name = name; - bdi->capabilities = 0; - err = bdi_init(bdi); - if (err) - return err; - - err = bdi_register(bdi, NULL, "%.28s-%ld", name, - atomic_long_inc_return(&bdi_seq)); - if (err) { - bdi_destroy(bdi); - return err; - } - - return 0; -} -EXPORT_SYMBOL(bdi_setup_and_register); - static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) -- cgit v1.2.3 From 7c4cc30024946dae9530cd6dc0d8d4eb40fca173 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 12 Apr 2017 12:24:49 +0200 Subject: bdi: Drop 'parent' argument from bdi_register[_va]() Drop 'parent' argument of bdi_register() and bdi_register_va(). It is always NULL. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/mtd/mtdcore.c | 2 +- fs/super.c | 2 +- include/linux/backing-dev.h | 9 ++++----- mm/backing-dev.c | 13 +++++-------- 4 files changed, 11 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 23e2e56ca54e..1517da3ddd7d 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -1782,7 +1782,7 @@ static struct backing_dev_info * __init mtd_bdi_init(char *name) * We put '-0' suffix to the name to get the same name format as we * used to get. Since this is called only once, we get a unique name. */ - ret = bdi_register(bdi, NULL, "%.28s-0", name); + ret = bdi_register(bdi, "%.28s-0", name); if (ret) bdi_put(bdi); diff --git a/fs/super.c b/fs/super.c index 8444d26926ef..adb0c0de428c 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1272,7 +1272,7 @@ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) 
bdi->name = sb->s_type->name; va_start(args, fmt); - err = bdi_register_va(bdi, NULL, fmt, args); + err = bdi_register_va(bdi, fmt, args); va_end(args); if (err) { bdi_put(bdi); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index aaeb2ec5d33c..557d84063934 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -25,11 +25,10 @@ static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi) void bdi_put(struct backing_dev_info *bdi); -__printf(3, 4) -int bdi_register(struct backing_dev_info *bdi, struct device *parent, - const char *fmt, ...); -int bdi_register_va(struct backing_dev_info *bdi, struct device *parent, - const char *fmt, va_list args); +__printf(2, 3) +int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...); +int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, + va_list args); int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner); void bdi_unregister(struct backing_dev_info *bdi); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 4dcd56947f2a..f028a9a472fd 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -856,15 +856,14 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) } EXPORT_SYMBOL(bdi_alloc_node); -int bdi_register_va(struct backing_dev_info *bdi, struct device *parent, - const char *fmt, va_list args) +int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) { struct device *dev; if (bdi->dev) /* The driver needs to use separate queues per device */ return 0; - dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); + dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0), bdi, fmt, args); if (IS_ERR(dev)) return PTR_ERR(dev); @@ -883,14 +882,13 @@ int bdi_register_va(struct backing_dev_info *bdi, struct device *parent, } EXPORT_SYMBOL(bdi_register_va); -int bdi_register(struct backing_dev_info *bdi, struct device *parent, - const char *fmt, ...) +int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...) { va_list args; int ret; va_start(args, fmt); - ret = bdi_register_va(bdi, parent, fmt, args); + ret = bdi_register_va(bdi, fmt, args); va_end(args); return ret; } @@ -900,8 +898,7 @@ int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner) { int rc; - rc = bdi_register(bdi, NULL, "%u:%u", MAJOR(owner->devt), - MINOR(owner->devt)); + rc = bdi_register(bdi, "%u:%u", MAJOR(owner->devt), MINOR(owner->devt)); if (rc) return rc; /* Leaking owner reference... */ -- cgit v1.2.3 From b7819b9259185dcdcc81eb32182a4dc13d695738 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Apr 2017 16:02:55 +0200 Subject: block: remove the blk_execute_rq return value The function only returns -EIO if rq->errors is non-zero, which is not very useful and lets a large number of callers ignore the return value. Just let the callers figure out their error themselves. 
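For illustration, a converted caller ends up looking roughly like the sketch below (the function name is made up; the error derivation mirrors the conversions that follow):

#include <linux/blkdev.h>

/*
 * Hypothetical caller, shown only to illustrate the new convention;
 * the real conversions in this patch follow the same shape.
 */
static int example_execute_and_check(struct request_queue *q,
				     struct request *rq)
{
	/* blk_execute_rq() no longer reports an error ... */
	blk_execute_rq(q, NULL, rq, 0);

	/* ... so the caller derives its own, here from rq->errors. */
	return rq->errors ? -EIO : 0;
}
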
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-exec.c | 8 +------- block/scsi_ioctl.c | 3 ++- drivers/block/virtio_blk.c | 3 ++- drivers/cdrom/cdrom.c | 3 ++- drivers/ide/ide-atapi.c | 3 ++- drivers/ide/ide-cd.c | 3 ++- drivers/ide/ide-cd_ioctl.c | 3 ++- drivers/ide/ide-devsets.c | 4 ++-- drivers/ide/ide-disk.c | 3 +-- drivers/ide/ide-ioctls.c | 7 ++++--- drivers/ide/ide-park.c | 3 ++- drivers/ide/ide-pm.c | 3 ++- drivers/ide/ide-taskfile.c | 4 ++-- drivers/scsi/osd/osd_initiator.c | 5 ++++- fs/nfsd/blocklayout.c | 5 +++-- include/linux/blkdev.h | 2 +- 16 files changed, 34 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/block/blk-exec.c b/block/blk-exec.c index 8cd0e9bc8dc8..afa383248c7c 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -92,11 +92,10 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); * Insert a fully prepared request at the back of the I/O scheduler queue * for execution and wait for completion. */ -int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, +void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, struct request *rq, int at_head) { DECLARE_COMPLETION_ONSTACK(wait); - int err = 0; unsigned long hang_check; rq->end_io_data = &wait; @@ -108,10 +107,5 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); else wait_for_completion_io(&wait); - - if (rq->errors) - err = -EIO; - - return err; } EXPORT_SYMBOL(blk_execute_rq); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 82a43bb19967..b1352143f12f 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -547,7 +547,8 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, scsi_req(rq)->cmd[0] = cmd; scsi_req(rq)->cmd[4] = data; scsi_req(rq)->cmd_len = 6; - err = blk_execute_rq(q, bd_disk, rq, 0); + blk_execute_rq(q, bd_disk, rq, 0); + err = rq->errors ? -EIO : 0; blk_put_request(rq); return err; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 2d8290169271..eaf99022bdc6 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -310,7 +310,8 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) if (err) goto out; - err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false); + blk_execute_rq(vblk->disk->queue, vblk->disk, req, false); + err = req->errors ? -EIO : 0; out: blk_put_request(req); return err; diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 87739649eac2..308501730ab3 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2218,7 +2218,8 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf, rq->timeout = 60 * HZ; bio = rq->bio; - if (blk_execute_rq(q, cdi->disk, rq, 0)) { + blk_execute_rq(q, cdi->disk, rq, 0); + if (rq->errors) { struct request_sense *s = req->sense; ret = -EIO; cdi->last_sense = s->sense_key; diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index feb30061123b..1524797e1776 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -107,7 +107,8 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, memcpy(scsi_req(rq)->cmd, pc->c, 12); if (drive->media == ide_tape) scsi_req(rq)->cmd[13] = REQ_IDETAPE_PC1; - error = blk_execute_rq(drive->queue, disk, rq, 0); + blk_execute_rq(drive->queue, disk, rq, 0); + error = rq->errors ? 
-EIO : 0; put_req: blk_put_request(rq); return error; diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 74f1b7dc03f7..95c40afa9120 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -452,7 +452,8 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, } } - error = blk_execute_rq(drive->queue, info->disk, rq, 0); + blk_execute_rq(drive->queue, info->disk, rq, 0); + error = rq->errors ? -EIO : 0; if (buffer) *bufflen = scsi_req(rq)->resid_len; diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c index 9fcefbc8425e..f1ab726bd430 100644 --- a/drivers/ide/ide-cd_ioctl.c +++ b/drivers/ide/ide-cd_ioctl.c @@ -307,7 +307,8 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi) scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_MISC; rq->rq_flags = RQF_QUIET; - ret = blk_execute_rq(drive->queue, cd->disk, rq, 0); + blk_execute_rq(drive->queue, cd->disk, rq, 0); + ret = rq->errors ? -EIO : 0; blk_put_request(rq); /* * A reset will unlock the door. If it was previously locked, diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c index a45dda5386e4..eea6a7cb80b5 100644 --- a/drivers/ide/ide-devsets.c +++ b/drivers/ide/ide-devsets.c @@ -173,8 +173,8 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, *(int *)&scsi_req(rq)->cmd[1] = arg; rq->special = setting->set; - if (blk_execute_rq(q, NULL, rq, 0)) - ret = rq->errors; + blk_execute_rq(q, NULL, rq, 0); + ret = rq->errors; blk_put_request(rq); return ret; diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 186159715b71..7c06237f3479 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -470,7 +470,6 @@ ide_devset_get(multcount, mult_count); static int set_multcount(ide_drive_t *drive, int arg) { struct request *rq; - int error; if (arg < 0 || arg > (drive->id[ATA_ID_MAX_MULTSECT] & 0xff)) return -EINVAL; @@ -484,7 +483,7 @@ static int set_multcount(ide_drive_t *drive, int arg) drive->mult_req = arg; drive->special_flags |= IDE_SFLAG_SET_MULTMODE; - error = blk_execute_rq(drive->queue, NULL, rq, 0); + blk_execute_rq(drive->queue, NULL, rq, 0); blk_put_request(rq); return (drive->mult_count == arg) ? 0 : -EIO; diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index 248a3e0ceb46..3e96e531b367 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -128,7 +128,8 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg) rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_TASKFILE; - err = blk_execute_rq(drive->queue, NULL, rq, 0); + blk_execute_rq(drive->queue, NULL, rq, 0); + err = rq->errors ? -EIO : 0; blk_put_request(rq); return err; @@ -227,8 +228,8 @@ static int generic_drive_reset(ide_drive_t *drive) ide_req(rq)->type = ATA_PRIV_MISC; scsi_req(rq)->cmd_len = 1; scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET; - if (blk_execute_rq(drive->queue, NULL, rq, 1)) - ret = rq->errors; + blk_execute_rq(drive->queue, NULL, rq, 1); + ret = rq->errors; blk_put_request(rq); return ret; } diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index 101aed9a61ca..b4f577016f5a 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -37,7 +37,8 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) scsi_req(rq)->cmd_len = 1; ide_req(rq)->type = ATA_PRIV_MISC; rq->special = &timeout; - rc = blk_execute_rq(q, NULL, rq, 1); + blk_execute_rq(q, NULL, rq, 1); + rc = rq->errors ? 
-EIO : 0; blk_put_request(rq); if (rc) goto out; diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index ec951be4b0c8..bf513f886f3c 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -27,7 +27,8 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) mesg.event = PM_EVENT_FREEZE; rqpm.pm_state = mesg.event; - ret = blk_execute_rq(drive->queue, NULL, rq, 0); + blk_execute_rq(drive->queue, NULL, rq, 0); + ret = rq->errors ? -EIO : 0; blk_put_request(rq); if (ret == 0 && ide_port_acpi(hwif)) { diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 4c0007cb74e3..78924c7c9478 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -452,8 +452,8 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, rq->special = cmd; cmd->rq = rq; - error = blk_execute_rq(drive->queue, NULL, rq, 0); - + blk_execute_rq(drive->queue, NULL, rq, 0); + error = rq->errors ? -EIO : 0; put_req: blk_put_request(rq); return error; diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index 9d0727b2bdec..5eeab7047d1e 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -489,7 +489,10 @@ static void _set_error_resid(struct osd_request *or, struct request *req, int osd_execute_request(struct osd_request *or) { - int error = blk_execute_rq(or->request->q, NULL, or->request, 0); + int error; + + blk_execute_rq(or->request->q, NULL, or->request, 0); + error = or->request->errors ? -EIO : 0; _set_error_resid(or, or->request, error); return error; diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 92b4b41d19d2..9f618b77ffee 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -242,10 +242,11 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, req->cmd[4] = bufflen & 0xff; req->cmd_len = COMMAND_SIZE(INQUIRY); - error = blk_execute_rq(rq->q, NULL, rq, 1); - if (error) { + blk_execute_rq(rq->q, NULL, rq, 1); + if (rq->errors) { pr_err("pNFS: INQUIRY 0x83 failed with: %x\n", rq->errors); + error = -EIO; goto out_put_request; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 51c9e391798e..e2064ed3c703 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -970,7 +970,7 @@ extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, uns extern int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); -extern int blk_execute_rq(struct request_queue *, struct gendisk *, +extern void blk_execute_rq(struct request_queue *, struct gendisk *, struct request *, int); extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); -- cgit v1.2.3 From 17d5363b83f8c73ef9109f75a4a9b578f31d842f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Apr 2017 16:03:01 +0200 Subject: scsi: introduce a result field in struct scsi_request This passes on the scsi_cmnd result field to users of passthrough requests. Currently we abuse req->errors for this purpose, but that field will go away in its current form. Note that the old IDE code abuses the errors field in very creative ways and stores all kinds of different values in it. I didn't dare to touch this magic, so the abuses are brought forward 1:1. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/bsg-lib.c | 8 ++++---- block/bsg.c | 12 +++++------ block/scsi_ioctl.c | 14 ++++++------- drivers/block/cciss.c | 42 +++++++++++++++++++------------------- drivers/block/pktcdvd.c | 2 +- drivers/block/virtio_blk.c | 2 +- drivers/cdrom/cdrom.c | 2 +- drivers/ide/ide-atapi.c | 10 ++++----- drivers/ide/ide-cd.c | 20 +++++++++--------- drivers/ide/ide-cd_ioctl.c | 2 +- drivers/ide/ide-devsets.c | 4 ++-- drivers/ide/ide-dma.c | 2 +- drivers/ide/ide-eh.c | 36 ++++++++++++++++---------------- drivers/ide/ide-floppy.c | 10 ++++----- drivers/ide/ide-io.c | 10 ++++----- drivers/ide/ide-ioctls.c | 4 ++-- drivers/ide/ide-park.c | 2 +- drivers/ide/ide-pm.c | 8 ++++---- drivers/ide/ide-tape.c | 4 ++-- drivers/ide/ide-taskfile.c | 6 +++--- drivers/scsi/osd/osd_initiator.c | 4 ++-- drivers/scsi/osst.c | 2 +- drivers/scsi/qla2xxx/qla_bsg.c | 6 +++--- drivers/scsi/scsi_lib.c | 15 +++++++------- drivers/scsi/scsi_transport_sas.c | 2 +- drivers/scsi/sg.c | 2 +- drivers/scsi/st.c | 6 +++--- drivers/target/target_core_pscsi.c | 2 +- fs/nfsd/blocklayout.c | 4 ++-- include/linux/ide.h | 2 +- include/scsi/scsi_request.h | 1 + 31 files changed, 123 insertions(+), 123 deletions(-) (limited to 'include/linux') diff --git a/block/bsg-lib.c b/block/bsg-lib.c index cd15f9dbb147..0a23dbba2d30 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -37,7 +37,7 @@ static void bsg_destroy_job(struct kref *kref) struct bsg_job *job = container_of(kref, struct bsg_job, kref); struct request *rq = job->req; - blk_end_request_all(rq, rq->errors); + blk_end_request_all(rq, scsi_req(rq)->result); put_device(job->dev); /* release reference for the request */ @@ -74,7 +74,7 @@ void bsg_job_done(struct bsg_job *job, int result, struct scsi_request *rq = scsi_req(req); int err; - err = job->req->errors = result; + err = scsi_req(job->req)->result = result; if (err < 0) /* we're only returning the result field in the reply */ rq->sense_len = sizeof(u32); @@ -177,7 +177,7 @@ failjob_rls_job: * @q: request queue to manage * * On error the create_bsg_job function should return a -Exyz error value - * that will be set to the req->errors. + * that will be set to ->result. * * Drivers/subsys should pass this to the queue init function. */ @@ -201,7 +201,7 @@ static void bsg_request_fn(struct request_queue *q) ret = bsg_create_job(dev, req); if (ret) { - req->errors = ret; + scsi_req(req)->result = ret; blk_end_request_all(req, ret); spin_lock_irq(q->queue_lock); continue; diff --git a/block/bsg.c b/block/bsg.c index 74835dbf0c47..d9da1b613ced 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -391,13 +391,13 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, struct scsi_request *req = scsi_req(rq); int ret = 0; - dprintk("rq %p bio %p 0x%x\n", rq, bio, rq->errors); + dprintk("rq %p bio %p 0x%x\n", rq, bio, req->result); /* * fill in all the output members */ - hdr->device_status = rq->errors & 0xff; - hdr->transport_status = host_byte(rq->errors); - hdr->driver_status = driver_byte(rq->errors); + hdr->device_status = req->result & 0xff; + hdr->transport_status = host_byte(req->result); + hdr->driver_status = driver_byte(req->result); hdr->info = 0; if (hdr->device_status || hdr->transport_status || hdr->driver_status) hdr->info |= SG_INFO_CHECK; @@ -431,8 +431,8 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, * just a protocol response (i.e. non negative), that gets * processed above. 
*/ - if (!ret && rq->errors < 0) - ret = rq->errors; + if (!ret && req->result < 0) + ret = req->result; blk_rq_unmap_user(bio); scsi_req_free_cmd(req); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index b1352143f12f..4a294a5f7fab 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -262,11 +262,11 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, /* * fill in all the output members */ - hdr->status = rq->errors & 0xff; - hdr->masked_status = status_byte(rq->errors); - hdr->msg_status = msg_byte(rq->errors); - hdr->host_status = host_byte(rq->errors); - hdr->driver_status = driver_byte(rq->errors); + hdr->status = req->result & 0xff; + hdr->masked_status = status_byte(req->result); + hdr->msg_status = msg_byte(req->result); + hdr->host_status = host_byte(req->result); + hdr->driver_status = driver_byte(req->result); hdr->info = 0; if (hdr->masked_status || hdr->host_status || hdr->driver_status) hdr->info |= SG_INFO_CHECK; @@ -509,7 +509,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, blk_execute_rq(q, disk, rq, 0); - err = rq->errors & 0xff; /* only 8 bit SCSI status */ + err = req->result & 0xff; /* only 8 bit SCSI status */ if (err) { if (req->sense_len && req->sense) { bytes = (OMAX_SB_LEN > req->sense_len) ? @@ -548,7 +548,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, scsi_req(rq)->cmd[4] = data; scsi_req(rq)->cmd_len = 6; blk_execute_rq(q, bd_disk, rq, 0); - err = rq->errors ? -EIO : 0; + err = scsi_req(rq)->result ? -EIO : 0; blk_put_request(rq); return err; diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 8e1a4554951c..cd375503f7b0 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1864,8 +1864,7 @@ static void cciss_softirq_done(struct request *rq) /* set the residual count for pc requests */ if (blk_rq_is_passthrough(rq)) scsi_req(rq)->resid_len = c->err_info->ResidualCnt; - - blk_end_request_all(rq, (rq->errors == 0) ? 0 : -EIO); + blk_end_request_all(rq, scsi_req(rq)->result ? -EIO : 0); spin_lock_irqsave(&h->lock, flags); cmd_free(h, c); @@ -3140,18 +3139,19 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, { int retry_cmd = 0; struct request *rq = cmd->rq; + struct scsi_request *sreq = scsi_req(rq); - rq->errors = 0; + sreq->result = 0; if (timeout) - rq->errors = make_status_bytes(0, 0, 0, DRIVER_TIMEOUT); + sreq->result = make_status_bytes(0, 0, 0, DRIVER_TIMEOUT); if (cmd->err_info->CommandStatus == 0) /* no error has occurred */ goto after_error_processing; switch (cmd->err_info->CommandStatus) { case CMD_TARGET_STATUS: - rq->errors = evaluate_target_status(h, cmd, &retry_cmd); + sreq->result = evaluate_target_status(h, cmd, &retry_cmd); break; case CMD_DATA_UNDERRUN: if (!blk_rq_is_passthrough(cmd->rq)) { @@ -3169,7 +3169,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, case CMD_INVALID: dev_warn(&h->pdev->dev, "cciss: cmd %p is " "reported invalid\n", cmd); - rq->errors = make_status_bytes(SAM_STAT_GOOD, + sreq->result = make_status_bytes(SAM_STAT_GOOD, cmd->err_info->CommandStatus, DRIVER_OK, blk_rq_is_passthrough(cmd->rq) ? 
DID_PASSTHROUGH : DID_ERROR); @@ -3177,7 +3177,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, case CMD_PROTOCOL_ERR: dev_warn(&h->pdev->dev, "cciss: cmd %p has " "protocol error\n", cmd); - rq->errors = make_status_bytes(SAM_STAT_GOOD, + sreq->result = make_status_bytes(SAM_STAT_GOOD, cmd->err_info->CommandStatus, DRIVER_OK, blk_rq_is_passthrough(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR); @@ -3185,7 +3185,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, case CMD_HARDWARE_ERR: dev_warn(&h->pdev->dev, "cciss: cmd %p had " " hardware error\n", cmd); - rq->errors = make_status_bytes(SAM_STAT_GOOD, + sreq->result = make_status_bytes(SAM_STAT_GOOD, cmd->err_info->CommandStatus, DRIVER_OK, blk_rq_is_passthrough(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR); @@ -3193,7 +3193,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, case CMD_CONNECTION_LOST: dev_warn(&h->pdev->dev, "cciss: cmd %p had " "connection lost\n", cmd); - rq->errors = make_status_bytes(SAM_STAT_GOOD, + sreq->result = make_status_bytes(SAM_STAT_GOOD, cmd->err_info->CommandStatus, DRIVER_OK, blk_rq_is_passthrough(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR); @@ -3201,7 +3201,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, case CMD_ABORTED: dev_warn(&h->pdev->dev, "cciss: cmd %p was " "aborted\n", cmd); - rq->errors = make_status_bytes(SAM_STAT_GOOD, + sreq->result = make_status_bytes(SAM_STAT_GOOD, cmd->err_info->CommandStatus, DRIVER_OK, blk_rq_is_passthrough(cmd->rq) ? DID_PASSTHROUGH : DID_ABORT); @@ -3209,7 +3209,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, case CMD_ABORT_FAILED: dev_warn(&h->pdev->dev, "cciss: cmd %p reports " "abort failed\n", cmd); - rq->errors = make_status_bytes(SAM_STAT_GOOD, + sreq->result = make_status_bytes(SAM_STAT_GOOD, cmd->err_info->CommandStatus, DRIVER_OK, blk_rq_is_passthrough(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR); @@ -3224,21 +3224,21 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, } else dev_warn(&h->pdev->dev, "%p retried too many times\n", cmd); - rq->errors = make_status_bytes(SAM_STAT_GOOD, + sreq->result = make_status_bytes(SAM_STAT_GOOD, cmd->err_info->CommandStatus, DRIVER_OK, blk_rq_is_passthrough(cmd->rq) ? DID_PASSTHROUGH : DID_ABORT); break; case CMD_TIMEOUT: dev_warn(&h->pdev->dev, "cmd %p timedout\n", cmd); - rq->errors = make_status_bytes(SAM_STAT_GOOD, + sreq->result = make_status_bytes(SAM_STAT_GOOD, cmd->err_info->CommandStatus, DRIVER_OK, blk_rq_is_passthrough(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR); break; case CMD_UNABORTABLE: dev_warn(&h->pdev->dev, "cmd %p unabortable\n", cmd); - rq->errors = make_status_bytes(SAM_STAT_GOOD, + sreq->result = make_status_bytes(SAM_STAT_GOOD, cmd->err_info->CommandStatus, DRIVER_OK, blk_rq_is_passthrough(cmd->rq) ? DID_PASSTHROUGH : DID_ERROR); @@ -3247,7 +3247,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, dev_warn(&h->pdev->dev, "cmd %p returned " "unknown status %x\n", cmd, cmd->err_info->CommandStatus); - rq->errors = make_status_bytes(SAM_STAT_GOOD, + sreq->result = make_status_bytes(SAM_STAT_GOOD, cmd->err_info->CommandStatus, DRIVER_OK, blk_rq_is_passthrough(cmd->rq) ? 
DID_PASSTHROUGH : DID_ERROR); @@ -3380,9 +3380,9 @@ static void do_cciss_request(struct request_queue *q) if (dma_mapping_error(&h->pdev->dev, temp64.val)) { dev_warn(&h->pdev->dev, "%s: error mapping page for DMA\n", __func__); - creq->errors = make_status_bytes(SAM_STAT_GOOD, - 0, DRIVER_OK, - DID_SOFT_ERROR); + scsi_req(creq)->result = + make_status_bytes(SAM_STAT_GOOD, 0, DRIVER_OK, + DID_SOFT_ERROR); cmd_free(h, c); return; } @@ -3395,9 +3395,9 @@ static void do_cciss_request(struct request_queue *q) if (cciss_map_sg_chain_block(h, c, h->cmd_sg_list[c->cmdindex], (seg - (h->max_cmd_sgentries - 1)) * sizeof(SGDescriptor_struct))) { - creq->errors = make_status_bytes(SAM_STAT_GOOD, - 0, DRIVER_OK, - DID_SOFT_ERROR); + scsi_req(creq)->result = + make_status_bytes(SAM_STAT_GOOD, 0, DRIVER_OK, + DID_SOFT_ERROR); cmd_free(h, c); return; } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 66d846ba85a9..205b865ebeb9 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -724,7 +724,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * rq->rq_flags |= RQF_QUIET; blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0); - if (rq->errors) + if (scsi_req(rq)->result) ret = -EIO; out: blk_put_request(rq); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 8378ad480f77..dea2a58d6734 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -119,7 +119,7 @@ static inline void virtblk_scsi_request_done(struct request *req) sreq->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual); sreq->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len); - req->errors = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors); + sreq->result = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors); } static int virtblk_ioctl(struct block_device *bdev, fmode_t mode, diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 308501730ab3..76c952fd9ab9 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2219,7 +2219,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf, bio = rq->bio; blk_execute_rq(q, cdi->disk, rq, 0); - if (rq->errors) { + if (scsi_req(rq)->result) { struct request_sense *s = req->sense; ret = -EIO; cdi->last_sense = s->sense_key; diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 1524797e1776..5901937284e7 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -108,7 +108,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, if (drive->media == ide_tape) scsi_req(rq)->cmd[13] = REQ_IDETAPE_PC1; blk_execute_rq(drive->queue, disk, rq, 0); - error = rq->errors ? -EIO : 0; + error = scsi_req(rq)->result ? -EIO : 0; put_req: blk_put_request(rq); return error; @@ -455,7 +455,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) debug_log("%s: I/O error\n", drive->name); if (drive->media != ide_tape) - pc->rq->errors++; + scsi_req(pc->rq)->result++; if (scsi_req(rq)->cmd[0] == REQUEST_SENSE) { printk(KERN_ERR PFX "%s: I/O error in request " @@ -489,13 +489,13 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) drive->failed_pc = NULL; if (ata_misc_request(rq)) { - rq->errors = 0; + scsi_req(rq)->result = 0; error = 0; } else { if (blk_rq_is_passthrough(rq) && uptodate <= 0) { - if (rq->errors == 0) - rq->errors = -EIO; + if (scsi_req(rq)->result == 0) + scsi_req(rq)->result = -EIO; } error = uptodate ? 
0 : -EIO; diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 95c40afa9120..07e5ff3a64c3 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -247,10 +247,10 @@ static int ide_cd_breathe(ide_drive_t *drive, struct request *rq) struct cdrom_info *info = drive->driver_data; - if (!rq->errors) + if (!scsi_req(rq)->result) info->write_timeout = jiffies + ATAPI_WAIT_WRITE_BUSY; - rq->errors = 1; + scsi_req(rq)->result = 1; if (time_after(jiffies, info->write_timeout)) return 0; @@ -294,8 +294,8 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) } /* if we have an error, pass CHECK_CONDITION as the SCSI status byte */ - if (blk_rq_is_scsi(rq) && !rq->errors) - rq->errors = SAM_STAT_CHECK_CONDITION; + if (blk_rq_is_scsi(rq) && !scsi_req(rq)->result) + scsi_req(rq)->result = SAM_STAT_CHECK_CONDITION; if (blk_noretry_request(rq)) do_end_request = 1; @@ -325,7 +325,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) * Arrange to retry the request but be sure to give up if we've * retried too many times. */ - if (++rq->errors > ERROR_MAX) + if (++scsi_req(rq)->result > ERROR_MAX) do_end_request = 1; break; case ILLEGAL_REQUEST: @@ -372,7 +372,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) /* go to the default handler for other errors */ ide_error(drive, "cdrom_decode_status", stat); return 1; - } else if (++rq->errors > ERROR_MAX) + } else if (++scsi_req(rq)->result > ERROR_MAX) /* we've racked up too many retries, abort */ do_end_request = 1; } @@ -453,7 +453,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, } blk_execute_rq(drive->queue, info->disk, rq, 0); - error = rq->errors ? -EIO : 0; + error = scsi_req(rq)->result ? -EIO : 0; if (buffer) *bufflen = scsi_req(rq)->resid_len; @@ -684,8 +684,8 @@ out_end: if (cmd->nleft == 0) uptodate = 1; } else { - if (uptodate <= 0 && rq->errors == 0) - rq->errors = -EIO; + if (uptodate <= 0 && scsi_req(rq)->result == 0) + scsi_req(rq)->result = -EIO; } if (uptodate == 0 && rq->bio) @@ -1380,7 +1380,7 @@ static int ide_cdrom_prep_pc(struct request *rq) * appropriate action */ if (c[0] == MODE_SENSE || c[0] == MODE_SELECT) { - rq->errors = ILLEGAL_REQUEST; + scsi_req(rq)->result = ILLEGAL_REQUEST; return BLKPREP_KILL; } diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c index f1ab726bd430..55cd736c39c6 100644 --- a/drivers/ide/ide-cd_ioctl.c +++ b/drivers/ide/ide-cd_ioctl.c @@ -308,7 +308,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi) ide_req(rq)->type = ATA_PRIV_MISC; rq->rq_flags = RQF_QUIET; blk_execute_rq(drive->queue, cd->disk, rq, 0); - ret = rq->errors ? -EIO : 0; + ret = scsi_req(rq)->result ? -EIO : 0; blk_put_request(rq); /* * A reset will unlock the door. 
If it was previously locked, diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c index eea6a7cb80b5..b1223234037d 100644 --- a/drivers/ide/ide-devsets.c +++ b/drivers/ide/ide-devsets.c @@ -174,7 +174,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting, rq->special = setting->set; blk_execute_rq(q, NULL, rq, 0); - ret = rq->errors; + ret = scsi_req(rq)->result; blk_put_request(rq); return ret; @@ -186,7 +186,7 @@ ide_startstop_t ide_do_devset(ide_drive_t *drive, struct request *rq) err = setfunc(drive, *(int *)&scsi_req(rq)->cmd[1]); if (err) - rq->errors = err; + scsi_req(rq)->result = err; ide_complete_rq(drive, err, blk_rq_bytes(rq)); return ide_stopped; } diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 17a65ac56491..51c81223e56d 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -490,7 +490,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) * make sure request is sane */ if (hwif->rq) - hwif->rq->errors = 0; + scsi_req(hwif->rq)->result = 0; return ret; } diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c index cf3af6840368..4b7ffd7d158d 100644 --- a/drivers/ide/ide-eh.c +++ b/drivers/ide/ide-eh.c @@ -12,7 +12,7 @@ static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq, if ((stat & ATA_BUSY) || ((stat & ATA_DF) && (drive->dev_flags & IDE_DFLAG_NOWERR) == 0)) { /* other bits are useless when BUSY */ - rq->errors |= ERROR_RESET; + scsi_req(rq)->result |= ERROR_RESET; } else if (stat & ATA_ERR) { /* err has different meaning on cdrom and tape */ if (err == ATA_ABORTED) { @@ -25,10 +25,10 @@ static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq, drive->crc_count++; } else if (err & (ATA_BBK | ATA_UNC)) { /* retries won't help these */ - rq->errors = ERROR_MAX; + scsi_req(rq)->result = ERROR_MAX; } else if (err & ATA_TRK0NF) { /* help it find track zero */ - rq->errors |= ERROR_RECAL; + scsi_req(rq)->result |= ERROR_RECAL; } } @@ -39,23 +39,23 @@ static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq, ide_pad_transfer(drive, READ, nsect * SECTOR_SIZE); } - if (rq->errors >= ERROR_MAX || blk_noretry_request(rq)) { + if (scsi_req(rq)->result >= ERROR_MAX || blk_noretry_request(rq)) { ide_kill_rq(drive, rq); return ide_stopped; } if (hwif->tp_ops->read_status(hwif) & (ATA_BUSY | ATA_DRQ)) - rq->errors |= ERROR_RESET; + scsi_req(rq)->result |= ERROR_RESET; - if ((rq->errors & ERROR_RESET) == ERROR_RESET) { - ++rq->errors; + if ((scsi_req(rq)->result & ERROR_RESET) == ERROR_RESET) { + ++scsi_req(rq)->result; return ide_do_reset(drive); } - if ((rq->errors & ERROR_RECAL) == ERROR_RECAL) + if ((scsi_req(rq)->result & ERROR_RECAL) == ERROR_RECAL) drive->special_flags |= IDE_SFLAG_RECALIBRATE; - ++rq->errors; + ++scsi_req(rq)->result; return ide_stopped; } @@ -68,7 +68,7 @@ static ide_startstop_t ide_atapi_error(ide_drive_t *drive, struct request *rq, if ((stat & ATA_BUSY) || ((stat & ATA_DF) && (drive->dev_flags & IDE_DFLAG_NOWERR) == 0)) { /* other bits are useless when BUSY */ - rq->errors |= ERROR_RESET; + scsi_req(rq)->result |= ERROR_RESET; } else { /* add decoding error stuff */ } @@ -77,14 +77,14 @@ static ide_startstop_t ide_atapi_error(ide_drive_t *drive, struct request *rq, /* force an abort */ hwif->tp_ops->exec_command(hwif, ATA_CMD_IDLEIMMEDIATE); - if (rq->errors >= ERROR_MAX) { + if (scsi_req(rq)->result >= ERROR_MAX) { ide_kill_rq(drive, rq); } else { - if ((rq->errors & ERROR_RESET) == ERROR_RESET) { - ++rq->errors; 
+ if ((scsi_req(rq)->result & ERROR_RESET) == ERROR_RESET) { + ++scsi_req(rq)->result; return ide_do_reset(drive); } - ++rq->errors; + ++scsi_req(rq)->result; } return ide_stopped; @@ -130,11 +130,11 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat) if (cmd) ide_complete_cmd(drive, cmd, stat, err); } else if (ata_pm_request(rq)) { - rq->errors = 1; + scsi_req(rq)->result = 1; ide_complete_pm_rq(drive, rq); return ide_stopped; } - rq->errors = err; + scsi_req(rq)->result = err; ide_complete_rq(drive, err ? -EIO : 0, blk_rq_bytes(rq)); return ide_stopped; } @@ -149,8 +149,8 @@ static inline void ide_complete_drive_reset(ide_drive_t *drive, int err) if (rq && ata_misc_request(rq) && scsi_req(rq)->cmd[0] == REQ_DRIVE_RESET) { - if (err <= 0 && rq->errors == 0) - rq->errors = -EIO; + if (err <= 0 && scsi_req(rq)->result == 0) + scsi_req(rq)->result = -EIO; ide_complete_rq(drive, err ? err : 0, blk_rq_bytes(rq)); } } diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index a69e8013f1df..8ac6048cd2df 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -98,7 +98,7 @@ static int ide_floppy_callback(ide_drive_t *drive, int dsc) } if (ata_misc_request(rq)) - rq->errors = uptodate ? 0 : IDE_DRV_ERROR_GENERAL; + scsi_req(rq)->result = uptodate ? 0 : IDE_DRV_ERROR_GENERAL; return uptodate; } @@ -239,7 +239,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, ? rq->rq_disk->disk_name : "dev?")); - if (rq->errors >= ERROR_MAX) { + if (scsi_req(rq)->result >= ERROR_MAX) { if (drive->failed_pc) { ide_floppy_report_error(floppy, drive->failed_pc); drive->failed_pc = NULL; @@ -247,7 +247,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, printk(KERN_ERR PFX "%s: I/O error\n", drive->name); if (ata_misc_request(rq)) { - rq->errors = 0; + scsi_req(rq)->result = 0; ide_complete_rq(drive, 0, blk_rq_bytes(rq)); return ide_stopped; } else @@ -301,8 +301,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, return ide_floppy_issue_pc(drive, &cmd, pc); out_end: drive->failed_pc = NULL; - if (blk_rq_is_passthrough(rq) && rq->errors == 0) - rq->errors = -EIO; + if (blk_rq_is_passthrough(rq) && scsi_req(rq)->result == 0) + scsi_req(rq)->result = -EIO; ide_complete_rq(drive, -EIO, blk_rq_bytes(rq)); return ide_stopped; } diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 043b1fb963cb..45b3f41a43d4 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -141,12 +141,12 @@ void ide_kill_rq(ide_drive_t *drive, struct request *rq) drive->failed_pc = NULL; if ((media == ide_floppy || media == ide_tape) && drv_req) { - rq->errors = 0; + scsi_req(rq)->result = 0; } else { if (media == ide_tape) - rq->errors = IDE_DRV_ERROR_GENERAL; - else if (blk_rq_is_passthrough(rq) && rq->errors == 0) - rq->errors = -EIO; + scsi_req(rq)->result = IDE_DRV_ERROR_GENERAL; + else if (blk_rq_is_passthrough(rq) && scsi_req(rq)->result == 0) + scsi_req(rq)->result = -EIO; } ide_complete_rq(drive, -EIO, blk_rq_bytes(rq)); @@ -271,7 +271,7 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive, #ifdef DEBUG printk("%s: DRIVE_CMD (null)\n", drive->name); #endif - rq->errors = 0; + scsi_req(rq)->result = 0; ide_complete_rq(drive, 0, blk_rq_bytes(rq)); return ide_stopped; diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index 3e96e531b367..8c0d17297a7a 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -129,7 +129,7 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long 
arg) scsi_req_init(rq); ide_req(rq)->type = ATA_PRIV_TASKFILE; blk_execute_rq(drive->queue, NULL, rq, 0); - err = rq->errors ? -EIO : 0; + err = scsi_req(rq)->result ? -EIO : 0; blk_put_request(rq); return err; @@ -229,7 +229,7 @@ static int generic_drive_reset(ide_drive_t *drive) scsi_req(rq)->cmd_len = 1; scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET; blk_execute_rq(drive->queue, NULL, rq, 1); - ret = rq->errors; + ret = scsi_req(rq)->result; blk_put_request(rq); return ret; } diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index b4f577016f5a..94e3107f59b9 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -38,7 +38,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) ide_req(rq)->type = ATA_PRIV_MISC; rq->special = &timeout; blk_execute_rq(q, NULL, rq, 1); - rc = rq->errors ? -EIO : 0; + rc = scsi_req(rq)->result ? -EIO : 0; blk_put_request(rq); if (rc) goto out; diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index bf513f886f3c..277c2bb7616f 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -28,7 +28,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) rqpm.pm_state = mesg.event; blk_execute_rq(drive->queue, NULL, rq, 0); - ret = rq->errors ? -EIO : 0; + ret = scsi_req(rq)->result ? -EIO : 0; blk_put_request(rq); if (ret == 0 && ide_port_acpi(hwif)) { @@ -56,8 +56,8 @@ static int ide_pm_execute_rq(struct request *rq) spin_lock_irq(q->queue_lock); if (unlikely(blk_queue_dying(q))) { rq->rq_flags |= RQF_QUIET; - rq->errors = -ENXIO; - __blk_end_request_all(rq, rq->errors); + scsi_req(rq)->result = -ENXIO; + __blk_end_request_all(rq, scsi_req(rq)->result); spin_unlock_irq(q->queue_lock); return -ENXIO; } @@ -67,7 +67,7 @@ static int ide_pm_execute_rq(struct request *rq) wait_for_completion_io(&wait); - return rq->errors ? -EIO : 0; + return scsi_req(rq)->result ? -EIO : 0; } int generic_ide_resume(struct device *dev) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index d8a552b47718..a0651f948b76 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -366,7 +366,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc) err = pc->error; } } - rq->errors = err; + scsi_req(rq)->result = err; return uptodate; } @@ -879,7 +879,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) tape->valid = 0; ret = size; - if (rq->errors == IDE_DRV_ERROR_GENERAL) + if (scsi_req(rq)->result == IDE_DRV_ERROR_GENERAL) ret = -EIO; out_put: blk_put_request(rq); diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 78924c7c9478..d71199d23c9e 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -287,7 +287,7 @@ static void ide_pio_datablock(ide_drive_t *drive, struct ide_cmd *cmd, u8 saved_io_32bit = drive->io_32bit; if (cmd->tf_flags & IDE_TFLAG_FS) - cmd->rq->errors = 0; + scsi_req(cmd->rq)->result = 0; if (cmd->tf_flags & IDE_TFLAG_IO_16BIT) drive->io_32bit = 0; @@ -329,7 +329,7 @@ void ide_finish_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat) u8 set_xfer = !!(cmd->tf_flags & IDE_TFLAG_SET_XFER); ide_complete_cmd(drive, cmd, stat, err); - rq->errors = err; + scsi_req(rq)->result = err; if (err == 0 && set_xfer) { ide_set_xfer_rate(drive, nsect); @@ -453,7 +453,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, cmd->rq = rq; blk_execute_rq(drive->queue, NULL, rq, 0); - error = rq->errors ? -EIO : 0; + error = scsi_req(rq)->result ? 
-EIO : 0; put_req: blk_put_request(rq); return error; diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index 5eeab7047d1e..8a1b94816419 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -477,7 +477,7 @@ static void _set_error_resid(struct osd_request *or, struct request *req, int error) { or->async_error = error; - or->req_errors = req->errors ? : error; + or->req_errors = scsi_req(req)->result ? : error; or->sense_len = scsi_req(req)->sense_len; if (or->sense_len) memcpy(or->sense, scsi_req(req)->sense, or->sense_len); @@ -492,7 +492,7 @@ int osd_execute_request(struct osd_request *or) int error; blk_execute_rq(or->request->q, NULL, or->request, 0); - error = or->request->errors ? -EIO : 0; + error = scsi_req(or->request)->result ? -EIO : 0; _set_error_resid(or, or->request, error); return error; diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c index 41bc1d64bf86..67cbed92f07d 100644 --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -327,7 +327,7 @@ static void osst_end_async(struct request *req, int update) struct osst_tape *STp = SRpnt->stp; struct rq_map_data *mdata = &SRpnt->stp->buffer->map_data; - STp->buffer->cmdstat.midlevel_result = SRpnt->result = req->errors; + STp->buffer->cmdstat.midlevel_result = SRpnt->result = rq->result; #if DEBUG STp->write_pending = 0; #endif diff --git a/drivers/scsi/qla2xxx/qla_bsg.c b/drivers/scsi/qla2xxx/qla_bsg.c index 84c9098cc089..b6e40fd4c3c1 100644 --- a/drivers/scsi/qla2xxx/qla_bsg.c +++ b/drivers/scsi/qla2xxx/qla_bsg.c @@ -2553,13 +2553,13 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_job) ql_log(ql_log_warn, vha, 0x7089, "mbx abort_command " "failed.\n"); - bsg_job->req->errors = + scsi_req(bsg_job->req)->result = bsg_reply->result = -EIO; } else { ql_dbg(ql_dbg_user, vha, 0x708a, "mbx abort_command " "success.\n"); - bsg_job->req->errors = + scsi_req(bsg_job->req)->result = bsg_reply->result = 0; } spin_lock_irqsave(&ha->hardware_lock, flags); @@ -2570,7 +2570,7 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_job) } spin_unlock_irqrestore(&ha->hardware_lock, flags); ql_log(ql_log_info, vha, 0x708b, "SRB not found to abort.\n"); - bsg_job->req->errors = bsg_reply->result = -ENXIO; + scsi_req(bsg_job->req)->result = bsg_reply->result = -ENXIO; return 0; done: diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 7bc4513bf4e4..b9298a499e19 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -229,8 +229,8 @@ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason) * @rq_flags: flags for ->rq_flags * @resid: optional residual length * - * returns the req->errors value which is the scsi_cmnd result - * field. + * Returns the scsi_cmnd result field if a command was executed, or a negative + * Linux error code if we didn't get that far. 
*/ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, int data_direction, void *buffer, unsigned bufflen, @@ -281,7 +281,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, memcpy(sense, rq->sense, SCSI_SENSE_BUFFERSIZE); if (sshdr) scsi_normalize_sense(rq->sense, rq->sense_len, sshdr); - ret = req->errors; + ret = rq->result; out: blk_put_request(req); @@ -797,8 +797,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) /* * __scsi_error_from_host_byte may have reset the host_byte */ - req->errors = cmd->result; - + scsi_req(req)->result = cmd->result; scsi_req(req)->resid_len = scsi_get_resid(cmd); if (scsi_bidi_cmnd(cmd)) { @@ -835,7 +834,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) /* * Recovered errors need reporting, but they're always treated as * success, so fiddle the result code here. For passthrough requests - * we already took a copy of the original into rq->errors which + * we already took a copy of the original into sreq->result which * is what gets returned to the user */ if (sense_valid && (sshdr.sense_key == RECOVERED_ERROR)) { @@ -1281,7 +1280,7 @@ scsi_prep_return(struct request_queue *q, struct request *req, int ret) switch (ret) { case BLKPREP_KILL: case BLKPREP_INVALID: - req->errors = DID_NO_CONNECT << 16; + scsi_req(req)->result = DID_NO_CONNECT << 16; /* release the command and kill it */ if (req->special) { struct scsi_cmnd *cmd = req->special; @@ -1905,7 +1904,7 @@ static int scsi_mq_prep_fn(struct request *req) static void scsi_mq_done(struct scsi_cmnd *cmd) { trace_scsi_dispatch_cmd_done(cmd); - blk_mq_complete_request(cmd->request, cmd->request->errors); + blk_mq_complete_request(cmd->request, 0); } static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c index cdbb293aca08..a2b279737a4b 100644 --- a/drivers/scsi/scsi_transport_sas.c +++ b/drivers/scsi/scsi_transport_sas.c @@ -184,7 +184,7 @@ static void sas_smp_request(struct request_queue *q, struct Scsi_Host *shost, blk_rq_bytes(req->next_rq); handler = to_sas_internal(shost->transportt)->f->smp_handler; ret = handler(shost, rphy, req); - req->errors = ret; + scsi_req(req)->result = ret; blk_end_request_all(req, ret); diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index b61cc3c512d3..90ee9d926deb 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1298,7 +1298,7 @@ sg_rq_end_io(struct request *rq, int uptodate) pr_info("%s: device detaching\n", __func__); sense = req->sense; - result = rq->errors; + result = req->result; resid = req->resid_len; SCSI_LOG_TIMEOUT(4, sg_printk(KERN_INFO, sdp, diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 5408643431bb..1ea34d6f5437 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -480,7 +480,7 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req) atomic64_add(ktime_to_ns(now), &STp->stats->tot_write_time); atomic64_add(ktime_to_ns(now), &STp->stats->tot_io_time); atomic64_inc(&STp->stats->write_cnt); - if (req->errors) { + if (scsi_req(req)->result) { atomic64_add(atomic_read(&STp->stats->last_write_size) - STp->buffer->cmdstat.residual, &STp->stats->write_byte_cnt); @@ -494,7 +494,7 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req) atomic64_add(ktime_to_ns(now), &STp->stats->tot_read_time); atomic64_add(ktime_to_ns(now), &STp->stats->tot_io_time); atomic64_inc(&STp->stats->read_cnt); - if (req->errors) { + if (scsi_req(req)->result) { 
atomic64_add(atomic_read(&STp->stats->last_read_size) - STp->buffer->cmdstat.residual, &STp->stats->read_byte_cnt); @@ -518,7 +518,7 @@ static void st_scsi_execute_end(struct request *req, int uptodate) struct scsi_tape *STp = SRpnt->stp; struct bio *tmp; - STp->buffer->cmdstat.midlevel_result = SRpnt->result = req->errors; + STp->buffer->cmdstat.midlevel_result = SRpnt->result = rq->result; STp->buffer->cmdstat.residual = rq->resid_len; st_do_stats(STp, req); diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index c7fa372c527a..a93d94e68ab5 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -1050,7 +1050,7 @@ static void pscsi_req_done(struct request *req, int uptodate) struct se_cmd *cmd = req->end_io_data; struct pscsi_plugin_task *pt = cmd->priv; - pt->pscsi_result = req->errors; + pt->pscsi_result = scsi_req(req)->result; pt->pscsi_resid = scsi_req(req)->resid_len; cmd->scsi_status = status_byte(pt->pscsi_result) << 1; diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 9f618b77ffee..fb5213afc854 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -243,9 +243,9 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, req->cmd_len = COMMAND_SIZE(INQUIRY); blk_execute_rq(rq->q, NULL, rq, 1); - if (rq->errors) { + if (req->result) { pr_err("pNFS: INQUIRY 0x83 failed with: %x\n", - rq->errors); + req->result); error = -EIO; goto out_put_request; } diff --git a/include/linux/ide.h b/include/linux/ide.h index 2f51c1724b5a..6980ca322074 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -88,7 +88,7 @@ static inline bool ata_pm_request(struct request *rq) ide_req(rq)->type == ATA_PRIV_PM_RESUME); } -/* Error codes returned in rq->errors to the higher part of the driver. */ +/* Error codes returned in result to the higher part of the driver. */ enum { IDE_DRV_ERROR_GENERAL = 101, IDE_DRV_ERROR_FILEMARK = 102, diff --git a/include/scsi/scsi_request.h b/include/scsi/scsi_request.h index 7c583a0f363a..f0c76f9dc285 100644 --- a/include/scsi/scsi_request.h +++ b/include/scsi/scsi_request.h @@ -9,6 +9,7 @@ struct scsi_request { unsigned char __cmd[BLK_MAX_CDB]; unsigned char *cmd; unsigned short cmd_len; + int result; unsigned int sense_len; unsigned int resid_len; /* residual count */ int retries; -- cgit v1.2.3 From 08e0029aa2a4acdd365613ce88a1184e5351a8a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Apr 2017 16:03:09 +0200 Subject: blk-mq: remove the error argument to blk_mq_complete_request Now that all drivers that call blk_mq_complete_request have a ->complete callback, we can remove the direct call to blk_mq_end_request, as well as the error argument to blk_mq_complete_request.
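As a sketch of the resulting pattern (hypothetical driver, modelled on the conversions below such as nbd's cmd->status): the driver records the outcome in its own per-request data before completing, and its ->complete handler consumes it.

#include <linux/blk-mq.h>

/* Hypothetical per-request driver data; the name is illustrative. */
struct example_cmd {
	int status;
};

/* Completion path: stash the outcome, then kick the generic machinery. */
static void example_finish(struct request *rq, int status)
{
	struct example_cmd *cmd = blk_mq_rq_to_pdu(rq);

	cmd->status = status;
	blk_mq_complete_request(rq);	/* no error argument any more */
}

/* ->complete callback: ends the request with the driver-private status. */
static void example_complete(struct request *rq)
{
	struct example_cmd *cmd = blk_mq_rq_to_pdu(rq);

	blk_mq_end_request(rq, cmd->status);
}
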
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq.c | 15 +++------------ drivers/block/loop.c | 4 ++-- drivers/block/mtip32xx/mtip32xx.c | 4 ++-- drivers/block/nbd.c | 4 ++-- drivers/block/null_blk.c | 2 +- drivers/block/virtio_blk.c | 2 +- drivers/block/xen-blkfront.c | 2 +- drivers/md/dm-rq.c | 2 +- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/nvme.h | 2 +- drivers/scsi/scsi_lib.c | 2 +- include/linux/blk-mq.h | 2 +- 12 files changed, 17 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index c496692ecc5b..3a2d179d49d6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -442,17 +442,10 @@ static void blk_mq_stat_add(struct request *rq) static void __blk_mq_complete_request(struct request *rq) { - struct request_queue *q = rq->q; - if (rq->internal_tag != -1) blk_mq_sched_completed_request(rq); - blk_mq_stat_add(rq); - - if (!q->softirq_done_fn) - blk_mq_end_request(rq, rq->errors); - else - blk_mq_ipi_complete_request(rq); + blk_mq_ipi_complete_request(rq); } /** @@ -463,16 +456,14 @@ static void __blk_mq_complete_request(struct request *rq) * Ends all I/O on a request. It does not handle partial completions. * The actual completion happens out-of-order, through a IPI handler. **/ -void blk_mq_complete_request(struct request *rq, int error) +void blk_mq_complete_request(struct request *rq) { struct request_queue *q = rq->q; if (unlikely(blk_should_fake_timeout(q))) return; - if (!blk_mark_rq_complete(rq)) { - rq->errors = error; + if (!blk_mark_rq_complete(rq)) __blk_mq_complete_request(rq); - } } EXPORT_SYMBOL(blk_mq_complete_request); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 86351b3f7350..994403efee19 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -465,7 +465,7 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); cmd->ret = ret; - blk_mq_complete_request(cmd->rq, 0); + blk_mq_complete_request(cmd->rq); } static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, @@ -1685,7 +1685,7 @@ static void loop_handle_cmd(struct loop_cmd *cmd) /* complete non-aio request */ if (!cmd->use_aio || ret) { cmd->ret = ret ? 
-EIO : 0; - blk_mq_complete_request(cmd->rq, 0); + blk_mq_complete_request(cmd->rq); } } diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 7406de29db58..66a6bd83faae 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -242,7 +242,7 @@ static void mtip_async_complete(struct mtip_port *port, rq = mtip_rq_from_tag(dd, tag); cmd->status = status; - blk_mq_complete_request(rq, 0); + blk_mq_complete_request(rq); } /* @@ -4109,7 +4109,7 @@ static void mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv) if (likely(!reserv)) { cmd->status = -ENODEV; - blk_mq_complete_request(rq, 0); + blk_mq_complete_request(rq); } else if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &dd->port->flags)) { cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 09a74a66beb1..d387bef07fcc 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -635,7 +635,7 @@ static void recv_work(struct work_struct *work) break; } - blk_mq_complete_request(blk_mq_rq_from_pdu(cmd), 0); + blk_mq_complete_request(blk_mq_rq_from_pdu(cmd)); } atomic_dec(&config->recv_threads); wake_up(&config->recv_wq); @@ -651,7 +651,7 @@ static void nbd_clear_req(struct request *req, void *data, bool reserved) return; cmd = blk_mq_rq_to_pdu(req); cmd->status = -EIO; - blk_mq_complete_request(req, 0); + blk_mq_complete_request(req); } static void nbd_clear_que(struct nbd_device *nbd) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 0ca4aa34edb9..d946e1eeac8e 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -281,7 +281,7 @@ static inline void null_handle_cmd(struct nullb_cmd *cmd) case NULL_IRQ_SOFTIRQ: switch (queue_mode) { case NULL_Q_MQ: - blk_mq_complete_request(cmd->rq, 0); + blk_mq_complete_request(cmd->rq); break; case NULL_Q_RQ: blk_complete_request(cmd->rq); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index dea2a58d6734..f94614257462 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -201,7 +201,7 @@ static void virtblk_done(struct virtqueue *vq) while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) { struct request *req = blk_mq_rq_from_pdu(vbr); - blk_mq_complete_request(req, 0); + blk_mq_complete_request(req); req_done = true; } if (unlikely(virtqueue_is_broken(vq))) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 57866355c060..39459631667c 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1647,7 +1647,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) BUG(); } - blk_mq_complete_request(req, 0); + blk_mq_complete_request(req); } rinfo->ring.rsp_cons = i; diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 1173be21f6f6..bff7e3bdb4ed 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -363,7 +363,7 @@ static void dm_complete_request(struct request *rq, int error) if (!rq->q->mq_ops) blk_complete_request(rq); else - blk_mq_complete_request(rq, 0); + blk_mq_complete_request(rq); } /* diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 805f250315ec..8dc664798293 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -117,7 +117,7 @@ void nvme_cancel_request(struct request *req, void *data, bool reserved) if (blk_queue_dying(req->q)) status |= NVME_SC_DNR; nvme_req(req)->status = status; - blk_mq_complete_request(req, 0); + blk_mq_complete_request(req); } 
EXPORT_SYMBOL_GPL(nvme_cancel_request); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 550037f5efea..c6ef6c30e2f0 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -251,7 +251,7 @@ static inline void nvme_end_request(struct request *req, __le16 status, rq->status = le16_to_cpu(status) >> 1; rq->result = result; - blk_mq_complete_request(req, 0); + blk_mq_complete_request(req); } void nvme_complete_rq(struct request *req); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index b9298a499e19..4a20e6098f7c 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1904,7 +1904,7 @@ static int scsi_mq_prep_fn(struct request *req) static void scsi_mq_done(struct scsi_cmnd *cmd) { trace_scsi_dispatch_cmd_done(cmd); - blk_mq_complete_request(cmd->request, 0); + blk_mq_complete_request(cmd->request); } static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index d75de612845d..0c4dadb85f62 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -228,7 +228,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); void blk_mq_abort_requeue_list(struct request_queue *q); -void blk_mq_complete_request(struct request *rq, int error); +void blk_mq_complete_request(struct request *rq); bool blk_mq_queue_stopped(struct request_queue *q); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); -- cgit v1.2.3 From e26738e037f34aedfe05e412f442833f44f4a6e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Apr 2017 16:03:11 +0200 Subject: block: add a error_count field to struct request This is for the legacy floppy and ataflop drivers that currently abuse ->errors for this purpose. It's stashed away in a union to not grow the struct size; the other fields are either used by modern drivers for different purposes or by the I/O scheduler before queueing the I/O to drivers.
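For illustration only (this sketch is not part of the diff, and the function and parameter names are hypothetical), a legacy-path driver in the floppy/ataflop mold would keep its retry accounting in the new field rather than in ->errors, roughly like so:

	/*
	 * Hedged sketch: on a media error, bump the per-request retry
	 * counter and only fail the request once the budget is spent.
	 * Legacy request path, queue lock assumed held by the caller.
	 */
	static void example_handle_media_error(struct request *req,
					       int max_retries)
	{
		if (++req->error_count < max_retries) {
			blk_requeue_request(req->q, req);	/* retry */
			return;
		}
		__blk_end_request_all(req, -EIO);		/* give up */
	}

The union placement works because the co-located fields are consulted either by the I/O scheduler before dispatch or by blk-mq drivers that track status in their own per-command state and never read error_count.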
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e2064ed3c703..a3dcee624de3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -175,6 +175,7 @@ struct request { struct rb_node rb_node; /* sort/lookup */ struct bio_vec special_vec; void *completion_data; + int error_count; /* for legacy drivers, don't use */ }; /* -- cgit v1.2.3 From caf7df12272118e0274c8353bcfeaf60c7743a47 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Apr 2017 16:03:16 +0200 Subject: block: remove the errors field from struct request MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Acked-by: Roger Pau Monné Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Jens Axboe --- block/blk-core.c | 14 +------------- block/blk-exec.c | 3 +-- block/blk-mq.c | 10 +++------- block/blk-timeout.c | 1 - include/linux/blkdev.h | 2 -- include/trace/events/block.h | 17 +++++++---------- kernel/trace/blktrace.c | 26 ++++++++++++-------------- 7 files changed, 24 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 25aea293ee98..a49b0830aaaf 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1635,7 +1635,6 @@ void blk_init_request_from_bio(struct request *req, struct bio *bio) if (bio->bi_opf & REQ_RAHEAD) req->cmd_flags |= REQ_FAILFAST_MASK; - req->errors = 0; req->__sector = bio->bi_iter.bi_sector; if (ioprio_valid(bio_prio(bio))) req->ioprio = bio_prio(bio); @@ -2573,22 +2572,11 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) { int total_bytes; - trace_block_rq_complete(req->q, req, nr_bytes); + trace_block_rq_complete(req, error, nr_bytes); if (!req->bio) return false; - /* - * For fs requests, rq is just carrier of independent bio's - * and each partial completion should be handled separately. - * Reset per-request error on each partial completion. - * - * TODO: tj: This is too subtle. It would be better to let - * low level drivers do what they see fit. 
- */ - if (!blk_rq_is_passthrough(req)) - req->errors = 0; - if (error && !blk_rq_is_passthrough(req) && !(req->rq_flags & RQF_QUIET)) { char *error_type; diff --git a/block/blk-exec.c b/block/blk-exec.c index afa383248c7c..a9451e3b8587 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -69,8 +69,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, if (unlikely(blk_queue_dying(q))) { rq->rq_flags |= RQF_QUIET; - rq->errors = -ENXIO; - __blk_end_request_all(rq, rq->errors); + __blk_end_request_all(rq, -ENXIO); spin_unlock_irq(q->queue_lock); return; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 3a21948c867a..f2f6c59c5b05 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -213,7 +213,6 @@ void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, #endif rq->special = NULL; /* tag was already set */ - rq->errors = 0; rq->extra_len = 0; INIT_LIST_HEAD(&rq->timeout_list); @@ -624,8 +623,7 @@ void blk_mq_abort_requeue_list(struct request_queue *q) rq = list_first_entry(&rq_list, struct request, queuelist); list_del_init(&rq->queuelist); - rq->errors = -EIO; - blk_mq_end_request(rq, rq->errors); + blk_mq_end_request(rq, -EIO); } } EXPORT_SYMBOL(blk_mq_abort_requeue_list); @@ -1032,8 +1030,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) pr_err("blk-mq: bad return on queue: %d\n", ret); case BLK_MQ_RQ_QUEUE_ERROR: errors++; - rq->errors = -EIO; - blk_mq_end_request(rq, rq->errors); + blk_mq_end_request(rq, -EIO); break; } @@ -1484,8 +1481,7 @@ static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, if (ret == BLK_MQ_RQ_QUEUE_ERROR) { *cookie = BLK_QC_T_NONE; - rq->errors = -EIO; - blk_mq_end_request(rq, rq->errors); + blk_mq_end_request(rq, -EIO); return; } diff --git a/block/blk-timeout.c b/block/blk-timeout.c index a30441a200c0..cbff183f3d9f 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -89,7 +89,6 @@ static void blk_rq_timed_out(struct request *req) ret = q->rq_timed_out_fn(req); switch (ret) { case BLK_EH_HANDLED: - /* Can we use req->errors here? */ __blk_complete_request(req); break; case BLK_EH_RESET_TIMER: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a3dcee624de3..6c4ab0d4a160 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -220,8 +220,6 @@ struct request { void *special; /* opaque pointer available for LLD use */ - int errors; - unsigned int extra_len; /* length of alignment and padding */ unsigned long deadline; diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 99ed69fad041..d0dbe60d8a6d 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -80,7 +80,6 @@ TRACE_EVENT(block_rq_requeue, __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) - __field( int, errors ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), @@ -89,7 +88,6 @@ TRACE_EVENT(block_rq_requeue, __entry->dev = rq->rq_disk ? 
disk_devt(rq->rq_disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); - __entry->errors = rq->errors; blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); __get_str(cmd)[0] = '\0'; @@ -99,13 +97,13 @@ TRACE_EVENT(block_rq_requeue, MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->errors) + __entry->nr_sector, 0) ); /** * block_rq_complete - block IO operation completed by device driver - * @q: queue containing the block operation request * @rq: block operations request + * @error: status code * @nr_bytes: number of completed bytes * * The block_rq_complete tracepoint event indicates that some portion @@ -116,16 +114,15 @@ TRACE_EVENT(block_rq_requeue, */ TRACE_EVENT(block_rq_complete, - TP_PROTO(struct request_queue *q, struct request *rq, - unsigned int nr_bytes), + TP_PROTO(struct request *rq, int error, unsigned int nr_bytes), - TP_ARGS(q, rq, nr_bytes), + TP_ARGS(rq, error, nr_bytes), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) - __field( int, errors ) + __field( int, error ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), @@ -134,7 +131,7 @@ TRACE_EVENT(block_rq_complete, __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; __entry->sector = blk_rq_pos(rq); __entry->nr_sector = nr_bytes >> 9; - __entry->errors = rq->errors; + __entry->error = error; blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes); __get_str(cmd)[0] = '\0'; @@ -144,7 +141,7 @@ TRACE_EVENT(block_rq_complete, MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->errors) + __entry->nr_sector, __entry->error) ); DECLARE_EVENT_CLASS(block_rq, diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 9f3624dadb09..bd8ae8d5ae9c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -690,8 +690,8 @@ void blk_trace_shutdown(struct request_queue *q) /** * blk_add_trace_rq - Add a trace for a request oriented action - * @q: queue the io is for * @rq: the source request + * @error: return status to log * @nr_bytes: number of completed bytes * @what: the action * @@ -699,10 +699,10 @@ void blk_trace_shutdown(struct request_queue *q) * Records an action against a request. Will log the bio offset + size. 
* **/ -static void blk_add_trace_rq(struct request_queue *q, struct request *rq, +static void blk_add_trace_rq(struct request *rq, int error, unsigned int nr_bytes, u32 what) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt = rq->q->blk_trace; if (likely(!bt)) return; @@ -713,34 +713,32 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, what |= BLK_TC_ACT(BLK_TC_FS); __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), - rq->cmd_flags, what, rq->errors, 0, NULL); + rq->cmd_flags, what, error, 0, NULL); } static void blk_add_trace_rq_insert(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT); } static void blk_add_trace_rq_issue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE); } static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE); } -static void blk_add_trace_rq_complete(void *ignore, - struct request_queue *q, - struct request *rq, - unsigned int nr_bytes) +static void blk_add_trace_rq_complete(void *ignore, struct request *rq, + int error, unsigned int nr_bytes) { - blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE); + blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE); } /** @@ -935,7 +933,7 @@ static void blk_add_trace_rq_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), - rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors, + rq_data_dir(rq), 0, BLK_TA_REMAP, 0, sizeof(r), &r); } @@ -960,7 +958,7 @@ void blk_add_driver_data(struct request_queue *q, return; __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, - BLK_TA_DRV_DATA, rq->errors, len, data); + BLK_TA_DRV_DATA, 0, len, data); } EXPORT_SYMBOL_GPL(blk_add_driver_data); -- cgit v1.2.3 From d8f07aee3f2fd959878bf614d4e984900018eb9e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 26 Jan 2017 23:30:05 -0800 Subject: block: kill bdev_dax_capable() This is leftover dead code that has since been replaced by bdev_dax_supported(). 
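For reference, the surviving check is the superblock-based bdev_dax_supported(); a filesystem gating its "-o dax" mount option would call it roughly as in this sketch (hypothetical function name, error handling trimmed, and the 0-on-success convention assumed from the callers of this era):

	/* Sketch: refuse DAX mounts the underlying device cannot back. */
	static int example_validate_dax(struct super_block *sb)
	{
		int err = bdev_dax_supported(sb, sb->s_blocksize);

		if (err)
			return err;	/* not DAX-capable at this blocksize */
		return 0;
	}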
Signed-off-by: Dan Williams --- fs/block_dev.c | 24 ------------------------ include/linux/blkdev.h | 1 - 2 files changed, 25 deletions(-) (limited to 'include/linux') diff --git a/fs/block_dev.c b/fs/block_dev.c index 2eca00ec4370..7f40ea2f0875 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -807,30 +807,6 @@ int bdev_dax_supported(struct super_block *sb, int blocksize) } EXPORT_SYMBOL_GPL(bdev_dax_supported); -/** - * bdev_dax_capable() - Return if the raw device is capable for dax - * @bdev: The device for raw block device access - */ -bool bdev_dax_capable(struct block_device *bdev) -{ - struct blk_dax_ctl dax = { - .size = PAGE_SIZE, - }; - - if (!IS_ENABLED(CONFIG_FS_DAX)) - return false; - - dax.sector = 0; - if (bdev_direct_access(bdev, &dax) < 0) - return false; - - dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512); - if (bdev_direct_access(bdev, &dax) < 0) - return false; - - return true; -} - /* * pseudo-fs */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5a7da607ca04..f72708399b83 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1958,7 +1958,6 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *); extern int bdev_dax_supported(struct super_block *, int); -extern bool bdev_dax_capable(struct block_device *); #else /* CONFIG_BLOCK */ struct block_device; -- cgit v1.2.3 From b0686260fecaa924d8eff2ace94bee70506bc308 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 26 Jan 2017 20:37:35 -0800 Subject: dax: introduce dax_direct_access() Replace bdev_direct_access() with dax_direct_access() that uses dax_device and dax_operations instead of a block_device and block_device_operations for dax. Once all consumers of the old api have been converted bdev_direct_access() will be deleted. Given that block device partitioning decisions can cause dax page alignment constraints to be violated this also introduces the bdev_dax_pgoff() helper. It handles calculating a logical pgoff relative to the dax_device and also checks for page alignment. Signed-off-by: Dan Williams --- block/Kconfig | 1 + drivers/dax/super.c | 39 +++++++++++++++++++++++++++++++++++++++ fs/block_dev.c | 14 ++++++++++++++ include/linux/blkdev.h | 1 + include/linux/dax.h | 2 ++ 5 files changed, 57 insertions(+) (limited to 'include/linux') diff --git a/block/Kconfig b/block/Kconfig index e9f780f815f5..93da7fc3f254 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -6,6 +6,7 @@ menuconfig BLOCK default y select SBITMAP select SRCU + select DAX help Provide block layer support for the kernel. diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 1a58542ee8fd..465dcd7317d5 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -65,6 +65,45 @@ struct dax_device { const struct dax_operations *ops; }; +/** + * dax_direct_access() - translate a device pgoff to an absolute pfn + * @dax_dev: a dax_device instance representing the logical memory range + * @pgoff: offset in pages from the start of the device to translate + * @nr_pages: number of consecutive pages caller can handle relative to @pfn + * @kaddr: output parameter that returns a virtual address mapping of pfn + * @pfn: output parameter that returns an absolute pfn translation of @pgoff + * + * Return: negative errno if an error occurs, otherwise the number of + * pages accessible at the device relative @pgoff. 
+ */ +long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, + void **kaddr, pfn_t *pfn) +{ + long avail; + + /* + * The device driver is allowed to sleep, in order to make the + * memory directly accessible. + */ + might_sleep(); + + if (!dax_dev) + return -EOPNOTSUPP; + + if (!dax_alive(dax_dev)) + return -ENXIO; + + if (nr_pages < 0) + return nr_pages; + + avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages, + kaddr, pfn); + if (!avail) + return -ERANGE; + return min(avail, nr_pages); +} +EXPORT_SYMBOL_GPL(dax_direct_access); + bool dax_alive(struct dax_device *dax_dev) { lockdep_assert_held(&dax_srcu); diff --git a/fs/block_dev.c b/fs/block_dev.c index 7f40ea2f0875..2f7885712575 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -762,6 +763,19 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) } EXPORT_SYMBOL_GPL(bdev_direct_access); +int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, + pgoff_t *pgoff) +{ + phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512; + + if (pgoff) + *pgoff = PHYS_PFN(phys_off); + if (phys_off % PAGE_SIZE || size % PAGE_SIZE) + return -EINVAL; + return 0; +} +EXPORT_SYMBOL(bdev_dax_pgoff); + /** * bdev_dax_supported() - Check if the device supports dax for filesystem * @sb: The superblock of the device diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f72708399b83..612c497d1461 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1958,6 +1958,7 @@ extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *); extern int bdev_dax_supported(struct super_block *, int); +int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/dax.h b/include/linux/dax.h index 39a0312c45c3..7e62e280c11f 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -27,6 +27,8 @@ void put_dax(struct dax_device *dax_dev); bool dax_alive(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); +long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, + void **kaddr, pfn_t *pfn); /* * We use lowest available bit in exceptional entry for locking, one bit for -- cgit v1.2.3 From f26c5719b2d7b00de69eb83eb1c1c831759fdc9b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 12 Apr 2017 12:35:44 -0700 Subject: dm: add dax_device and dax_operations support Allocate a dax_device to represent the capacity of a device-mapper instance. Provide a ->direct_access() method via the new dax_operations indirection that mirrors the functionality of the current direct_access support via block_device_operations. Once fs/dax.c has been converted to use dax_operations the old dm_blk_direct_access() will be removed. A new helper dm_dax_get_live_target() is introduced to separate some of the dm-specifics from the direct_access implementation. This enabling is only for the top-level dm representation to upper layers. Converting target direct_access implementations is deferred to a separate patch. 
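Taken together with the previous patch, a stacked consumer first translates a block-device sector into a pgoff relative to the dax_device via bdev_dax_pgoff() and then asks for the mapping. A minimal consumer-side sketch (illustrative names, error handling trimmed; not code from this patch):

	/* Sketch: map "size" bytes starting at "sector" for DAX access. */
	static long example_map_sector(struct block_device *bdev,
				       struct dax_device *dax_dev,
				       sector_t sector, size_t size,
				       void **kaddr, pfn_t *pfn)
	{
		pgoff_t pgoff;
		long rc;

		rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
		if (rc)
			return rc;	/* misaligned partition or size */
		return dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
					 kaddr, pfn);
	}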
Cc: Toshi Kani Reviewed-by: Mike Snitzer Signed-off-by: Dan Williams --- drivers/md/Kconfig | 1 + drivers/md/dm-core.h | 1 + drivers/md/dm.c | 84 +++++++++++++++++++++++++++++++++++-------- include/linux/device-mapper.h | 1 + 4 files changed, 73 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index b7767da50c26..1de8372d9459 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -200,6 +200,7 @@ config BLK_DEV_DM_BUILTIN config BLK_DEV_DM tristate "Device mapper support" select BLK_DEV_DM_BUILTIN + select DAX ---help--- Device-mapper is a low level volume manager. It works by allowing people to specify mappings for ranges of logical sectors. Various diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 136fda3ff9e5..538630190f66 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -58,6 +58,7 @@ struct mapped_device { struct target_type *immutable_target_type; struct gendisk *disk; + struct dax_device *dax_dev; char name[16]; void *interface_ptr; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index dfb75979e455..bd56dfe43a99 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -908,31 +909,68 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) } EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); -static long dm_blk_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, pfn_t *pfn, long size) +static struct dm_target *dm_dax_get_live_target(struct mapped_device *md, + sector_t sector, int *srcu_idx) { - struct mapped_device *md = bdev->bd_disk->private_data; struct dm_table *map; struct dm_target *ti; - int srcu_idx; - long len, ret = -EIO; - map = dm_get_live_table(md, &srcu_idx); + map = dm_get_live_table(md, srcu_idx); if (!map) - goto out; + return NULL; ti = dm_table_find_target(map, sector); if (!dm_target_is_valid(ti)) - goto out; + return NULL; - len = max_io_len(sector, ti) << SECTOR_SHIFT; - size = min(len, size); + return ti; +} - if (ti->type->direct_access) - ret = ti->type->direct_access(ti, sector, kaddr, pfn, size); -out: +static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) +{ + struct mapped_device *md = dax_get_private(dax_dev); + sector_t sector = pgoff * PAGE_SECTORS; + struct dm_target *ti; + long len, ret = -EIO; + int srcu_idx; + + ti = dm_dax_get_live_target(md, sector, &srcu_idx); + + if (!ti) + goto out; + if (!ti->type->direct_access) + goto out; + len = max_io_len(sector, ti) / PAGE_SECTORS; + if (len < 1) + goto out; + nr_pages = min(len, nr_pages); + if (ti->type->direct_access) { + ret = ti->type->direct_access(ti, sector, kaddr, pfn, + nr_pages * PAGE_SIZE); + /* + * FIXME: convert ti->type->direct_access to return + * nr_pages directly. + */ + if (ret >= 0) + ret /= PAGE_SIZE; + } + out: dm_put_live_table(md, srcu_idx); - return min(ret, size); + + return ret; +} + +static long dm_blk_direct_access(struct block_device *bdev, sector_t sector, + void **kaddr, pfn_t *pfn, long size) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + struct dax_device *dax_dev = md->dax_dev; + long nr_pages = size / PAGE_SIZE; + + nr_pages = dm_dax_direct_access(dax_dev, sector / PAGE_SECTORS, + nr_pages, kaddr, pfn); + return nr_pages < 0 ? 
nr_pages : nr_pages * PAGE_SIZE; } /* @@ -1437,6 +1475,7 @@ static int next_free_minor(int *minor) } static const struct block_device_operations dm_blk_dops; +static const struct dax_operations dm_dax_ops; static void dm_wq_work(struct work_struct *work); @@ -1483,6 +1522,12 @@ static void cleanup_mapped_device(struct mapped_device *md) if (md->bs) bioset_free(md->bs); + if (md->dax_dev) { + kill_dax(md->dax_dev); + put_dax(md->dax_dev); + md->dax_dev = NULL; + } + if (md->disk) { spin_lock(&_minor_lock); md->disk->private_data = NULL; @@ -1510,6 +1555,7 @@ static void cleanup_mapped_device(struct mapped_device *md) static struct mapped_device *alloc_dev(int minor) { int r, numa_node_id = dm_get_numa_node(); + struct dax_device *dax_dev; struct mapped_device *md; void *old_md; @@ -1574,6 +1620,12 @@ static struct mapped_device *alloc_dev(int minor) md->disk->queue = md->queue; md->disk->private_data = md; sprintf(md->disk->disk_name, "dm-%d", minor); + + dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops); + if (!dax_dev) + goto bad; + md->dax_dev = dax_dev; + add_disk(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); @@ -2781,6 +2833,10 @@ static const struct block_device_operations dm_blk_dops = { .owner = THIS_MODULE }; +static const struct dax_operations dm_dax_ops = { + .direct_access = dm_dax_direct_access, +}; + /* * module hooks */ diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index a7e6903866fd..bcba4d89089c 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -130,6 +130,7 @@ typedef int (*dm_busy_fn) (struct dm_target *ti); */ typedef long (*dm_direct_access_fn) (struct dm_target *ti, sector_t sector, void **kaddr, pfn_t *pfn, long size); +#define PAGE_SECTORS (PAGE_SIZE / 512) void dm_error(const char *message); -- cgit v1.2.3 From 5be661412762bbef45a55eaf1e6847258d69b3a4 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 18 Apr 2017 16:55:36 +0200 Subject: net: add netif_is_ovs_port helper To find out if a netdev is an OVS port. Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b0aa089ce67f..0f3c38ce5417 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4171,6 +4171,11 @@ static inline bool netif_is_ovs_master(const struct net_device *dev) return dev->priv_flags & IFF_OPENVSWITCH; } +static inline bool netif_is_ovs_port(const struct net_device *dev) +{ + return dev->priv_flags & IFF_OVS_DATAPATH; +} + static inline bool netif_is_team_master(const struct net_device *dev) { return dev->priv_flags & IFF_TEAM; -- cgit v1.2.3 From 0206319fdfee7c36b97aa6c0561bab206132f813 Mon Sep 17 00:00:00 2001 From: Stephen Bates Date: Thu, 20 Apr 2017 16:59:11 -0600 Subject: blk-mq: Fix poll_stat for new size-based bucketing. Fixes an issue where the size of the poll_stat array in request_queue does not match the size expected by the new size based bucketing for IO completion polling. 
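For context, the bucketing scheme this fix must stay consistent with interleaves read and write buckets by size class, which is why the debugfs loop below indexes poll_stat[2*bucket] and poll_stat[2*bucket+1]. A paraphrased sketch of the mapping (after commit 720b8ccc4500; not part of this diff):

	/* Sketch of the (direction, size) -> bucket mapping. */
	static int example_poll_stats_bkt(const struct request *rq)
	{
		int ddir = rq_data_dir(rq);	/* 0 = read, 1 = write */
		int bucket = ddir + 2 * (ilog2(blk_rq_bytes(rq)) - 9);

		if (bucket < 0)
			return -1;
		if (bucket >= BLK_MQ_POLL_STATS_BKTS)
			return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
		return bucket;
	}

A 4096-byte read, for example, lands in bucket 0 + 2 * (12 - 9) = 6, which the new debugfs loop reports under "read (4096 Bytes)".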
Fixes: 720b8ccc4500 ("blk-mq: Add a polling specific stats function") Signed-off-by: Stephen Bates Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 15 +++++++++------ block/blk-mq.c | 2 -- include/linux/blkdev.h | 5 ++++- 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index df9b688b877c..3057641d5d15 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -159,14 +159,17 @@ static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) static int queue_poll_stat_show(struct seq_file *m, void *v) { struct request_queue *q = m->private; + int bucket; - seq_puts(m, "read: "); - print_stat(m, &q->poll_stat[READ]); - seq_puts(m, "\n"); + for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS/2; bucket++) { + seq_printf(m, "read (%d Bytes): ", 1 << (9+bucket)); + print_stat(m, &q->poll_stat[2*bucket]); + seq_puts(m, "\n"); - seq_puts(m, "write: "); - print_stat(m, &q->poll_stat[WRITE]); - seq_puts(m, "\n"); + seq_printf(m, "write (%d Bytes): ", 1 << (9+bucket)); + print_stat(m, &q->poll_stat[2*bucket+1]); + seq_puts(m, "\n"); + } return 0; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 47b810638729..b6dc9ba38e35 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -42,8 +42,6 @@ static LIST_HEAD(all_q_list); static void blk_mq_poll_stats_start(struct request_queue *q); static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); -/* Must be consisitent with function below */ -#define BLK_MQ_POLL_STATS_BKTS 16 static int blk_mq_poll_stats_bkt(const struct request *rq) { int ddir, bytes, bucket; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6c4ab0d4a160..6c247861cb66 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -46,6 +46,9 @@ struct blk_stat_callback; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ +/* Must be consisitent with blk_mq_poll_stats_bkt() */ +#define BLK_MQ_POLL_STATS_BKTS 16 + /* * Maximum number of blkcg policies allowed to be registered concurrently. * Defined here to simplify include dependency. @@ -517,7 +520,7 @@ struct request_queue { int poll_nsec; struct blk_stat_callback *poll_cb; - struct blk_rq_stat poll_stat[2]; + struct blk_rq_stat poll_stat[BLK_MQ_POLL_STATS_BKTS]; struct timer_list timeout; struct work_struct timeout_work; -- cgit v1.2.3 From 92a68fa047ca5b8e1991af2d50b23ad9452613cd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 31 Mar 2017 19:21:41 -0400 Subject: ftrace: Move the function commands into the tracing directory As nothing outside the tracing directory uses the function command mechanism, I'm moving the prototypes out of the include/linux/ftrace.h and into the local kernel/trace/trace.h header. I plan on making them hook to the trace_array structure which is local to kernel/trace, and I do not want to expose it to the rest of the kernel. This requires that the command functions must also be local to tracing. But luckily nothing else uses them. 
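The usage pattern itself is unchanged; it is merely restricted to code that can see kernel/trace/trace.h. A hedged sketch with hypothetical names:

	static int example_cmd_func(struct ftrace_hash *hash, char *func,
				    char *cmd, char *params, int enable)
	{
		/* react to "<func>:example[:params]" in set_ftrace_filter */
		return 0;
	}

	static struct ftrace_func_command example_cmd = {
		.name	= "example",
		.func	= example_cmd_func,
	};

	static __init int example_cmd_init(void)
	{
		return register_ftrace_command(&example_cmd);
	}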
Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 19 ------------------- kernel/trace/trace.h | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 3e790ff1c501..774e7a95c201 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -326,14 +326,6 @@ static inline void stack_tracer_disable(void) { } static inline void stack_tracer_enable(void) { } #endif -struct ftrace_func_command { - struct list_head list; - char *name; - int (*func)(struct ftrace_hash *hash, - char *func, char *cmd, - char *params, int enable); -}; - #ifdef CONFIG_DYNAMIC_FTRACE int ftrace_arch_code_modify_prepare(void); @@ -421,9 +413,6 @@ void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); void ftrace_free_filter(struct ftrace_ops *ops); void ftrace_ops_set_global_filter(struct ftrace_ops *ops); -int register_ftrace_command(struct ftrace_func_command *cmd); -int unregister_ftrace_command(struct ftrace_func_command *cmd); - enum { FTRACE_UPDATE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -639,14 +628,6 @@ static inline void ftrace_enable_daemon(void) { } static inline void ftrace_module_init(struct module *mod) { } static inline void ftrace_module_enable(struct module *mod) { } static inline void ftrace_release_mod(struct module *mod) { } -static inline __init int register_ftrace_command(struct ftrace_func_command *cmd) -{ - return -EINVAL; -} -static inline __init int unregister_ftrace_command(char *cmd_name) -{ - return -EINVAL; -} static inline int ftrace_text_reserved(const void *start, const void *end) { return 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2ff6d49fa5ca..a63411c53c5e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -880,6 +880,13 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER +struct ftrace_func_command { + struct list_head list; + char *name; + int (*func)(struct ftrace_hash *hash, + char *func, char *cmd, + char *params, int enable); +}; extern bool ftrace_filter_param __initdata; static inline int ftrace_trace_task(struct trace_array *tr) { @@ -948,10 +955,23 @@ extern void unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); extern void unregister_ftrace_function_probe_all(char *glob); +int register_ftrace_command(struct ftrace_func_command *cmd); +int unregister_ftrace_command(struct ftrace_func_command *cmd); + void ftrace_create_filter_files(struct ftrace_ops *ops, struct dentry *parent); void ftrace_destroy_filter_files(struct ftrace_ops *ops); #else +struct ftrace_func_command; + +static inline __init int register_ftrace_command(struct ftrace_func_command *cmd) +{ + return -EINVAL; +} +static inline __init int unregister_ftrace_command(char *cmd_name) +{ + return -EINVAL; +} /* * The ops parameter passed in is usually undefined. * This must be a macro. -- cgit v1.2.3 From eee8ded131f15e0f5b1897c9c4a7687fabd28822 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 4 Apr 2017 21:31:28 -0400 Subject: ftrace: Have the function probes call their own function Now that the function probes have their own ftrace_ops, there's no reason to continue using the ftrace_func_hash to find which probe to call in the function callback. The ops that is passed in to the function callback is part of the probe_ops to call. 
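Concretely, a probe owner now only supplies its handler; ftrace passes the embedded ops back to the trampoline callback, which recovers the owning probe with container_of() instead of hashing the ip into a global table. A consumer-side sketch (hypothetical names, assuming the 4-argument handler signature used in this series):

	static void example_probe_func(unsigned long ip,
				       unsigned long parent_ip,
				       struct ftrace_probe_ops *ops,
				       void *data)
	{
		/* runs for each hit on a function this probe covers */
	}

	static struct ftrace_probe_ops example_probe_ops = {
		.func = example_probe_func,
	};

	/* e.g. attach to every function matching "vfs_*": */
	static int example_attach(void)
	{
		/* returns the match count, or a negative errno */
		return register_ftrace_function_probe("vfs_*",
						      &example_probe_ops, NULL);
	}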
Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 4 +- kernel/trace/ftrace.c | 225 +++++++++++++++++++++---------------------------- kernel/trace/trace.h | 1 + 3 files changed, 101 insertions(+), 129 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 774e7a95c201..6d2a63e4ea52 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -443,8 +443,8 @@ enum { FTRACE_ITER_FILTER = (1 << 0), FTRACE_ITER_NOTRACE = (1 << 1), FTRACE_ITER_PRINTALL = (1 << 2), - FTRACE_ITER_DO_HASH = (1 << 3), - FTRACE_ITER_HASH = (1 << 4), + FTRACE_ITER_DO_PROBES = (1 << 3), + FTRACE_ITER_PROBE = (1 << 4), FTRACE_ITER_ENABLED = (1 << 5), }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cf6b7263199a..493c7ff7e860 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1096,14 +1096,7 @@ static bool update_all_ops; # error Dynamic ftrace depends on MCOUNT_RECORD #endif -static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; - -struct ftrace_func_probe { - struct hlist_node node; - struct ftrace_probe_ops *ops; - unsigned long ip; - struct list_head free_list; -}; +static LIST_HEAD(ftrace_func_probes); struct ftrace_func_entry { struct hlist_node hlist; @@ -1270,7 +1263,7 @@ static void remove_hash_entry(struct ftrace_hash *hash, struct ftrace_func_entry *entry) { - hlist_del(&entry->hlist); + hlist_del_rcu(&entry->hlist); hash->count--; } @@ -3063,35 +3056,58 @@ struct ftrace_iterator { loff_t func_pos; struct ftrace_page *pg; struct dyn_ftrace *func; - struct ftrace_func_probe *probe; + struct ftrace_probe_ops *probe; + struct ftrace_func_entry *probe_entry; struct trace_parser parser; struct ftrace_hash *hash; struct ftrace_ops *ops; - int hidx; + int pidx; int idx; unsigned flags; }; static void * -t_hash_next(struct seq_file *m, loff_t *pos) +t_probe_next(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; + struct ftrace_hash *hash; + struct list_head *next; struct hlist_node *hnd = NULL; struct hlist_head *hhd; + int size; (*pos)++; iter->pos = *pos; - if (iter->probe) - hnd = &iter->probe->node; - retry: - if (iter->hidx >= FTRACE_FUNC_HASHSIZE) + if (list_empty(&ftrace_func_probes)) return NULL; - hhd = &ftrace_func_hash[iter->hidx]; + if (!iter->probe) { + next = ftrace_func_probes.next; + iter->probe = list_entry(next, struct ftrace_probe_ops, list); + } + + if (iter->probe_entry) + hnd = &iter->probe_entry->hlist; + + hash = iter->probe->ops.func_hash->filter_hash; + size = 1 << hash->size_bits; + + retry: + if (iter->pidx >= size) { + if (iter->probe->list.next == &ftrace_func_probes) + return NULL; + next = iter->probe->list.next; + iter->probe = list_entry(next, struct ftrace_probe_ops, list); + hash = iter->probe->ops.func_hash->filter_hash; + size = 1 << hash->size_bits; + iter->pidx = 0; + } + + hhd = &hash->buckets[iter->pidx]; if (hlist_empty(hhd)) { - iter->hidx++; + iter->pidx++; hnd = NULL; goto retry; } @@ -3101,7 +3117,7 @@ t_hash_next(struct seq_file *m, loff_t *pos) else { hnd = hnd->next; if (!hnd) { - iter->hidx++; + iter->pidx++; goto retry; } } @@ -3109,26 +3125,28 @@ t_hash_next(struct seq_file *m, loff_t *pos) if (WARN_ON_ONCE(!hnd)) return NULL; - iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); + iter->probe_entry = hlist_entry(hnd, struct ftrace_func_entry, hlist); return iter; } -static void *t_hash_start(struct seq_file *m, loff_t *pos) +static void *t_probe_start(struct seq_file *m, loff_t *pos) { 
struct ftrace_iterator *iter = m->private; void *p = NULL; loff_t l; - if (!(iter->flags & FTRACE_ITER_DO_HASH)) + if (!(iter->flags & FTRACE_ITER_DO_PROBES)) return NULL; if (iter->func_pos > *pos) return NULL; - iter->hidx = 0; + iter->probe = NULL; + iter->probe_entry = NULL; + iter->pidx = 0; for (l = 0; l <= (*pos - iter->func_pos); ) { - p = t_hash_next(m, &l); + p = t_probe_next(m, &l); if (!p) break; } @@ -3136,24 +3154,27 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) return NULL; /* Only set this if we have an item */ - iter->flags |= FTRACE_ITER_HASH; + iter->flags |= FTRACE_ITER_PROBE; return iter; } static int -t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) +t_probe_show(struct seq_file *m, struct ftrace_iterator *iter) { - struct ftrace_func_probe *rec; + struct ftrace_probe_ops *probe; + struct ftrace_func_entry *probe_entry; - rec = iter->probe; - if (WARN_ON_ONCE(!rec)) + probe = iter->probe; + probe_entry = iter->probe_entry; + + if (WARN_ON_ONCE(!probe || !probe_entry)) return -EIO; - if (rec->ops->print) - return rec->ops->print(m, rec->ip, rec->ops, NULL); + if (probe->print) + return probe->print(m, probe_entry->ip, probe, NULL); - seq_printf(m, "%ps:%ps\n", (void *)rec->ip, (void *)rec->ops->func); + seq_printf(m, "%ps:%ps\n", (void *)probe_entry->ip, (void *)probe->func); return 0; } @@ -3205,19 +3226,19 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (unlikely(ftrace_disabled)) return NULL; - if (iter->flags & FTRACE_ITER_HASH) - return t_hash_next(m, pos); + if (iter->flags & FTRACE_ITER_PROBE) + return t_probe_next(m, pos); if (iter->flags & FTRACE_ITER_PRINTALL) { - /* next must increment pos, and t_hash_start does not */ + /* next must increment pos, and t_probe_start does not */ (*pos)++; - return t_hash_start(m, &l); + return t_probe_start(m, &l); } ret = t_func_next(m, pos); if (!ret) - return t_hash_start(m, &l); + return t_probe_start(m, &l); return ret; } @@ -3226,7 +3247,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) { iter->pos = 0; iter->func_pos = 0; - iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); + iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE); } static void *t_start(struct seq_file *m, loff_t *pos) @@ -3255,15 +3276,15 @@ static void *t_start(struct seq_file *m, loff_t *pos) ftrace_hash_empty(iter->hash)) { iter->func_pos = 1; /* Account for the message */ if (*pos > 0) - return t_hash_start(m, pos); + return t_probe_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; /* reset in case of seek/pread */ - iter->flags &= ~FTRACE_ITER_HASH; + iter->flags &= ~FTRACE_ITER_PROBE; return iter; } - if (iter->flags & FTRACE_ITER_HASH) - return t_hash_start(m, pos); + if (iter->flags & FTRACE_ITER_PROBE) + return t_probe_start(m, pos); /* * Unfortunately, we need to restart at ftrace_pages_start @@ -3279,7 +3300,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) } if (!p) - return t_hash_start(m, pos); + return t_probe_start(m, pos); return iter; } @@ -3310,8 +3331,8 @@ static int t_show(struct seq_file *m, void *v) struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec; - if (iter->flags & FTRACE_ITER_HASH) - return t_hash_show(m, iter); + if (iter->flags & FTRACE_ITER_PROBE) + return t_probe_show(m, iter); if (iter->flags & FTRACE_ITER_PRINTALL) { if (iter->flags & FTRACE_ITER_NOTRACE) @@ -3490,7 +3511,7 @@ ftrace_filter_open(struct inode *inode, struct file *file) struct ftrace_ops *ops = inode->i_private; return ftrace_regex_open(ops, - FTRACE_ITER_FILTER | 
FTRACE_ITER_DO_HASH, + FTRACE_ITER_FILTER | FTRACE_ITER_DO_PROBES, inode, file); } @@ -3765,16 +3786,9 @@ core_initcall(ftrace_mod_cmd_init); static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { - struct ftrace_func_probe *entry; - struct hlist_head *hhd; - unsigned long key; - - key = hash_long(ip, FTRACE_HASH_BITS); + struct ftrace_probe_ops *probe_ops; - hhd = &ftrace_func_hash[key]; - - if (hlist_empty(hhd)) - return; + probe_ops = container_of(op, struct ftrace_probe_ops, ops); /* * Disable preemption for these calls to prevent a RCU grace @@ -3782,20 +3796,10 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, * on the hash. rcu_read_lock is too dangerous here. */ preempt_disable_notrace(); - hlist_for_each_entry_rcu_notrace(entry, hhd, node) { - if (entry->ip == ip) - entry->ops->func(ip, parent_ip, entry->ops, NULL); - } + probe_ops->func(ip, parent_ip, probe_ops, NULL); preempt_enable_notrace(); } -static void ftrace_free_entry(struct ftrace_func_probe *entry) -{ - if (entry->ops->free) - entry->ops->free(entry->ops, entry->ip, NULL); - kfree(entry); -} - struct ftrace_func_map { struct ftrace_func_entry entry; void *data; @@ -3942,13 +3946,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_entry *entry; - struct ftrace_func_probe *probe; struct ftrace_hash **orig_hash; struct ftrace_hash *old_hash; struct ftrace_hash *hash; - struct hlist_head hl; - struct hlist_node *n; - unsigned long key; int count = 0; int size; int ret; @@ -3961,6 +3961,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, if (!(ops->ops.flags & FTRACE_OPS_FL_INITIALIZED)) { ops->ops.func = function_trace_probe_call; ftrace_ops_init(&ops->ops); + INIT_LIST_HEAD(&ops->list); } mutex_lock(&ops->ops.func_hash->regex_lock); @@ -3978,31 +3979,21 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, if (ret < 0) goto out; - INIT_HLIST_HEAD(&hl); - size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { if (ftrace_lookup_ip(old_hash, entry->ip)) continue; - probe = kmalloc(sizeof(*probe), GFP_KERNEL); - if (!probe) { - count = -ENOMEM; - goto err_free; - } - probe->ops = ops; - probe->ip = entry->ip; /* * The caller might want to do something special * for each function we find. We call the callback * to give the caller an opportunity to do so. 
*/ - if (ops->init && ops->init(ops, entry->ip, data) < 0) { - kfree(probe); - goto err_free; + if (ops->init) { + ret = ops->init(ops, entry->ip, data); + if (ret < 0) + goto out; } - hlist_add_head(&probe->node, &hl); - count++; } } @@ -4012,17 +4003,15 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, ret = ftrace_hash_move_and_update_ops(&ops->ops, orig_hash, hash, 1); if (ret < 0) - goto err_free_unlock; + goto out_unlock; - hlist_for_each_entry_safe(probe, n, &hl, node) { - hlist_del(&probe->node); - key = hash_long(probe->ip, FTRACE_HASH_BITS); - hlist_add_head_rcu(&probe->node, &ftrace_func_hash[key]); - } + if (list_empty(&ops->list)) + list_add(&ops->list, &ftrace_func_probes); if (!(ops->ops.flags & FTRACE_OPS_FL_ENABLED)) ret = ftrace_startup(&ops->ops, 0); + out_unlock: mutex_unlock(&ftrace_lock); if (!ret) @@ -4032,34 +4021,22 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, free_ftrace_hash(hash); return ret; - - err_free_unlock: - mutex_unlock(&ftrace_lock); - err_free: - hlist_for_each_entry_safe(probe, n, &hl, node) { - hlist_del(&probe->node); - if (ops->free) - ops->free(ops, probe->ip, NULL); - kfree(probe); - } - goto out; } int unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) { struct ftrace_ops_hash old_hash_ops; - struct ftrace_func_entry *rec_entry; - struct ftrace_func_probe *entry; - struct ftrace_func_probe *p; + struct ftrace_func_entry *entry; struct ftrace_glob func_g; struct ftrace_hash **orig_hash; struct ftrace_hash *old_hash; - struct list_head free_list; struct ftrace_hash *hash = NULL; struct hlist_node *tmp; + struct hlist_head hhd; char str[KSYM_SYMBOL_LEN]; int i, ret; + int size; if (!(ops->ops.flags & FTRACE_OPS_FL_INITIALIZED)) return -EINVAL; @@ -4097,18 +4074,12 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) if (!hash) goto out_unlock; - INIT_LIST_HEAD(&free_list); - - for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { - struct hlist_head *hhd = &ftrace_func_hash[i]; + INIT_HLIST_HEAD(&hhd); - hlist_for_each_entry_safe(entry, tmp, hhd, node) { - - /* break up if statements for readability */ - if (entry->ops != ops) - continue; + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry_safe(entry, tmp, &hash->buckets[i], hlist) { - /* do this last, since it is the most expensive */ if (func_g.search) { kallsyms_lookup(entry->ip, NULL, NULL, NULL, str); @@ -4116,26 +4087,24 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) continue; } - rec_entry = ftrace_lookup_ip(hash, entry->ip); - /* It is possible more than one entry had this ip */ - if (rec_entry) - free_hash_entry(hash, rec_entry); - - hlist_del_rcu(&entry->node); - list_add(&entry->free_list, &free_list); + remove_hash_entry(hash, entry); + hlist_add_head(&entry->hlist, &hhd); } } /* Nothing found? 
*/ - if (list_empty(&free_list)) { + if (hlist_empty(&hhd)) { ret = -EINVAL; goto out_unlock; } mutex_lock(&ftrace_lock); - if (ftrace_hash_empty(hash)) + if (ftrace_hash_empty(hash)) { ftrace_shutdown(&ops->ops, 0); + list_del_init(&ops->list); + } + ret = ftrace_hash_move_and_update_ops(&ops->ops, orig_hash, hash, 1); @@ -4146,9 +4115,11 @@ unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) &old_hash_ops); synchronize_sched(); - list_for_each_entry_safe(entry, p, &free_list, free_list) { - list_del(&entry->free_list); - ftrace_free_entry(entry); + hlist_for_each_entry_safe(entry, tmp, &hhd, hlist) { + hlist_del(&entry->hlist); + if (ops->free) + ops->free(ops, entry->ip, NULL); + kfree(entry); } mutex_unlock(&ftrace_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e16c67c49de4..d457addcc224 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -933,6 +933,7 @@ static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { struct ftrace_probe_ops { struct ftrace_ops ops; + struct list_head list; void (*func)(unsigned long ip, unsigned long parent_ip, struct ftrace_probe_ops *ops, -- cgit v1.2.3 From ed067d4a859ff696373324c5061392e013a7561a Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 11 Apr 2017 20:08:34 +0200 Subject: linux/kernel.h: Add ALIGN_DOWN macro Few parts of kernel define their own macro for aligning down so provide a common define for this, with the same usage and assumptions as existing ALIGN. Convert also three existing implementations to this one. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Herbert Xu --- arch/metag/kernel/stacktrace.c | 2 -- drivers/gpu/drm/udl/udl_fb.c | 2 +- include/linux/kernel.h | 1 + include/video/udlfb.h | 2 +- 4 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/metag/kernel/stacktrace.c b/arch/metag/kernel/stacktrace.c index 91ffc4b75c33..09d67b7f51ca 100644 --- a/arch/metag/kernel/stacktrace.c +++ b/arch/metag/kernel/stacktrace.c @@ -31,8 +31,6 @@ static void tbi_boing_init(void) } #endif -#define ALIGN_DOWN(addr, size) ((addr)&(~((size)-1))) - /* * Unwind the current stack frame and store the new register values in the * structure passed as argument. Unwinding is equivalent to a function return, diff --git a/drivers/gpu/drm/udl/udl_fb.c b/drivers/gpu/drm/udl/udl_fb.c index 8e8d60e9a1a2..b1f0d523dff9 100644 --- a/drivers/gpu/drm/udl/udl_fb.c +++ b/drivers/gpu/drm/udl/udl_fb.c @@ -37,7 +37,7 @@ struct udl_fbdev { }; #define DL_ALIGN_UP(x, a) ALIGN(x, a) -#define DL_ALIGN_DOWN(x, a) ALIGN(x-(a-1), a) +#define DL_ALIGN_DOWN(x, a) ALIGN_DOWN(x, a) /** Read the red component (0..255) of a 32 bpp colour. 
*/ #define DLO_RGB_GETRED(col) (uint8_t)((col) & 0xFF) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 4c26dc3a8295..3d9f8420f973 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -47,6 +47,7 @@ /* @a is a power of 2 value */ #define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) +#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) #define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) #define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) #define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) diff --git a/include/video/udlfb.h b/include/video/udlfb.h index f9466fa54ba4..3ea90aea5617 100644 --- a/include/video/udlfb.h +++ b/include/video/udlfb.h @@ -92,6 +92,6 @@ struct dlfb_data { /* remove these once align.h patch is taken into kernel */ #define DL_ALIGN_UP(x, a) ALIGN(x, a) -#define DL_ALIGN_DOWN(x, a) ALIGN(x-(a-1), a) +#define DL_ALIGN_DOWN(x, a) ALIGN_DOWN(x, a) #endif -- cgit v1.2.3 From 6ade8694f471d847500c7cec152cc15171cef5d5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 Apr 2017 17:30:06 -0700 Subject: kvm: Move srcu_struct fields to end of struct kvm Parallelizing SRCU callback handling will increase the size of srcu_struct, which will move the kvm structure's kvm_arch field out of reach of powerpc's current assembly code, which will result in the following sort of build error: arch/powerpc/kvm/book3s_hv_rmhandlers.S:617: Error: operand out of range (0x000000000000b328 is not between 0xffffffffffff8000 and 0x0000000000007fff) This commit moves the srcu_struct fields in the kvm structure to follow the kvm_arch field, which will allow powerpc's assembly code to continue to be able to reach the kvm_arch field. Reported-by: Stephen Rothwell Reported-by: Michael Ellerman Reported-by: kbuild test robot Suggested-by: Paolo Bonzini Signed-off-by: Paul E. McKenney Tested-by: Michael Ellerman Acked-by: Paolo Bonzini [ paulmck: Moved this commit to precede SRCU callback parallelization, and reworded the commit log into future tense, all in the name of bisectability. ] --- include/linux/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c14ad9809da..96c8e29c6442 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -375,8 +375,6 @@ struct kvm { struct mutex slots_lock; struct mm_struct *mm; /* userspace tied to this vm */ struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM]; - struct srcu_struct srcu; - struct srcu_struct irq_srcu; struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; /* @@ -429,6 +427,8 @@ struct kvm { struct list_head devices; struct dentry *debugfs_dentry; struct kvm_stat_data **debugfs_stat_data; + struct srcu_struct srcu; + struct srcu_struct irq_srcu; }; #define kvm_err(fmt, ...) \ -- cgit v1.2.3 From da915ad5cf25b5f5d358dd3670c3378d8ae8c03e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Apr 2017 09:01:53 -0700 Subject: srcu: Parallelize callback handling Peter Zijlstra proposed using SRCU to reduce mmap_sem contention [1,2], however, there are workloads that could result in a high volume of concurrent invocations of call_srcu(), which with current SRCU would result in excessive lock contention on the srcu_struct structure's ->queue_lock, which protects SRCU's callback lists. This commit therefore moves SRCU to per-CPU callback lists, thus greatly reducing contention. 
Because a given SRCU instance no longer has a single centralized callback list, starting grace periods and invoking callbacks are both more complex than in the single-list Classic SRCU implementation. Starting grace periods and handling callbacks are now handled using an srcu_node tree that is in some ways similar to the rcu_node trees used by RCU-bh, RCU-preempt, and RCU-sched (for example, the srcu_node tree shape is controlled by exactly the same Kconfig options and boot parameters that control the shape of the rcu_node tree).

In addition, the old per-CPU srcu_array structure is now named srcu_data and contains an rcu_segcblist structure named ->srcu_cblist for its callbacks (and a spinlock to protect this). The srcu_struct gets an srcu_gp_seq that is used to associate callback segments with the corresponding completion-time grace-period number. These completion-time grace-period numbers are propagated up the srcu_node tree so that the grace-period workqueue handler can determine whether additional grace periods are needed, on the one hand, and where to look for callbacks that are ready to be invoked, on the other.

The srcu_barrier() function must now wait on all instances of the per-CPU ->srcu_cblist. Because each ->srcu_cblist is protected by ->lock, srcu_barrier() can remotely add the needed callbacks. In theory, it could also remotely start grace periods, but in practice doing so is complex and racy. And interestingly enough, it is never necessary for srcu_barrier() to start a grace period because srcu_barrier() only enqueues a callback when a callback is already present, and it turns out that a grace period has to have already been started for this pre-existing callback. Furthermore, it is only the callback that srcu_barrier() needs to wait on, not any particular grace period. Therefore, a new rcu_segcblist_entrain() function enqueues the srcu_barrier() function's callback into the same segment occupied by the last pre-existing callback in the list. The special case where all the pre-existing callbacks are on a different list (because they are in the process of being invoked) is handled by enqueuing srcu_barrier()'s callback into the RCU_DONE_TAIL segment, relying on the done-callbacks check that takes place after all callbacks are invoked.

Note that the readers use the same algorithm as before. Note that there is a separate srcu_idx that tells the readers what counter to increment. This unfortunately cannot be combined with srcu_gp_seq because they need to be incremented at different times.

This commit introduces some ugly #ifdefs in rcutorture. These will go away when I feel good enough about Tree SRCU to ditch Classic SRCU.

Some crude performance comparisons, courtesy of a quickly hacked rcuperf asynchronous-grace-period capability:

	            Callback Queuing Overhead
	            -------------------------
	    # CPUS       Classic SRCU   Tree SRCU
	    ------       ------------   ---------
	         2           0.349 us    0.342 us
	        16          31.66  us    0.4   us
	        41           ---------   0.417 us

The times are the 90th percentiles, a statistic that was chosen to reject the overheads of the occasional srcu_barrier() call needed to avoid OOMing the test machine. The rcuperf test hangs when running Classic SRCU at 41 CPUs, hence the line of dashes. Despite the hacks to both the rcuperf code and the statistics, this is a convincing demonstration of Tree SRCU's performance and scalability advantages.

[1] https://lwn.net/Articles/309030/
[2] https://patchwork.kernel.org/patch/5108281/

Signed-off-by: Paul E. McKenney
[ paulmck: Fix initialization if synchronize_srcu_expedited() called first.
] --- include/linux/rcu_segcblist.h | 42 ++- include/linux/srcutree.h | 80 ++++-- kernel/rcu/rcutorture.c | 20 +- kernel/rcu/srcutree.c | 642 +++++++++++++++++++++++++++++++++--------- kernel/rcu/tree.c | 6 + kernel/rcu/tree.h | 8 + 6 files changed, 647 insertions(+), 151 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 74b1e7243955..ced8f313fd05 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -401,6 +401,37 @@ static inline void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; } +/* + * Entrain the specified callback onto the specified rcu_segcblist at + * the end of the last non-empty segment. If the entire rcu_segcblist + * is empty, make no change, but return false. + * + * This is intended for use by rcu_barrier()-like primitives, -not- + * for normal grace-period use. IMPORTANT: The callback you enqueue + * will wait for all prior callbacks, NOT necessarily for a grace + * period. You have been warned. + */ +static inline bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy) +{ + int i; + + if (rcu_segcblist_n_cbs(rsclp) == 0) + return false; + WRITE_ONCE(rsclp->len, rsclp->len + 1); + if (lazy) + rsclp->len_lazy++; + smp_mb(); /* Ensure counts are updated before callback is entrained. */ + rhp->next = NULL; + for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) + if (rsclp->tails[i] != rsclp->tails[i - 1]) + break; + *rsclp->tails[i] = rhp; + for (; i <= RCU_NEXT_TAIL; i++) + rsclp->tails[i] = &rhp->next; + return true; +} + /* * Extract only the counts from the specified rcu_segcblist structure, * and place them in the specified rcu_cblist structure. This function @@ -537,7 +568,8 @@ static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, int i, j; WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); - WARN_ON_ONCE(rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)); + if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) + return; /* * Find all callbacks whose ->gp_seq numbers indicate that they @@ -582,8 +614,9 @@ static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, * them to complete at the end of the earlier grace period. * * This function operates on an rcu_segcblist structure, and also the - * grace-period sequence number at which new callbacks would become - * ready to invoke. + * grace-period sequence number seq at which new callbacks would become + * ready to invoke. Returns true if there are callbacks that won't be + * ready to invoke until seq, false otherwise. */ static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) @@ -591,7 +624,8 @@ static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, int i; WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); - WARN_ON_ONCE(rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)); + if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) + return false; /* * Find the segment preceding the oldest segment of callbacks diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index f2b3bd6c6bc2..0400e211aa44 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -24,25 +24,75 @@ #ifndef _LINUX_SRCU_TREE_H #define _LINUX_SRCU_TREE_H -struct srcu_array { - unsigned long lock_count[2]; - unsigned long unlock_count[2]; +#include +#include + +struct srcu_node; +struct srcu_struct; + +/* + * Per-CPU structure feeding into leaf srcu_node, similar in function + * to rcu_node. 
+ */ +struct srcu_data { + /* Read-side state. */ + unsigned long srcu_lock_count[2]; /* Locks per CPU. */ + unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */ + + /* Update-side state. */ + spinlock_t lock ____cacheline_internodealigned_in_smp; + struct rcu_segcblist srcu_cblist; /* List of callbacks.*/ + unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */ + bool srcu_cblist_invoking; /* Invoking these CBs? */ + struct delayed_work work; /* Context for CB invoking. */ + struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */ + struct srcu_node *mynode; /* Leaf srcu_node. */ + int cpu; + struct srcu_struct *sp; }; +/* + * Node in SRCU combining tree, similar in function to rcu_data. + */ +struct srcu_node { + spinlock_t lock; + unsigned long srcu_have_cbs[4]; /* GP seq for children */ + /* having CBs, but only */ + /* is > ->srcu_gq_seq. */ + struct srcu_node *srcu_parent; /* Next up in tree. */ + int grplo; /* Least CPU for node. */ + int grphi; /* Biggest CPU for node. */ +}; + +/* + * Per-SRCU-domain structure, similar in function to rcu_state. + */ struct srcu_struct { - unsigned long completed; - unsigned long srcu_gp_seq; - atomic_t srcu_exp_cnt; - struct srcu_array __percpu *per_cpu_ref; - spinlock_t queue_lock; /* protect ->srcu_cblist */ - struct rcu_segcblist srcu_cblist; + struct srcu_node node[NUM_RCU_NODES]; /* Combining tree. */ + struct srcu_node *level[RCU_NUM_LVLS + 1]; + /* First node at each level. */ + struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ + spinlock_t gp_lock; /* protect ->srcu_cblist */ + struct mutex srcu_gp_mutex; /* Serialize GP work. */ + unsigned int srcu_idx; /* Current rdr array element. */ + unsigned long srcu_gp_seq; /* Grace-period seq #. */ + unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ + atomic_t srcu_exp_cnt; /* # ongoing expedited GPs. */ + struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ + unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ + struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ + struct completion srcu_barrier_completion; + /* Awaken barrier rq at end. */ + atomic_t srcu_barrier_cpu_cnt; /* # CPUs not yet posting a */ + /* callback for the barrier */ + /* operation. */ struct delayed_work work; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ }; -/* Values for -> state variable. */ +/* Values for state variable (bottom bits of ->srcu_gp_seq). */ #define SRCU_STATE_IDLE 0 #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 @@ -51,11 +101,9 @@ void process_srcu(struct work_struct *work); #define __SRCU_STRUCT_INIT(name) \ { \ - .completed = -300, \ - .per_cpu_ref = &name##_srcu_array, \ - .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ - .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist),\ - .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ + .sda = &name##_srcu_data, \ + .gp_lock = __SPIN_LOCK_UNLOCKED(name.gp_lock), \ + .srcu_gp_seq_needed = 0 - 1, \ __SRCU_DEP_MAP_INIT(name) \ } @@ -79,7 +127,7 @@ void process_srcu(struct work_struct *work); * See include/linux/percpu-defs.h for the rules on per-CPU variables. 
*/ #define __DEFINE_SRCU(name, is_static) \ - static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ + static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\ is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6f344b6748a8..e9d4527cdd43 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -563,17 +563,30 @@ static void srcu_torture_stats(void) int idx; #if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU) +#ifdef CONFIG_TREE_SRCU + idx = srcu_ctlp->srcu_idx & 0x1; +#else /* #ifdef CONFIG_TREE_SRCU */ idx = srcu_ctlp->completed & 0x1; +#endif /* #else #ifdef CONFIG_TREE_SRCU */ pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { unsigned long l0, l1; unsigned long u0, u1; long c0, c1; - struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); +#ifdef CONFIG_TREE_SRCU + struct srcu_data *counts; + counts = per_cpu_ptr(srcu_ctlp->sda, cpu); + u0 = counts->srcu_unlock_count[!idx]; + u1 = counts->srcu_unlock_count[idx]; +#else /* #ifdef CONFIG_TREE_SRCU */ + struct srcu_array *counts; + + counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); u0 = counts->unlock_count[!idx]; u1 = counts->unlock_count[idx]; +#endif /* #else #ifdef CONFIG_TREE_SRCU */ /* * Make sure that a lock is always counted if the corresponding @@ -581,8 +594,13 @@ static void srcu_torture_stats(void) */ smp_rmb(); +#ifdef CONFIG_TREE_SRCU + l0 = counts->srcu_lock_count[!idx]; + l1 = counts->srcu_lock_count[idx]; +#else /* #ifdef CONFIG_TREE_SRCU */ l0 = counts->lock_count[!idx]; l1 = counts->lock_count[idx]; +#endif /* #else #ifdef CONFIG_TREE_SRCU */ c0 = l0 - u0; c1 = l1 - u1; diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index da676b0d016b..12feeca18f46 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -36,19 +36,110 @@ #include #include -#include #include "rcu.h" -static int init_srcu_struct_fields(struct srcu_struct *sp) +static void srcu_invoke_callbacks(struct work_struct *work); +static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); + +/* + * Initialize SRCU combining tree. Note that statically allocated + * srcu_struct structures might already have srcu_read_lock() and + * srcu_read_unlock() running against them. So if the is_static parameter + * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. + */ +static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) { - sp->completed = 0; + int cpu; + int i; + int level = 0; + int levelspread[RCU_NUM_LVLS]; + struct srcu_data *sdp; + struct srcu_node *snp; + struct srcu_node *snp_first; + + /* Work out the overall tree geometry. */ + sp->level[0] = &sp->node[0]; + for (i = 1; i < rcu_num_lvls; i++) + sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1]; + rcu_init_levelspread(levelspread, num_rcu_lvl); + + /* Each pass through this loop initializes one srcu_node structure. */ + rcu_for_each_node_breadth_first(sp, snp) { + spin_lock_init(&snp->lock); + for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) + snp->srcu_have_cbs[i] = 0; + snp->grplo = -1; + snp->grphi = -1; + if (snp == &sp->node[0]) { + /* Root node, special case. */ + snp->srcu_parent = NULL; + continue; + } + + /* Non-root node. 
*/ + if (snp == sp->level[level + 1]) + level++; + snp->srcu_parent = sp->level[level - 1] + + (snp - sp->level[level]) / + levelspread[level - 1]; + } + + /* + * Initialize the per-CPU srcu_data array, which feeds into the + * leaves of the srcu_node tree. + */ + WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != + ARRAY_SIZE(sdp->srcu_unlock_count)); + level = rcu_num_lvls - 1; + snp_first = sp->level[level]; + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(sp->sda, cpu); + spin_lock_init(&sdp->lock); + rcu_segcblist_init(&sdp->srcu_cblist); + sdp->srcu_cblist_invoking = false; + sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; + sdp->mynode = &snp_first[cpu / levelspread[level]]; + for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { + if (snp->grplo < 0) + snp->grplo = cpu; + snp->grphi = cpu; + } + sdp->cpu = cpu; + INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks); + sdp->sp = sp; + if (is_static) + continue; + + /* Dynamically allocated, better be no srcu_read_locks()! */ + for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) { + sdp->srcu_lock_count[i] = 0; + sdp->srcu_unlock_count[i] = 0; + } + } +} + +/* + * Initialize non-compile-time initialized fields, including the + * associated srcu_node and srcu_data structures. The is_static + * parameter is passed through to init_srcu_struct_nodes(), and + * also tells us that ->sda has already been wired up to srcu_data. + */ +static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static) +{ + mutex_init(&sp->srcu_cb_mutex); + mutex_init(&sp->srcu_gp_mutex); + sp->srcu_idx = 0; sp->srcu_gp_seq = 0; atomic_set(&sp->srcu_exp_cnt, 0); - spin_lock_init(&sp->queue_lock); - rcu_segcblist_init(&sp->srcu_cblist); + sp->srcu_barrier_seq = 0; + mutex_init(&sp->srcu_barrier_mutex); + atomic_set(&sp->srcu_barrier_cpu_cnt, 0); INIT_DELAYED_WORK(&sp->work, process_srcu); - sp->per_cpu_ref = alloc_percpu(struct srcu_array); - return sp->per_cpu_ref ? 0 : -ENOMEM; + if (!is_static) + sp->sda = alloc_percpu(struct srcu_data); + init_srcu_struct_nodes(sp, is_static); + smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */ + return sp->sda ? 0 : -ENOMEM; } #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -59,7 +150,8 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name, /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)sp, sizeof(*sp)); lockdep_init_map(&sp->dep_map, name, key, 0); - return init_srcu_struct_fields(sp); + spin_lock_init(&sp->gp_lock); + return init_srcu_struct_fields(sp, false); } EXPORT_SYMBOL_GPL(__init_srcu_struct); @@ -75,15 +167,41 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); */ int init_srcu_struct(struct srcu_struct *sp) { - return init_srcu_struct_fields(sp); + spin_lock_init(&sp->gp_lock); + return init_srcu_struct_fields(sp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* - * Returns approximate total of the readers' ->lock_count[] values for the - * rank of per-CPU counters specified by idx. + * First-use initialization of statically allocated srcu_struct + * structure. Wiring up the combining tree is more than can be + * done with compile-time initialization, so this check is added + * to each update-side SRCU primitive. Use ->gp_lock, which -is- + * compile-time initialized, to resolve races involving multiple + * CPUs trying to garner first-use privileges. 
+ */ +static void check_init_srcu_struct(struct srcu_struct *sp) +{ + unsigned long flags; + + WARN_ON_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INIT); + /* The smp_load_acquire() pairs with the smp_store_release(). */ + if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ + return; /* Already initialized. */ + spin_lock_irqsave(&sp->gp_lock, flags); + if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { + spin_unlock_irqrestore(&sp->gp_lock, flags); + return; + } + init_srcu_struct_fields(sp, true); + spin_unlock_irqrestore(&sp->gp_lock, flags); +} + +/* + * Returns approximate total of the readers' ->srcu_lock_count[] values + * for the rank of per-CPU counters specified by idx. */ static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) { @@ -91,16 +209,16 @@ static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); - sum += READ_ONCE(cpuc->lock_count[idx]); + sum += READ_ONCE(cpuc->srcu_lock_count[idx]); } return sum; } /* - * Returns approximate total of the readers' ->unlock_count[] values for the - * rank of per-CPU counters specified by idx. + * Returns approximate total of the readers' ->srcu_unlock_count[] values + * for the rank of per-CPU counters specified by idx. */ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) { @@ -108,9 +226,9 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); - sum += READ_ONCE(cpuc->unlock_count[idx]); + sum += READ_ONCE(cpuc->srcu_unlock_count[idx]); } return sum; } @@ -145,14 +263,14 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) * the current index but not have incremented the lock counter yet. * * Possible bug: There is no guarantee that there haven't been - * ULONG_MAX increments of ->lock_count[] since the unlocks were + * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were * counted, meaning that this could return true even if there are * still active readers. Since there are no memory barriers around - * srcu_flip(), the CPU is not required to increment ->completed + * srcu_flip(), the CPU is not required to increment ->srcu_idx * before running srcu_readers_unlock_idx(), which means that there * could be an arbitrarily large number of critical sections that * execute after srcu_readers_unlock_idx() but use the old value - * of ->completed. + * of ->srcu_idx. 
*/ return srcu_readers_lock_idx(sp, idx) == unlocks; } @@ -172,12 +290,12 @@ static bool srcu_readers_active(struct srcu_struct *sp) unsigned long sum = 0; for_each_possible_cpu(cpu) { - struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); - sum += READ_ONCE(cpuc->lock_count[0]); - sum += READ_ONCE(cpuc->lock_count[1]); - sum -= READ_ONCE(cpuc->unlock_count[0]); - sum -= READ_ONCE(cpuc->unlock_count[1]); + sum += READ_ONCE(cpuc->srcu_lock_count[0]); + sum += READ_ONCE(cpuc->srcu_lock_count[1]); + sum -= READ_ONCE(cpuc->srcu_unlock_count[0]); + sum -= READ_ONCE(cpuc->srcu_unlock_count[1]); } return sum; } @@ -193,18 +311,21 @@ static bool srcu_readers_active(struct srcu_struct *sp) */ void cleanup_srcu_struct(struct srcu_struct *sp) { + int cpu; + WARN_ON_ONCE(atomic_read(&sp->srcu_exp_cnt)); if (WARN_ON(srcu_readers_active(sp))) return; /* Leakage unless caller handles error. */ - if (WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist))) - return; /* Leakage unless caller handles error. */ flush_delayed_work(&sp->work); - if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE)) { - pr_info("cleanup_srcu_struct: Active srcu_struct %lu CBs %c state: %d\n", rcu_segcblist_n_cbs(&sp->srcu_cblist), ".E"[rcu_segcblist_empty(&sp->srcu_cblist)], rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); + for_each_possible_cpu(cpu) + flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); + if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(srcu_readers_active(sp))) { + pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); return; /* Caller forgot to stop doing call_srcu()? */ } - free_percpu(sp->per_cpu_ref); - sp->per_cpu_ref = NULL; + free_percpu(sp->sda); + sp->sda = NULL; } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); @@ -217,8 +338,8 @@ int __srcu_read_lock(struct srcu_struct *sp) { int idx; - idx = READ_ONCE(sp->completed) & 0x1; - __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); + idx = READ_ONCE(sp->srcu_idx) & 0x1; + __this_cpu_inc(sp->sda->srcu_lock_count[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ return idx; } @@ -233,7 +354,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *sp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]); + this_cpu_inc(sp->sda->srcu_unlock_count[idx]); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -251,19 +372,207 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); */ static void srcu_gp_start(struct srcu_struct *sp) { + struct srcu_data *sdp = this_cpu_ptr(sp->sda); int state; - rcu_segcblist_accelerate(&sp->srcu_cblist, - rcu_seq_snap(&sp->srcu_gp_seq)); + RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock), + "Invoked srcu_gp_start() without ->gp_lock!"); + WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); rcu_seq_start(&sp->srcu_gp_seq); state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); WARN_ON_ONCE(state != SRCU_STATE_SCAN1); } +/* + * Track online CPUs to guide callback workqueue placement. 
+ */ +DEFINE_PER_CPU(bool, srcu_online); + +void srcu_online_cpu(unsigned int cpu) +{ + WRITE_ONCE(per_cpu(srcu_online, cpu), true); +} + +void srcu_offline_cpu(unsigned int cpu) +{ + WRITE_ONCE(per_cpu(srcu_online, cpu), false); +} + +/* + * Place the workqueue handler on the specified CPU if online, otherwise + * just run it whereever. This is useful for placing workqueue handlers + * that are to invoke the specified CPU's callbacks. + */ +static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, + unsigned long delay) +{ + bool ret; + + preempt_disable(); + if (READ_ONCE(per_cpu(srcu_online, cpu))) + ret = queue_delayed_work_on(cpu, wq, dwork, delay); + else + ret = queue_delayed_work(wq, dwork, delay); + preempt_enable(); + return ret; +} + +/* + * Schedule callback invocation for the specified srcu_data structure, + * if possible, on the corresponding CPU. + */ +static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) +{ + srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq, + &sdp->work, delay); +} + +/* + * Schedule callback invocation for all srcu_data structures associated + * with the specified srcu_node structure, if possible, on the corresponding + * CPUs. + */ +static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp) +{ + int cpu; + + for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) + srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), SRCU_INTERVAL); +} + +/* + * Note the end of an SRCU grace period. Initiates callback invocation + * and starts a new grace period if needed. + * + * The ->srcu_cb_mutex acquisition does not protect any data, but + * instead prevents more than one grace period from starting while we + * are initiating callback invocation. This allows the ->srcu_have_cbs[] + * array to have a finite number of elements. + */ +static void srcu_gp_end(struct srcu_struct *sp) +{ + bool cbs; + unsigned long gpseq; + int idx; + int idxnext; + struct srcu_node *snp; + + /* Prevent more than one additional grace period. */ + mutex_lock(&sp->srcu_cb_mutex); + + /* End the current grace period. */ + spin_lock_irq(&sp->gp_lock); + idx = rcu_seq_state(sp->srcu_gp_seq); + WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); + rcu_seq_end(&sp->srcu_gp_seq); + gpseq = rcu_seq_current(&sp->srcu_gp_seq); + spin_unlock_irq(&sp->gp_lock); + mutex_unlock(&sp->srcu_gp_mutex); + /* A new grace period can start at this point. But only one. */ + + /* Initiate callback invocation as needed. */ + idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); + idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); + rcu_for_each_node_breadth_first(sp, snp) { + spin_lock_irq(&snp->lock); + cbs = false; + if (snp >= sp->level[rcu_num_lvls - 1]) + cbs = snp->srcu_have_cbs[idx] == gpseq; + snp->srcu_have_cbs[idx] = gpseq; + rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); + spin_unlock_irq(&snp->lock); + if (cbs) { + smp_mb(); /* GP end before CB invocation. */ + srcu_schedule_cbs_snp(sp, snp); + } + } + + /* Callback initiation done, allow grace periods after next. */ + mutex_unlock(&sp->srcu_cb_mutex); + + /* Start a new grace period if needed. */ + spin_lock_irq(&sp->gp_lock); + gpseq = rcu_seq_current(&sp->srcu_gp_seq); + if (!rcu_seq_state(gpseq) && + ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { + srcu_gp_start(sp); + spin_unlock_irq(&sp->gp_lock); + /* Throttle expedited grace periods: Should be rare! */ + srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) && + rcu_seq_ctr(gpseq) & 0xf + ? 
0 + : SRCU_INTERVAL); + } else { + spin_unlock_irq(&sp->gp_lock); + } +} + +/* + * Funnel-locking scheme to scalably mediate many concurrent grace-period + * requests. The winner has to do the work of actually starting grace + * period s. Losers must either ensure that their desired grace-period + * number is recorded on at least their leaf srcu_node structure, or they + * must take steps to invoke their own callbacks. + */ +static void srcu_funnel_gp_start(struct srcu_struct *sp, + struct srcu_data *sdp, + unsigned long s) +{ + unsigned long flags; + int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); + struct srcu_node *snp = sdp->mynode; + unsigned long snp_seq; + + /* Each pass through the loop does one level of the srcu_node tree. */ + for (; snp != NULL; snp = snp->srcu_parent) { + if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) + return; /* GP already done and CBs recorded. */ + spin_lock_irqsave(&snp->lock, flags); + if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { + snp_seq = snp->srcu_have_cbs[idx]; + spin_unlock_irqrestore(&snp->lock, flags); + if (snp == sdp->mynode && snp_seq != s) { + smp_mb(); /* CBs after GP! */ + srcu_schedule_cbs_sdp(sdp, 0); + } + return; + } + snp->srcu_have_cbs[idx] = s; + spin_unlock_irqrestore(&snp->lock, flags); + } + + /* Top of tree, must ensure the grace period will be started. */ + spin_lock_irqsave(&sp->gp_lock, flags); + if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { + /* + * Record need for grace period s. Pair with load + * acquire setting up for initialization. + */ + smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/ + } + + /* If grace period not already done and none in progress, start it. */ + if (!rcu_seq_done(&sp->srcu_gp_seq, s) && + rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) { + WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); + srcu_gp_start(sp); + queue_delayed_work(system_power_efficient_wq, &sp->work, + atomic_read(&sp->srcu_exp_cnt) + ? 0 + : SRCU_INTERVAL); + } + spin_unlock_irqrestore(&sp->gp_lock, flags); +} + /* * Wait until all readers counted by array index idx complete, but * loop an additional time if there is an expedited grace period pending. - * The caller must ensure that ->completed is not changed while checking. + * The caller must ensure that ->srcu_idx is not changed while checking. */ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) { @@ -277,13 +586,13 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) } /* - * Increment the ->completed counter so that future SRCU readers will - * use the other rank of the ->(un)lock_count[] arrays. This allows + * Increment the ->srcu_idx counter so that future SRCU readers will + * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows * us to wait for pre-existing readers in a starvation-free manner. */ static void srcu_flip(struct srcu_struct *sp) { - WRITE_ONCE(sp->completed, sp->completed + 1); + WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1); /* * Ensure that if the updater misses an __srcu_read_unlock() @@ -296,21 +605,9 @@ static void srcu_flip(struct srcu_struct *sp) } /* - * End an SRCU grace period. 
- */ -static void srcu_gp_end(struct srcu_struct *sp) -{ - rcu_seq_end(&sp->srcu_gp_seq); - - spin_lock_irq(&sp->queue_lock); - rcu_segcblist_advance(&sp->srcu_cblist, - rcu_seq_current(&sp->srcu_gp_seq)); - spin_unlock_irq(&sp->queue_lock); -} - -/* - * Enqueue an SRCU callback on the specified srcu_struct structure, - * initiating grace-period processing if it is not already running. + * Enqueue an SRCU callback on the srcu_data structure associated with + * the current CPU and the specified srcu_struct structure, initiating + * grace-period processing if it is not already running. * * Note that all CPUs must agree that the grace period extended beyond * all pre-existing SRCU read-side critical section. On systems with @@ -335,33 +632,40 @@ static void srcu_gp_end(struct srcu_struct *sp) * srcu_read_lock(), and srcu_read_unlock() that are all passed the same * srcu_struct structure. */ -void call_srcu(struct srcu_struct *sp, struct rcu_head *head, +void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; - - head->next = NULL; - head->func = func; - spin_lock_irqsave(&sp->queue_lock, flags); - smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ - rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); - if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_IDLE) { - srcu_gp_start(sp); - queue_delayed_work(system_power_efficient_wq, &sp->work, 0); + bool needgp = false; + unsigned long s; + struct srcu_data *sdp; + + check_init_srcu_struct(sp); + rhp->func = func; + local_irq_save(flags); + sdp = this_cpu_ptr(sp->sda); + spin_lock(&sdp->lock); + rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + s = rcu_seq_snap(&sp->srcu_gp_seq); + (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); + if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { + sdp->srcu_gp_seq_needed = s; + needgp = true; } - spin_unlock_irqrestore(&sp->queue_lock, flags); + spin_unlock_irqrestore(&sdp->lock, flags); + if (needgp) + srcu_funnel_gp_start(sp, sdp, s); } EXPORT_SYMBOL_GPL(call_srcu); -static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); - /* * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). */ static void __synchronize_srcu(struct srcu_struct *sp) { struct rcu_synchronize rcu; - struct rcu_head *head = &rcu.head; RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || lock_is_held(&rcu_bh_lock_map) || @@ -372,26 +676,12 @@ static void __synchronize_srcu(struct srcu_struct *sp) if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return; might_sleep(); + check_init_srcu_struct(sp); init_completion(&rcu.completion); - - head->next = NULL; - head->func = wakeme_after_rcu; - spin_lock_irq(&sp->queue_lock); - smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ - if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_IDLE) { - /* steal the processing owner */ - rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); - srcu_gp_start(sp); - spin_unlock_irq(&sp->queue_lock); - /* give the processing owner to work_struct */ - srcu_reschedule(sp, 0); - } else { - rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); - spin_unlock_irq(&sp->queue_lock); - } - + init_rcu_head_on_stack(&rcu.head); + call_srcu(sp, &rcu.head, wakeme_after_rcu); wait_for_completion(&rcu.completion); - smp_mb(); /* Caller's later accesses after GP. 
*/ + destroy_rcu_head_on_stack(&rcu.head); } /** @@ -408,6 +698,7 @@ void synchronize_srcu_expedited(struct srcu_struct *sp) { bool do_norm = rcu_gp_is_normal(); + check_init_srcu_struct(sp); if (!do_norm) { atomic_inc(&sp->srcu_exp_cnt); smp_mb__after_atomic(); /* increment before GP. */ @@ -415,7 +706,7 @@ void synchronize_srcu_expedited(struct srcu_struct *sp) __synchronize_srcu(sp); if (!do_norm) { smp_mb__before_atomic(); /* GP before decrement. */ - atomic_dec(&sp->srcu_exp_cnt); + WARN_ON_ONCE(atomic_dec_return(&sp->srcu_exp_cnt) < 0); } } EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); @@ -426,8 +717,8 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * * Wait for the count to drain to zero of both indexes. To avoid the * possible starvation of synchronize_srcu(), it waits for the count of - * the index=((->completed & 1) ^ 1) to drain to zero at first, - * and then flip the completed and wait for the count of the other index. + * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first, + * and then flip the srcu_idx and wait for the count of the other index. * * Can block; must be called from process context. * @@ -468,13 +759,69 @@ void synchronize_srcu(struct srcu_struct *sp) } EXPORT_SYMBOL_GPL(synchronize_srcu); +/* + * Callback function for srcu_barrier() use. + */ +static void srcu_barrier_cb(struct rcu_head *rhp) +{ + struct srcu_data *sdp; + struct srcu_struct *sp; + + sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); + sp = sdp->sp; + if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) + complete(&sp->srcu_barrier_completion); +} + /** * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. * @sp: srcu_struct on which to wait for in-flight callbacks. */ void srcu_barrier(struct srcu_struct *sp) { - synchronize_srcu(sp); + int cpu; + struct srcu_data *sdp; + unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq); + + check_init_srcu_struct(sp); + mutex_lock(&sp->srcu_barrier_mutex); + if (rcu_seq_done(&sp->srcu_barrier_seq, s)) { + smp_mb(); /* Force ordering following return. */ + mutex_unlock(&sp->srcu_barrier_mutex); + return; /* Someone else did our work for us. */ + } + rcu_seq_start(&sp->srcu_barrier_seq); + init_completion(&sp->srcu_barrier_completion); + + /* Initial count prevents reaching zero until all CBs are posted. */ + atomic_set(&sp->srcu_barrier_cpu_cnt, 1); + + /* + * Each pass through this loop enqueues a callback, but only + * on CPUs already having callbacks enqueued. Note that if + * a CPU already has callbacks enqueue, it must have already + * registered the need for a future grace period, so all we + * need do is enqueue a callback that will use the same + * grace period as the last callback already in the queue. + */ + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(sp->sda, cpu); + spin_lock_irq(&sdp->lock); + atomic_inc(&sp->srcu_barrier_cpu_cnt); + sdp->srcu_barrier_head.func = srcu_barrier_cb; + if (!rcu_segcblist_entrain(&sdp->srcu_cblist, + &sdp->srcu_barrier_head, 0)) + atomic_dec(&sp->srcu_barrier_cpu_cnt); + spin_unlock_irq(&sdp->lock); + } + + /* Remove the initial count, at which point reaching zero can happen. 
*/ + if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) + complete(&sp->srcu_barrier_completion); + wait_for_completion(&sp->srcu_barrier_completion); + + rcu_seq_end(&sp->srcu_barrier_seq); + mutex_unlock(&sp->srcu_barrier_mutex); } EXPORT_SYMBOL_GPL(srcu_barrier); @@ -487,21 +834,24 @@ EXPORT_SYMBOL_GPL(srcu_barrier); */ unsigned long srcu_batches_completed(struct srcu_struct *sp) { - return sp->completed; + return sp->srcu_idx; } EXPORT_SYMBOL_GPL(srcu_batches_completed); /* - * Core SRCU state machine. Advance callbacks from ->batch_check0 to - * ->batch_check1 and then to ->batch_done as readers drain. + * Core SRCU state machine. Push state bits of ->srcu_gp_seq + * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has + * completed in that state. */ -static void srcu_advance_batches(struct srcu_struct *sp) +static void srcu_advance_state(struct srcu_struct *sp) { int idx; + mutex_lock(&sp->srcu_gp_mutex); + /* * Because readers might be delayed for an extended period after - * fetching ->completed for their index, at any point in time there + * fetching ->srcu_idx for their index, at any point in time there * might well be readers using both idx=0 and idx=1. We therefore * need to wait for readers to clear from both index values before * invoking a callback. @@ -511,23 +861,29 @@ static void srcu_advance_batches(struct srcu_struct *sp) */ idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - spin_lock_irq(&sp->queue_lock); - if (rcu_segcblist_empty(&sp->srcu_cblist)) { - spin_unlock_irq(&sp->queue_lock); + spin_lock_irq(&sp->gp_lock); + if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { + WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); + spin_unlock_irq(&sp->gp_lock); + mutex_unlock(&sp->srcu_gp_mutex); return; } idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) srcu_gp_start(sp); - spin_unlock_irq(&sp->queue_lock); - if (idx != SRCU_STATE_IDLE) + spin_unlock_irq(&sp->gp_lock); + if (idx != SRCU_STATE_IDLE) { + mutex_unlock(&sp->srcu_gp_mutex); return; /* Someone else started the grace period. */ + } } if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { - idx = 1 ^ (sp->completed & 1); - if (!try_check_zero(sp, idx, 1)) + idx = 1 ^ (sp->srcu_idx & 1); + if (!try_check_zero(sp, idx, 1)) { + mutex_unlock(&sp->srcu_gp_mutex); return; /* readers present, retry later. */ + } srcu_flip(sp); rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2); } @@ -538,10 +894,12 @@ static void srcu_advance_batches(struct srcu_struct *sp) * SRCU read-side critical sections are normally short, * so check at least twice in quick succession after a flip. */ - idx = 1 ^ (sp->completed & 1); - if (!try_check_zero(sp, idx, 2)) - return; /* readers present, retry after later. */ - srcu_gp_end(sp); + idx = 1 ^ (sp->srcu_idx & 1); + if (!try_check_zero(sp, idx, 2)) { + mutex_unlock(&sp->srcu_gp_mutex); + return; /* readers present, retry later. */ + } + srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */ } } @@ -551,28 +909,51 @@ static void srcu_advance_batches(struct srcu_struct *sp) * the workqueue. Note that needed memory barriers have been executed * in this task's context by srcu_readers_active_idx_check(). 
*/ -static void srcu_invoke_callbacks(struct srcu_struct *sp) +static void srcu_invoke_callbacks(struct work_struct *work) { + bool more; struct rcu_cblist ready_cbs; struct rcu_head *rhp; + struct srcu_data *sdp; + struct srcu_struct *sp; - spin_lock_irq(&sp->queue_lock); - if (!rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { - spin_unlock_irq(&sp->queue_lock); - return; - } + sdp = container_of(work, struct srcu_data, work.work); + sp = sdp->sp; rcu_cblist_init(&ready_cbs); - rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); - spin_unlock_irq(&sp->queue_lock); + spin_lock_irq(&sdp->lock); + smp_mb(); /* Old grace periods before callback invocation! */ + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + if (sdp->srcu_cblist_invoking || + !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { + spin_unlock_irq(&sdp->lock); + return; /* Someone else on the job or nothing to do. */ + } + + /* We are on the job! Extract and invoke ready callbacks. */ + sdp->srcu_cblist_invoking = true; + rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); + spin_unlock_irq(&sdp->lock); rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { local_bh_disable(); rhp->func(rhp); local_bh_enable(); } - spin_lock_irq(&sp->queue_lock); - rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); - spin_unlock_irq(&sp->queue_lock); + + /* + * Update counts, accelerate new callbacks, and if needed, + * schedule another round of callback invocation. + */ + spin_lock_irq(&sdp->lock); + rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); + (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); + sdp->srcu_cblist_invoking = false; + more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); + spin_unlock_irq(&sdp->lock); + if (more) + srcu_schedule_cbs_sdp(sdp, 0); } /* @@ -581,19 +962,21 @@ static void srcu_invoke_callbacks(struct srcu_struct *sp) */ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) { - bool pending = true; - int state; + bool pushgp = true; - if (rcu_segcblist_empty(&sp->srcu_cblist)) { - spin_lock_irq(&sp->queue_lock); - state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); - if (rcu_segcblist_empty(&sp->srcu_cblist) && - state == SRCU_STATE_IDLE) - pending = false; - spin_unlock_irq(&sp->queue_lock); + spin_lock_irq(&sp->gp_lock); + if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { + if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { + /* All requests fulfilled, time to go idle. */ + pushgp = false; + } + } else if (!rcu_seq_state(sp->srcu_gp_seq)) { + /* Outstanding request and no GP. Start one. */ + srcu_gp_start(sp); } + spin_unlock_irq(&sp->gp_lock); - if (pending) + if (pushgp) queue_delayed_work(system_power_efficient_wq, &sp->work, delay); } @@ -606,8 +989,7 @@ void process_srcu(struct work_struct *work) sp = container_of(work, struct srcu_struct, work.work); - srcu_advance_batches(sp); - srcu_invoke_callbacks(sp); + srcu_advance_state(sp); srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) ? 
0 : SRCU_INTERVAL);
}
EXPORT_SYMBOL_GPL(process_srcu);

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 346948b51b0b..3c23435d2083 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3776,12 +3776,16 @@ int rcutree_online_cpu(unsigned int cpu)
 {
	sync_sched_exp_online_cleanup(cpu);
	rcutree_affinity_setting(cpu, -1);
+	if (IS_ENABLED(CONFIG_TREE_SRCU))
+		srcu_online_cpu(cpu);
	return 0;
 }

 int rcutree_offline_cpu(unsigned int cpu)
 {
	rcutree_affinity_setting(cpu, cpu);
+	if (IS_ENABLED(CONFIG_TREE_SRCU))
+		srcu_offline_cpu(cpu);
	return 0;
 }

@@ -4157,6 +4161,8 @@ void __init rcu_init(void)
	for_each_online_cpu(cpu) {
		rcutree_prepare_cpu(cpu);
		rcu_cpu_starting(cpu);
+		if (IS_ENABLED(CONFIG_TREE_SRCU))
+			srcu_online_cpu(cpu);
	}
 }

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index a2a45cb629d6..0e598ab08fea 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -541,6 +541,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
 static void rcu_dynticks_task_enter(void);
 static void rcu_dynticks_task_exit(void);

+#ifdef CONFIG_SRCU
+void srcu_online_cpu(unsigned int cpu);
+void srcu_offline_cpu(unsigned int cpu);
+#else /* #ifdef CONFIG_SRCU */
+void srcu_online_cpu(unsigned int cpu) { }
+void srcu_offline_cpu(unsigned int cpu) { }
+#endif /* #else #ifdef CONFIG_SRCU */
+
 #endif /* #ifndef RCU_TREE_NONCORE */

 #ifdef CONFIG_RCU_TRACE
-- cgit v1.2.3

From bcbfdd01dce5556a952fae84ef16fd0f12525e7b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Tue, 11 Apr 2017 15:50:41 -0700
Subject: rcu: Make non-preemptive schedule be Tasks RCU quiescent state

Currently, a call to schedule() acts as a Tasks RCU quiescent state
only if a context switch actually takes place. However, just the call
to schedule() guarantees that the calling task has moved off of
whatever tracing trampoline it might have been on previously. This
commit therefore plumbs schedule()'s "preempt" parameter into
rcu_note_context_switch(), which then records the Tasks RCU quiescent
state, but only if this call to schedule() was -not- due to a
preemption.

To avoid adding overhead to the common-case context-switch path, this
commit hides the rcu_note_context_switch() check under an existing
non-common-case check.

Suggested-by: Steven Rostedt
Signed-off-by: Paul E.
McKenney --- include/linux/rcupdate.h | 11 ++++++++--- include/linux/rcutiny.h | 13 +++++++++---- include/linux/rcutree.h | 5 +++-- kernel/rcu/tree.c | 22 +++++++++++++++++++++- kernel/rcu/update.c | 1 + kernel/sched/core.c | 2 +- 6 files changed, 43 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index e6146d0074f8..f531b29207da 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -363,15 +363,20 @@ static inline void rcu_init_nohz(void) #ifdef CONFIG_TASKS_RCU #define TASKS_RCU(x) x extern struct srcu_struct tasks_rcu_exit_srcu; -#define rcu_note_voluntary_context_switch(t) \ +#define rcu_note_voluntary_context_switch_lite(t) \ do { \ - rcu_all_qs(); \ if (READ_ONCE((t)->rcu_tasks_holdout)) \ WRITE_ONCE((t)->rcu_tasks_holdout, false); \ } while (0) +#define rcu_note_voluntary_context_switch(t) \ + do { \ + rcu_all_qs(); \ + rcu_note_voluntary_context_switch_lite(t); \ + } while (0) #else /* #ifdef CONFIG_TASKS_RCU */ #define TASKS_RCU(x) do { } while (0) -#define rcu_note_voluntary_context_switch(t) rcu_all_qs() +#define rcu_note_voluntary_context_switch_lite(t) do { } while (0) +#define rcu_note_voluntary_context_switch(t) rcu_all_qs() #endif /* #else #ifdef CONFIG_TASKS_RCU */ /** diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 5219be250f00..74d9c3a1feee 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -92,10 +92,11 @@ static inline void kfree_call_rcu(struct rcu_head *head, call_rcu(head, func); } -static inline void rcu_note_context_switch(void) -{ - rcu_sched_qs(); -} +#define rcu_note_context_switch(preempt) \ + do { \ + rcu_sched_qs(); \ + rcu_note_voluntary_context_switch_lite(current); \ + } while (0) /* * Take advantage of the fact that there is only one CPU, which @@ -242,6 +243,10 @@ static inline bool rcu_is_watching(void) #endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ +static inline void rcu_request_urgent_qs_task(struct task_struct *t) +{ +} + static inline void rcu_all_qs(void) { barrier(); /* Avoid RCU read-side critical sections leaking across. */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 63a4e4cf40a5..0bacb6b2af69 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -30,7 +30,7 @@ #ifndef __LINUX_RCUTREE_H #define __LINUX_RCUTREE_H -void rcu_note_context_switch(void); +void rcu_note_context_switch(bool preempt); int rcu_needs_cpu(u64 basem, u64 *nextevt); void rcu_cpu_stall_reset(void); @@ -41,7 +41,7 @@ void rcu_cpu_stall_reset(void); */ static inline void rcu_virt_note_context_switch(int cpu) { - rcu_note_context_switch(); + rcu_note_context_switch(false); } void synchronize_rcu_bh(void); @@ -108,6 +108,7 @@ void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; bool rcu_is_watching(void); +void rcu_request_urgent_qs_task(struct task_struct *t); void rcu_all_qs(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3c23435d2083..891d97109e09 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -458,7 +458,7 @@ static void rcu_momentary_dyntick_idle(void) * and requires special handling for preemptible RCU. * The caller must have disabled interrupts. */ -void rcu_note_context_switch(void) +void rcu_note_context_switch(bool preempt) { barrier(); /* Avoid RCU read-side critical sections leaking down. 
*/
	trace_rcu_utilization(TPS("Start context switch"));
@@ -471,6 +471,8 @@ void rcu_note_context_switch(void)
	if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs)))
		rcu_momentary_dyntick_idle();
	this_cpu_inc(rcu_dynticks.rcu_qs_ctr);
+	if (!preempt)
+		rcu_note_voluntary_context_switch_lite(current);
 out:
	trace_rcu_utilization(TPS("End context switch"));
	barrier(); /* Avoid RCU read-side critical sections leaking up. */
@@ -1149,6 +1151,24 @@ bool notrace rcu_is_watching(void)
 }
 EXPORT_SYMBOL_GPL(rcu_is_watching);

+/*
+ * If a holdout task is actually running, request an urgent quiescent
+ * state from its CPU.  This is unsynchronized, so migrations can cause
+ * the request to go to the wrong CPU.  Which is OK, all that will happen
+ * is that the CPU's next context switch will be a bit slower and next
+ * time around this task will generate another request.
+ */
+void rcu_request_urgent_qs_task(struct task_struct *t)
+{
+	int cpu;
+
+	barrier();
+	cpu = task_cpu(t);
+	if (!task_curr(t))
+		return; /* This task is not running on that CPU. */
+	smp_store_release(per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, cpu), true);
+}
+
 #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)

 /*
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index c5df0d756900..273e869ca21d 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -665,6 +665,7 @@ static void check_holdout_task(struct task_struct *t,
		put_task_struct(t);
		return;
	}
+	rcu_request_urgent_qs_task(t);
	if (!needreport)
		return;
	if (*firstreport) {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3b31fc05a0f1..2adf7b6c04e7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3378,7 +3378,7 @@ static void __sched notrace __schedule(bool preempt)
	hrtick_clear(rq);

	local_irq_disable();
-	rcu_note_context_switch();
+	rcu_note_context_switch(preempt);

	/*
	 * Make sure that signal_pending_state()->signal_pending() below
-- cgit v1.2.3

From f9f38e33389c019ec880f6825119c94867c1fde0 Mon Sep 17 00:00:00 2001
From: Helen Koike
Date: Mon, 10 Apr 2017 12:51:07 -0300
Subject: nvme: improve performance for virtual NVMe devices

This change provides a mechanism to reduce the number of MMIO doorbell
writes for the NVMe driver. When running in a virtualized environment
like QEMU, the cost of an MMIO is quite hefty here. The main idea for
the patch is to provide the device two memory locations:

 1) to store the doorbell values so they can be looked up without the
    doorbell MMIO write
 2) to store an event index

I believe the doorbell value is obvious, the event index not so much.
Similar to the virtio specification, the virtual device can tell the
driver (guest OS) not to write MMIO unless you are writing past this
value.

FYI: doorbell values are written by the nvme driver (guest OS) and the
event index is written by the virtual device (host OS).

The patch implements a new admin command that will communicate where
these two memory locations reside. If the command fails, the nvme
driver will work as before without any optimizations.

Contributions:
  Eric Northup
  Frank Swiderski
  Ted Tso
  Keith Busch

Just to give an idea on the performance boost with the vendor
extension: running fio [1] with a stock NVMe driver, I get about 200K
read IOPs; with my vendor patch, I get about 1000K read IOPs. This was
running with a null device, i.e. the backing device simply returned
success on every read IO request.
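To make the event-index rule concrete, here is a minimal userspace
sketch of the same check. need_event() is a hypothetical standalone
copy of the nvme_dbbuf_need_event() arithmetic from the patch below,
and the sample indexes are made up purely for illustration:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * True iff moving the doorbell from 'old' to 'new_idx' stepped past the
 * device-published 'event_idx'; uint16_t subtraction handles wraparound.
 */
static bool need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
{
	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
}

int main(void)
{
	/* Device asked to be kicked once the driver writes past index 10. */
	printf("%d\n", need_event(10, 12, 9));	/* 1: 9 -> 12 crosses 10, do the MMIO */
	printf("%d\n", need_event(10, 9, 5));	/* 0: 5 -> 9 stays below 10, skip it */
	printf("%d\n", need_event(0xfffe, 2, 0xfffd));	/* 1: crossing detected across wrap */
	return 0;
}

The driver still updates the shadow doorbell in host memory on every
ring; only the expensive MMIO write is skipped when the test returns
false, which is where the IOPs gain quoted above comes from.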
[1] Running on a 4 core machine: fio --time_based --name=benchmark --runtime=30 --filename=/dev/nvme0n1 --nrfiles=1 --ioengine=libaio --iodepth=32 --direct=1 --invalidate=1 --verify=0 --verify_fatal=0 --numjobs=4 --rw=randread --blocksize=4k --randrepeat=false Signed-off-by: Rob Nelson [mlin: port for upstream] Signed-off-by: Ming Lin [koike: updated for upstream] Signed-off-by: Helen Koike Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++- include/linux/nvme.h | 13 +++++ 2 files changed, 154 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index af783a33e93a..a363fecb8d82 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -103,8 +103,22 @@ struct nvme_dev { u32 cmbloc; struct nvme_ctrl ctrl; struct completion ioq_wait; + u32 *dbbuf_dbs; + dma_addr_t dbbuf_dbs_dma_addr; + u32 *dbbuf_eis; + dma_addr_t dbbuf_eis_dma_addr; }; +static inline unsigned int sq_idx(unsigned int qid, u32 stride) +{ + return qid * 2 * stride; +} + +static inline unsigned int cq_idx(unsigned int qid, u32 stride) +{ + return (qid * 2 + 1) * stride; +} + static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) { return container_of(ctrl, struct nvme_dev, ctrl); @@ -133,6 +147,10 @@ struct nvme_queue { u16 qid; u8 cq_phase; u8 cqe_seen; + u32 *dbbuf_sq_db; + u32 *dbbuf_cq_db; + u32 *dbbuf_sq_ei; + u32 *dbbuf_cq_ei; }; /* @@ -171,6 +189,112 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); + BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); +} + +static inline unsigned int nvme_dbbuf_size(u32 stride) +{ + return ((num_possible_cpus() + 1) * 8 * stride); +} + +static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev) +{ + unsigned int mem_size = nvme_dbbuf_size(dev->db_stride); + + if (dev->dbbuf_dbs) + return 0; + + dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size, + &dev->dbbuf_dbs_dma_addr, + GFP_KERNEL); + if (!dev->dbbuf_dbs) + return -ENOMEM; + dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size, + &dev->dbbuf_eis_dma_addr, + GFP_KERNEL); + if (!dev->dbbuf_eis) { + dma_free_coherent(dev->dev, mem_size, + dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr); + dev->dbbuf_dbs = NULL; + return -ENOMEM; + } + + return 0; +} + +static void nvme_dbbuf_dma_free(struct nvme_dev *dev) +{ + unsigned int mem_size = nvme_dbbuf_size(dev->db_stride); + + if (dev->dbbuf_dbs) { + dma_free_coherent(dev->dev, mem_size, + dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr); + dev->dbbuf_dbs = NULL; + } + if (dev->dbbuf_eis) { + dma_free_coherent(dev->dev, mem_size, + dev->dbbuf_eis, dev->dbbuf_eis_dma_addr); + dev->dbbuf_eis = NULL; + } +} + +static void nvme_dbbuf_init(struct nvme_dev *dev, + struct nvme_queue *nvmeq, int qid) +{ + if (!dev->dbbuf_dbs || !qid) + return; + + nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)]; + nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)]; + nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)]; + nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)]; +} + +static void nvme_dbbuf_set(struct nvme_dev *dev) +{ + struct nvme_command c; + + if (!dev->dbbuf_dbs) + return; + + memset(&c, 0, sizeof(c)); + c.dbbuf.opcode = nvme_admin_dbbuf; + c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr); + c.dbbuf.prp2 = 
cpu_to_le64(dev->dbbuf_eis_dma_addr); + + if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) { + dev_warn(dev->dev, "unable to set dbbuf\n"); + /* Free memory and continue on */ + nvme_dbbuf_dma_free(dev); + } +} + +static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old) +{ + return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old); +} + +/* Update dbbuf and return true if an MMIO is required */ +static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, + volatile u32 *dbbuf_ei) +{ + if (dbbuf_db) { + u16 old_value; + + /* + * Ensure that the queue is written before updating + * the doorbell in memory + */ + wmb(); + + old_value = *dbbuf_db; + *dbbuf_db = value; + + if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value)) + return false; + } + + return true; } /* @@ -297,7 +421,9 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq, if (++tail == nvmeq->q_depth) tail = 0; - writel(tail, nvmeq->q_db); + if (nvme_dbbuf_update_and_check_event(tail, nvmeq->dbbuf_sq_db, + nvmeq->dbbuf_sq_ei)) + writel(tail, nvmeq->q_db); nvmeq->sq_tail = tail; } @@ -686,7 +812,9 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) return; if (likely(nvmeq->cq_vector >= 0)) - writel(head, nvmeq->q_db + nvmeq->dev->db_stride); + if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db, + nvmeq->dbbuf_cq_ei)) + writel(head, nvmeq->q_db + nvmeq->dev->db_stride); nvmeq->cq_head = head; nvmeq->cq_phase = phase; @@ -1070,6 +1198,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); + nvme_dbbuf_init(dev, nvmeq, qid); dev->online_queues++; spin_unlock_irq(&nvmeq->q_lock); } @@ -1542,6 +1671,8 @@ static int nvme_dev_add(struct nvme_dev *dev) if (blk_mq_alloc_tag_set(&dev->tagset)) return 0; dev->ctrl.tagset = &dev->tagset; + + nvme_dbbuf_set(dev); } else { blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1); @@ -1728,6 +1859,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) { struct nvme_dev *dev = to_nvme_dev(ctrl); + nvme_dbbuf_dma_free(dev); put_device(dev->dev); if (dev->tagset.tags) blk_mq_free_tag_set(&dev->tagset); @@ -1795,6 +1927,13 @@ static void nvme_reset_work(struct work_struct *work) dev->ctrl.opal_dev = NULL; } + if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) { + result = nvme_dbbuf_dma_alloc(dev); + if (result) + dev_warn(dev->dev, + "unable to allocate dma for dbbuf\n"); + } + result = nvme_setup_io_queues(dev); if (result) goto out; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 9061780b141f..b625bacf37ef 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -245,6 +245,7 @@ enum { NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, NVME_CTRL_VWC_PRESENT = 1 << 0, NVME_CTRL_OACS_SEC_SUPP = 1 << 0, + NVME_CTRL_OACS_DBBUF_SUPP = 1 << 7, }; struct nvme_lbaf { @@ -603,6 +604,7 @@ enum nvme_admin_opcode { nvme_admin_download_fw = 0x11, nvme_admin_ns_attach = 0x15, nvme_admin_keep_alive = 0x18, + nvme_admin_dbbuf = 0x7C, nvme_admin_format_nvm = 0x80, nvme_admin_security_send = 0x81, nvme_admin_security_recv = 0x82, @@ -874,6 +876,16 @@ struct nvmf_property_get_command { __u8 resv4[16]; }; +struct nvme_dbbuf { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __le64 prp2; + __u32 rsvd12[6]; +}; + struct nvme_command { union { struct nvme_common_command common; @@ -893,6 +905,7 @@ struct nvme_command { struct 
nvmf_connect_command connect; struct nvmf_property_set_command prop_set; struct nvmf_property_get_command prop_get; + struct nvme_dbbuf dbbuf; }; }; -- cgit v1.2.3 From 39498faef7c02f9f6de4060ccdc7e8975a6e690b Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 11 Apr 2017 11:32:28 -0700 Subject: nvmet_fc: add target feature flags for upcall isr contexts Two new feature flags were added to control whether upcalls to the transport result in context switches or stay in the calling context. NVMET_FCTGTFEAT_CMD_IN_ISR: By default, if the flag is not set, the transport assumes the lldd is in a non-isr context and in the cpu context it should be for the io queue. As such, the cmd handler is called directly in the calling context. If the flag is set, indicating the upcall is an isr context, the transport mandates a transition to a workqueue. The workqueue assigned to the queue is used for the context. NVMET_FCTGTFEAT_OPDONE_IN_ISR By default, if the flag is not set, the transport assumes the lldd is in a non-isr context and in the cpu context it should be for the io queue. As such, the fcp operation done callback is called directly in the calling context. If the flag is set, indicating the upcall is an isr context, the transport mandates a transition to a workqueue. The workqueue assigned to the queue is used for the context. Updated lpfc for flags Signed-off-by: James Smart Signed-off-by: Sagi Grimberg --- drivers/nvme/target/fc.c | 37 ++++++++++++++++++++++++++++++++++--- drivers/nvme/target/fcloop.c | 5 +++-- drivers/scsi/lpfc/lpfc_nvmet.c | 4 +++- include/linux/nvme-fc-driver.h | 16 ++++++++++++++++ 4 files changed, 56 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 2c0709f0a7d3..a4fad3d0b660 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -86,6 +86,7 @@ struct nvmet_fc_fcp_iod { struct nvmet_req req; struct work_struct work; + struct work_struct done_work; struct nvmet_fc_tgtport *tgtport; struct nvmet_fc_tgt_queue *queue; @@ -213,6 +214,7 @@ static DEFINE_IDA(nvmet_fc_tgtport_cnt); static void nvmet_fc_handle_ls_rqst_work(struct work_struct *work); static void nvmet_fc_handle_fcp_rqst_work(struct work_struct *work); +static void nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work); static void nvmet_fc_tgt_a_put(struct nvmet_fc_tgt_assoc *assoc); static int nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc); static void nvmet_fc_tgt_q_put(struct nvmet_fc_tgt_queue *queue); @@ -414,6 +416,7 @@ nvmet_fc_prep_fcp_iodlist(struct nvmet_fc_tgtport *tgtport, for (i = 0; i < queue->sqsize; fod++, i++) { INIT_WORK(&fod->work, nvmet_fc_handle_fcp_rqst_work); + INIT_WORK(&fod->done_work, nvmet_fc_fcp_rqst_op_done_work); fod->tgtport = tgtport; fod->queue = queue; fod->active = false; @@ -1807,10 +1810,13 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, } } +/* + * actual done handler for FCP operations when completed by the lldd + */ static void -nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq) +nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) { - struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private; + struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq; struct nvmet_fc_tgtport *tgtport = fod->tgtport; unsigned long flags; bool abort; @@ -1905,6 +1911,28 @@ nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq) } } +static void +nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work) +{ + struct nvmet_fc_fcp_iod *fod = + container_of(work, struct 
nvmet_fc_fcp_iod, done_work); + + nvmet_fc_fod_op_done(fod); +} + +static void +nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq) +{ + struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private; + struct nvmet_fc_tgt_queue *queue = fod->queue; + + if (fod->tgtport->ops->target_features & NVMET_FCTGTFEAT_OPDONE_IN_ISR) + /* context switch so completion is not in ISR context */ + queue_work_on(queue->cpu, queue->work_q, &fod->done_work); + else + nvmet_fc_fod_op_done(fod); +} + /* * actual completion handler after execution by the nvmet layer */ @@ -2149,7 +2177,10 @@ nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *target_port, ((queue->qid - 1) % tgtport->ops->max_hw_queues) : 0; memcpy(&fod->cmdiubuf, cmdiubuf, cmdiubuf_len); - queue_work_on(queue->cpu, queue->work_q, &fod->work); + if (tgtport->ops->target_features & NVMET_FCTGTFEAT_CMD_IN_ISR) + queue_work_on(queue->cpu, queue->work_q, &fod->work); + else + nvmet_fc_handle_fcp_rqst(tgtport, fod); return 0; } diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 4e8e6a22bce1..a5cdeca39013 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -575,8 +575,9 @@ struct nvmet_fc_target_template tgttemplate = { .max_dif_sgl_segments = FCLOOP_SGL_SEGS, .dma_boundary = FCLOOP_DMABOUND_4G, /* optional features */ - .target_features = NVMET_FCTGTFEAT_READDATA_RSP | - NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED, + .target_features = NVMET_FCTGTFEAT_CMD_IN_ISR | + NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED | + NVMET_FCTGTFEAT_OPDONE_IN_ISR, /* sizes of additional private data for data structures */ .target_priv_sz = sizeof(struct fcloop_tport), }; diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c index 7ca868f394da..176b4e035bcf 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.c +++ b/drivers/scsi/lpfc/lpfc_nvmet.c @@ -669,7 +669,9 @@ lpfc_nvmet_create_targetport(struct lpfc_hba *phba) lpfc_tgttemplate.max_hw_queues = phba->cfg_nvme_io_channel; lpfc_tgttemplate.max_sgl_segments = phba->cfg_sg_seg_cnt; lpfc_tgttemplate.target_features = NVMET_FCTGTFEAT_READDATA_RSP | - NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED; + NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED | + NVMET_FCTGTFEAT_CMD_IN_ISR | + NVMET_FCTGTFEAT_OPDONE_IN_ISR; #if (IS_ENABLED(CONFIG_NVME_TARGET_FC)) error = nvmet_fc_register_targetport(&pinfo, &lpfc_tgttemplate, diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 16eb264980c2..d70a9c98bc23 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -655,6 +655,22 @@ enum { * on. The transport should pick a cpu to schedule the work * on. */ + NVMET_FCTGTFEAT_CMD_IN_ISR = (1 << 2), + /* Bit 2: When 0, the LLDD is calling the cmd rcv handler + * in a non-isr context, allowing the transport to finish + * op completion in the calling context. When 1, the LLDD + * is calling the cmd rcv handler in an ISR context, + * requiring the transport to transition to a workqueue + * for op completion. + */ + NVMET_FCTGTFEAT_OPDONE_IN_ISR = (1 << 3), + /* Bit 3: When 0, the LLDD is calling the op done handler + * in a non-isr context, allowing the transport to finish + * op completion in the calling context. When 1, the LLDD + * is calling the op done handler in an ISR context, + * requiring the transport to transition to a workqueue + * for op completion. 
+ */ }; -- cgit v1.2.3 From 19b58d9473e8e3d38e7f3602a07c8febfbd07bc1 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 11 Apr 2017 11:32:29 -0700 Subject: nvmet_fc: add req_release to lldd api With the advent of the opdone calls changing context, the lldd can no longer assume that once the op->done call returns for RSP operations that the request struct is no longer being accessed. As such, revise the lldd api for a req_release callback that the transport will call when the job is complete. This will also be used with abort cases. Fixed text in api header for change in io complete semantics. Revised lpfc to support the new req_release api. Signed-off-by: James Smart Signed-off-by: Sagi Grimberg --- drivers/nvme/target/fc.c | 8 +++-- drivers/nvme/target/fcloop.c | 15 ++++++--- drivers/scsi/lpfc/lpfc_nvmet.c | 73 +++++++++++++++++++++++++++++++++++++++--- drivers/scsi/lpfc/lpfc_nvmet.h | 7 ++-- include/linux/nvme-fc-driver.h | 57 ++++++++++++++++++++------------- 5 files changed, 124 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index a4fad3d0b660..d7068f039904 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -482,6 +482,8 @@ static void nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue, struct nvmet_fc_fcp_iod *fod) { + struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq; + struct nvmet_fc_tgtport *tgtport = fod->tgtport; unsigned long flags; spin_lock_irqsave(&queue->qlock, flags); @@ -493,6 +495,8 @@ nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue, * release the reference taken at queue lookup and fod allocation */ nvmet_fc_tgt_q_put(queue); + + tgtport->ops->fcp_req_release(&tgtport->fc_target_port, fcpreq); } static int @@ -849,7 +853,7 @@ nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo, int ret, idx; if (!template->xmt_ls_rsp || !template->fcp_op || - !template->targetport_delete || + !template->fcp_req_release || !template->targetport_delete || !template->max_hw_queues || !template->max_sgl_segments || !template->max_dif_sgl_segments || !template->dma_boundary) { ret = -EINVAL; @@ -2124,7 +2128,7 @@ nvmet_fc_handle_fcp_rqst_work(struct work_struct *work) * If this routine returns error, the lldd should abort the exchange. * * @target_port: pointer to the (registered) target port the FCP CMD IU - * was receive on. + * was received on. * @fcpreq: pointer to a fcpreq request structure to be used to reference * the exchange corresponding to the FCP Exchange. 
* @cmdiubuf: pointer to the buffer containing the FCP CMD IU diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index a5cdeca39013..2c4d68201ba7 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -492,14 +492,18 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport, tgt_fcpreq->fcp_error = fcp_err; tgt_fcpreq->done(tgt_fcpreq); - if ((!fcp_err) && (op == NVMET_FCOP_RSP || - op == NVMET_FCOP_READDATA_RSP || - op == NVMET_FCOP_ABORT)) - schedule_work(&tfcp_req->work); - return 0; } +static void +fcloop_fcp_req_release(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *tgt_fcpreq) +{ + struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq); + + schedule_work(&tfcp_req->work); +} + static void fcloop_ls_abort(struct nvme_fc_local_port *localport, struct nvme_fc_remote_port *remoteport, @@ -570,6 +574,7 @@ struct nvmet_fc_target_template tgttemplate = { .targetport_delete = fcloop_targetport_delete, .xmt_ls_rsp = fcloop_xmt_ls_rsp, .fcp_op = fcloop_fcp_op, + .fcp_req_release = fcloop_fcp_req_release, .max_hw_queues = FCLOOP_HW_QUEUES, .max_sgl_segments = FCLOOP_SGL_SEGS, .max_dif_sgl_segments = FCLOOP_SGL_SEGS, diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c index 176b4e035bcf..1846c7eb086a 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.c +++ b/drivers/scsi/lpfc/lpfc_nvmet.c @@ -408,9 +408,7 @@ out: if (phba->ktime_on) lpfc_nvmet_ktime(phba, ctxp); #endif - /* Let Abort cmpl repost the context */ - if (!(ctxp->flag & LPFC_NVMET_ABORT_OP)) - lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf); + /* lpfc_nvmet_xmt_fcp_release() will recycle the context */ } else { ctxp->entry_cnt++; start_clean = offsetof(struct lpfc_iocbq, wqe); @@ -634,10 +632,47 @@ lpfc_nvmet_targetport_delete(struct nvmet_fc_target_port *targetport) complete(&tport->tport_unreg_done); } +static void +lpfc_nvmet_xmt_fcp_release(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *rsp) +{ + struct lpfc_nvmet_tgtport *lpfc_nvmep = tgtport->private; + struct lpfc_nvmet_rcv_ctx *ctxp = + container_of(rsp, struct lpfc_nvmet_rcv_ctx, ctx.fcp_req); + struct lpfc_hba *phba = ctxp->phba; + unsigned long flags; + bool aborting = false; + + spin_lock_irqsave(&ctxp->ctxlock, flags); + if (ctxp->flag & LPFC_NVMET_ABORT_OP) { + aborting = true; + ctxp->flag |= LPFC_NVMET_CTX_RLS; + } + spin_unlock_irqrestore(&ctxp->ctxlock, flags); + + if (aborting) + /* let the abort path do the real release */ + return; + + /* Sanity check */ + if (ctxp->state != LPFC_NVMET_STE_DONE) { + atomic_inc(&lpfc_nvmep->xmt_fcp_drop); + lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR, + "6117 Bad state IO x%x aborted\n", + ctxp->oxid); + } + + lpfc_nvmeio_data(phba, "NVMET FCP FREE: xri x%x ste %d\n", ctxp->oxid, + ctxp->state, 0); + + lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf); +} + static struct nvmet_fc_target_template lpfc_tgttemplate = { .targetport_delete = lpfc_nvmet_targetport_delete, .xmt_ls_rsp = lpfc_nvmet_xmt_ls_rsp, .fcp_op = lpfc_nvmet_xmt_fcp_op, + .fcp_req_release = lpfc_nvmet_xmt_fcp_release, .max_hw_queues = 1, .max_sgl_segments = LPFC_NVMET_DEFAULT_SEGS, @@ -834,6 +869,7 @@ dropit: ctxp->wqeq = NULL; ctxp->state = LPFC_NVMET_STE_RCV; ctxp->rqb_buffer = (void *)nvmebuf; + spin_lock_init(&ctxp->ctxlock); lpfc_nvmeio_data(phba, "NVMET LS RCV: xri x%x sz %d from %06x\n", oxid, size, sid); @@ -1595,6 +1631,8 @@ lpfc_nvmet_sol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, struct lpfc_nvmet_rcv_ctx 
*ctxp; struct lpfc_nvmet_tgtport *tgtp; uint32_t status, result; + unsigned long flags; + bool released = false; ctxp = cmdwqe->context2; status = bf_get(lpfc_wcqe_c_status, wcqe); @@ -1609,7 +1647,18 @@ lpfc_nvmet_sol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, result, wcqe->word3); ctxp->state = LPFC_NVMET_STE_DONE; - lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf); + spin_lock_irqsave(&ctxp->ctxlock, flags); + if (ctxp->flag & LPFC_NVMET_CTX_RLS) + released = true; + ctxp->flag &= ~LPFC_NVMET_ABORT_OP; + spin_unlock_irqrestore(&ctxp->ctxlock, flags); + + /* + * if transport has released ctx, then can reuse it. Otherwise, + * will be recycled by transport release call. + */ + if (released) + lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf); cmdwqe->context2 = NULL; cmdwqe->context3 = NULL; @@ -1632,7 +1681,9 @@ lpfc_nvmet_xmt_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, { struct lpfc_nvmet_rcv_ctx *ctxp; struct lpfc_nvmet_tgtport *tgtp; + unsigned long flags; uint32_t status, result; + bool released = false; ctxp = cmdwqe->context2; status = bf_get(lpfc_wcqe_c_status, wcqe); @@ -1654,7 +1705,19 @@ lpfc_nvmet_xmt_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe, ctxp->state, ctxp->oxid); } ctxp->state = LPFC_NVMET_STE_DONE; - lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf); + spin_lock_irqsave(&ctxp->ctxlock, flags); + if (ctxp->flag & LPFC_NVMET_CTX_RLS) + released = true; + ctxp->flag &= ~LPFC_NVMET_ABORT_OP; + spin_unlock_irqrestore(&ctxp->ctxlock, flags); + + /* + * if transport has released ctx, then can reuse it. Otherwise, + * will be recycled by transport release call. + */ + if (released) + lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf); + cmdwqe->context2 = NULL; cmdwqe->context3 = NULL; } diff --git a/drivers/scsi/lpfc/lpfc_nvmet.h b/drivers/scsi/lpfc/lpfc_nvmet.h index ca96f05c1604..02735fc6fd41 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.h +++ b/drivers/scsi/lpfc/lpfc_nvmet.h @@ -81,6 +81,7 @@ struct lpfc_nvmet_rcv_ctx { struct lpfc_iocbq *wqeq; struct lpfc_iocbq *abort_wqeq; dma_addr_t txrdy_phys; + spinlock_t ctxlock; /* protect flag access */ uint32_t *txrdy; uint32_t sid; uint32_t offset; @@ -97,8 +98,10 @@ struct lpfc_nvmet_rcv_ctx { #define LPFC_NVMET_STE_RSP 4 #define LPFC_NVMET_STE_DONE 5 uint16_t flag; -#define LPFC_NVMET_IO_INP 1 -#define LPFC_NVMET_ABORT_OP 2 +#define LPFC_NVMET_IO_INP 0x1 +#define LPFC_NVMET_ABORT_OP 0x2 +#define LPFC_NVMET_CTX_RLS 0x4 + struct rqb_dmabuf *rqb_buffer; #ifdef CONFIG_SCSI_LPFC_DEBUG_FS diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index d70a9c98bc23..d98ddb2feabc 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -741,12 +741,12 @@ struct nvmet_fc_target_port { * be freed/released. * Entrypoint is Mandatory. * - * @fcp_op: Called to perform a data transfer, transmit a response, or - * abort an FCP opertion. The nvmefc_tgt_fcp_req structure is the same - * LLDD-supplied exchange structure specified in the - * nvmet_fc_rcv_fcp_req() call made when the FCP CMD IU was received. - * The op field in the structure shall indicate the operation for - * the LLDD to perform relative to the io. + * @fcp_op: Called to perform a data transfer or transmit a response. + * The nvmefc_tgt_fcp_req structure is the same LLDD-supplied + * exchange structure specified in the nvmet_fc_rcv_fcp_req() call + * made when the FCP CMD IU was received. 
The op field in the + * structure shall indicate the operation for the LLDD to perform + * relative to the io. * NVMET_FCOP_READDATA operation: the LLDD is to send the * payload data (described by sglist) to the host in 1 or * more FC sequences (preferrably 1). Note: the fc-nvme layer @@ -768,29 +768,35 @@ struct nvmet_fc_target_port { * successfully, the LLDD is to update the nvmefc_tgt_fcp_req * transferred_length field and may subsequently transmit the * FCP_RSP iu payload (described by rspbuf, rspdma, rsplen). - * The LLDD is to await FCP_CONF reception to confirm the RSP - * reception by the host. The LLDD may retramsit the FCP_RSP iu - * if necessary per FC-NVME. Upon reception of FCP_CONF, or upon - * FCP_CONF failure, the LLDD is to set the nvmefc_tgt_fcp_req - * fcp_error field and consider the operation complete.. + * If FCP_CONF is supported, the LLDD is to await FCP_CONF + * reception to confirm the RSP reception by the host. The LLDD + * may retransmit the FCP_RSP iu if necessary per FC-NVME. Upon + * transmission of the FCP_RSP iu if FCP_CONF is not supported, + * or upon success/failure of FCP_CONF if it is supported, the + * LLDD is to set the nvmefc_tgt_fcp_req fcp_error field and + * consider the operation complete. * NVMET_FCOP_RSP: the LLDD is to transmit the FCP_RSP iu payload - * (described by rspbuf, rspdma, rsplen). The LLDD is to await - * FCP_CONF reception to confirm the RSP reception by the host. - * The LLDD may retramsit the FCP_RSP iu if necessary per FC-NVME. - * Upon reception of FCP_CONF, or upon FCP_CONF failure, the + * (described by rspbuf, rspdma, rsplen). If FCP_CONF is + * supported, the LLDD is to await FCP_CONF reception to confirm + * the RSP reception by the host. The LLDD may retransmit the + * FCP_RSP iu if FCP_CONF is not received per FC-NVME. Upon + * transmission of the FCP_RSP iu if FCP_CONF is not supported, + * or upon success/failure of FCP_CONF if it is supported, the * LLDD is to set the nvmefc_tgt_fcp_req fcp_error field and - * consider the operation complete.. + * consider the operation complete. * NVMET_FCOP_ABORT: the LLDD is to terminate the exchange * corresponding to the fcp operation. The LLDD shall send * ABTS and follow FC exchange abort-multi rules, including * ABTS retries and possible logout. * Upon completing the indicated operation, the LLDD is to set the * status fields for the operation (tranferred_length and fcp_error - * status) in the request, then all the "done" routine - * indicated in the fcp request. Upon return from the "done" - * routine for either a NVMET_FCOP_RSP or NVMET_FCOP_ABORT operation - * the fc-nvme layer will not longer reference the fcp request, - * allowing the LLDD to free/release the fcp request. + * status) in the request, then call the "done" routine + * indicated in the fcp request. After the operation completes, + * regardless of whether the FCP_RSP iu was successfully transmitted, + * the LLDD-supplied exchange structure must remain valid until the + * transport calls the fcp_req_release() callback to return ownership + * of the exchange structure back to the LLDD so that it may be used + * for another fcp command. * Note: when calling the done routine for READDATA or WRITEDATA * operations, the fc-nvme layer may immediate convert, in the same * thread and before returning to the LLDD, the fcp operation to @@ -802,6 +808,11 @@ struct nvmet_fc_target_port { * Returns 0 on success, - on failure (Ex: -EIO) * Entrypoint is Mandatory. 
* + * @fcp_req_release: Called by the transport to return a nvmefc_tgt_fcp_req + * to the LLDD after all operations on the fcp operation are complete. + * This may be due to the command completing or upon completion of + * abort cleanup. + * + * @max_hw_queues: indicates the maximum number of hw queues the LLDD * supports for cpu affinitization. * Value is Mandatory. Must be at least 1. @@ -836,7 +847,9 @@ struct nvmet_fc_target_template { int (*xmt_ls_rsp)(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_ls_req *tls_req); int (*fcp_op)(struct nvmet_fc_target_port *tgtport, - struct nvmefc_tgt_fcp_req *); + struct nvmefc_tgt_fcp_req *fcpreq); + void (*fcp_req_release)(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *fcpreq); u32 max_hw_queues; u16 max_sgl_segments; -- cgit v1.2.3 From a97ec51b37efacb84f286979876675a8143035b0 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 11 Apr 2017 11:32:31 -0700 Subject: nvmet_fc: Rework target side abort handling target transport: ---------------------- There are cases when there is a need to abort in-progress target operations (writedata) so that controller termination or errors can clean up. That can't happen currently as the abort is another target op type, so it can't be used till the running one finishes (and it may not). Solve by removing the abort op type and creating a separate downcall from the transport to the lldd to request an io to be aborted. The transport will abort ios on queue teardown or io errors. In general the transport tries to call the lldd abort only when the io state is idle. Meaning: ops that transmit data (readdata or rsp) will always finish their transmit (or the lldd will see a state on the link or initiator port that fails the transmit) and the done call for the operation will occur. The transport will wait for the op done upcall before calling the abort function, and as the io is idle, the io can be cleaned up immediately after the abort call. Similarly, ios that are not waiting for data or transmitting data must be in the nvmet layer being processed. The transport will wait for the nvmet layer completion before calling the abort function, and as the io is idle, the io can be cleaned up immediately after the abort call. As for ops that are waiting for data (writedata), they may be outstanding indefinitely if the lldd doesn't see a condition where the initiator port or link is bad. In those cases, the transport will call the abort function and wait for the lldd's op done upcall for the operation, where it will then clean up the io. Additionally, if an lldd receives an ABTS and matches it to an outstanding request in the transport, a new transport upcall was created to abort the outstanding request in the transport. The transport expects any outstanding op call (readdata or writedata) will be completed by the lldd and the operation upcall made. The transport doesn't act on the reported abort (e.g. clean up the io) until an op done upcall occurs, a new op is attempted, or the nvmet layer completes the io processing. fcloop: ---------------------- Updated to support the new target apis. On fcp io aborts from the initiator, the loopback context is updated to NULL out the half that has completed. The initiator side is immediately called after the abort request with an io completion (abort status). On fcp io aborts from the target, the io is stopped and the initiator side sees it as an aborted io. 
Target side ops, perhaps in progress while the initiator side is done, continue but noop the data movement as there's no structure on the initiator side to reference. The patch also contains: ---------------------- Revised lpfc to support the new abort api. Commonized rsp buffer syncing and nulling of private data based on calling paths. Errors in op done calls don't take action on the fod; they're bad operations, which implies the fod may be bad. Signed-off-by: James Smart Signed-off-by: Sagi Grimberg --- drivers/nvme/target/fc.c | 205 ++++++++++++++++++++++++++++++----------- drivers/nvme/target/fcloop.c | 152 +++++++++++++++++++++++++----- drivers/scsi/lpfc/lpfc_nvmet.c | 49 +++++----- include/linux/nvme-fc-driver.h | 25 +++-- 4 files changed, 325 insertions(+), 106 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index d7068f039904..e5d30bbb1b10 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -82,6 +82,8 @@ struct nvmet_fc_fcp_iod { enum nvmet_fcp_datadir io_dir; bool active; bool abort; + bool aborted; + bool writedataactive; spinlock_t flock; struct nvmet_req req; @@ -420,6 +422,9 @@ nvmet_fc_prep_fcp_iodlist(struct nvmet_fc_tgtport *tgtport, fod->tgtport = tgtport; fod->queue = queue; fod->active = false; + fod->abort = false; + fod->aborted = false; + fod->fcpreq = NULL; list_add_tail(&fod->fcp_list, &queue->fod_list); spin_lock_init(&fod->flock); @@ -466,7 +471,6 @@ nvmet_fc_alloc_fcp_iod(struct nvmet_fc_tgt_queue *queue) if (fod) { list_del(&fod->fcp_list); fod->active = true; - fod->abort = false; /* * no queue reference is taken, as it was taken by the * queue lookup just prior to the allocation. The iod @@ -486,9 +490,18 @@ nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue, struct nvmet_fc_tgtport *tgtport = fod->tgtport; unsigned long flags; + fc_dma_sync_single_for_cpu(tgtport->dev, fod->rspdma, + sizeof(fod->rspiubuf), DMA_TO_DEVICE); + + fcpreq->nvmet_fc_private = NULL; + spin_lock_irqsave(&queue->qlock, flags); list_add_tail(&fod->fcp_list, &fod->queue->fod_list); fod->active = false; + fod->abort = false; + fod->aborted = false; + fod->writedataactive = false; + fod->fcpreq = NULL; spin_unlock_irqrestore(&queue->qlock, flags); /* @@ -622,33 +635,13 @@ nvmet_fc_tgt_q_get(struct nvmet_fc_tgt_queue *queue) } -static void -nvmet_fc_abort_op(struct nvmet_fc_tgtport *tgtport, - struct nvmefc_tgt_fcp_req *fcpreq) -{ - int ret; - - fcpreq->op = NVMET_FCOP_ABORT; - fcpreq->offset = 0; - fcpreq->timeout = 0; - fcpreq->transfer_length = 0; - fcpreq->transferred_length = 0; - fcpreq->fcp_error = 0; - fcpreq->sg_cnt = 0; - - ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fcpreq); - if (ret) - /* should never reach here !! */ - WARN_ON(1); -} - - static void nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue) { + struct nvmet_fc_tgtport *tgtport = queue->assoc->tgtport; struct nvmet_fc_fcp_iod *fod = queue->fod; unsigned long flags; - int i; + int i, writedataactive; bool disconnect; disconnect = atomic_xchg(&queue->connected, 0); @@ -659,7 +652,20 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue) if (fod->active) { spin_lock(&fod->flock); fod->abort = true; + writedataactive = fod->writedataactive; spin_unlock(&fod->flock); + /* + * only call lldd abort routine if waiting for + * writedata. other outstanding ops should finish + * on their own. 
+ */ + if (writedataactive) { + spin_lock(&fod->flock); + fod->aborted = true; + spin_unlock(&fod->flock); + tgtport->ops->fcp_abort( + &tgtport->fc_target_port, fod->fcpreq); + } } } spin_unlock_irqrestore(&queue->qlock, flags); @@ -853,6 +859,7 @@ nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo, int ret, idx; if (!template->xmt_ls_rsp || !template->fcp_op || + !template->fcp_abort || !template->fcp_req_release || !template->targetport_delete || !template->max_hw_queues || !template->max_sgl_segments || !template->max_dif_sgl_segments || !template->dma_boundary) { @@ -1717,6 +1724,26 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport, static void nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq); +static void +nvmet_fc_abort_op(struct nvmet_fc_tgtport *tgtport, + struct nvmet_fc_fcp_iod *fod) +{ + struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq; + + /* data no longer needed */ + nvmet_fc_free_tgt_pgs(fod); + + /* + * if an ABTS was received or we issued the fcp_abort early + * don't call abort routine again. + */ + /* no need to take lock - lock was taken earlier to get here */ + if (!fod->aborted) + tgtport->ops->fcp_abort(&tgtport->fc_target_port, fcpreq); + + nvmet_fc_free_fcp_iod(fod->queue, fod); +} + static void nvmet_fc_xmt_fcp_rsp(struct nvmet_fc_tgtport *tgtport, struct nvmet_fc_fcp_iod *fod) @@ -1730,7 +1757,7 @@ nvmet_fc_xmt_fcp_rsp(struct nvmet_fc_tgtport *tgtport, ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fod->fcpreq); if (ret) - nvmet_fc_abort_op(tgtport, fod->fcpreq); + nvmet_fc_abort_op(tgtport, fod); } static void @@ -1739,6 +1766,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, { struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq; struct scatterlist *sg, *datasg; + unsigned long flags; u32 tlen, sg_off; int ret; @@ -1803,10 +1831,13 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, */ fod->abort = true; - if (op == NVMET_FCOP_WRITEDATA) + if (op == NVMET_FCOP_WRITEDATA) { + spin_lock_irqsave(&fod->flock, flags); + fod->writedataactive = false; + spin_unlock_irqrestore(&fod->flock, flags); nvmet_req_complete(&fod->req, NVME_SC_FC_TRANSPORT_ERROR); - else /* NVMET_FCOP_READDATA or NVMET_FCOP_READDATA_RSP */ { + } else /* NVMET_FCOP_READDATA or NVMET_FCOP_READDATA_RSP */ { fcpreq->fcp_error = ret; fcpreq->transferred_length = 0; nvmet_fc_xmt_fcp_op_done(fod->fcpreq); @@ -1814,6 +1845,27 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, } } +static inline bool +__nvmet_fc_fod_op_abort(struct nvmet_fc_fcp_iod *fod, bool abort) +{ + struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq; + struct nvmet_fc_tgtport *tgtport = fod->tgtport; + + /* if in the middle of an io and we need to tear down */ + if (abort) { + if (fcpreq->op == NVMET_FCOP_WRITEDATA) { + nvmet_req_complete(&fod->req, + NVME_SC_FC_TRANSPORT_ERROR); + return true; + } + + nvmet_fc_abort_op(tgtport, fod); + return true; + } + + return false; +} + /* * actual done handler for FCP operations when completed by the lldd */ @@ -1827,22 +1879,20 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) spin_lock_irqsave(&fod->flock, flags); abort = fod->abort; + fod->writedataactive = false; spin_unlock_irqrestore(&fod->flock, flags); - /* if in the middle of an io and we need to tear down */ - if (abort && fcpreq->op != NVMET_FCOP_ABORT) { - /* data no longer needed */ - nvmet_fc_free_tgt_pgs(fod); - - nvmet_req_complete(&fod->req, fcpreq->fcp_error); - return; - } - switch (fcpreq->op) { case NVMET_FCOP_WRITEDATA: + if 
(__nvmet_fc_fod_op_abort(fod, abort)) + return; if (fcpreq->fcp_error || fcpreq->transferred_length != fcpreq->transfer_length) { + spin_lock(&fod->flock); + fod->abort = true; + spin_unlock(&fod->flock); + nvmet_req_complete(&fod->req, NVME_SC_FC_TRANSPORT_ERROR); return; @@ -1850,6 +1900,10 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) fod->offset += fcpreq->transferred_length; if (fod->offset != fod->total_length) { + spin_lock_irqsave(&fod->flock, flags); + fod->writedataactive = true; + spin_unlock_irqrestore(&fod->flock, flags); + /* transfer the next chunk */ nvmet_fc_transfer_fcp_data(tgtport, fod, NVMET_FCOP_WRITEDATA); @@ -1864,12 +1918,11 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) case NVMET_FCOP_READDATA: case NVMET_FCOP_READDATA_RSP: + if (__nvmet_fc_fod_op_abort(fod, abort)) + return; if (fcpreq->fcp_error || fcpreq->transferred_length != fcpreq->transfer_length) { - /* data no longer needed */ - nvmet_fc_free_tgt_pgs(fod); - - nvmet_fc_abort_op(tgtport, fod->fcpreq); + nvmet_fc_abort_op(tgtport, fod); return; } @@ -1878,8 +1931,6 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) if (fcpreq->op == NVMET_FCOP_READDATA_RSP) { /* data no longer needed */ nvmet_fc_free_tgt_pgs(fod); - fc_dma_sync_single_for_cpu(tgtport->dev, fod->rspdma, - sizeof(fod->rspiubuf), DMA_TO_DEVICE); nvmet_fc_free_fcp_iod(fod->queue, fod); return; } @@ -1902,15 +1953,12 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) break; case NVMET_FCOP_RSP: - case NVMET_FCOP_ABORT: - fc_dma_sync_single_for_cpu(tgtport->dev, fod->rspdma, - sizeof(fod->rspiubuf), DMA_TO_DEVICE); + if (__nvmet_fc_fod_op_abort(fod, abort)) + return; nvmet_fc_free_fcp_iod(fod->queue, fod); break; default: - nvmet_fc_free_tgt_pgs(fod); - nvmet_fc_abort_op(tgtport, fod->fcpreq); break; } } @@ -1958,10 +2006,7 @@ __nvmet_fc_fcp_nvme_cmd_done(struct nvmet_fc_tgtport *tgtport, fod->queue->sqhd = cqe->sq_head; if (abort) { - /* data no longer needed */ - nvmet_fc_free_tgt_pgs(fod); - - nvmet_fc_abort_op(tgtport, fod->fcpreq); + nvmet_fc_abort_op(tgtport, fod); return; } @@ -2057,8 +2102,8 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, &fod->queue->nvme_cq, &fod->queue->nvme_sq, &nvmet_fc_tgt_fcp_ops); - if (!ret) { /* bad SQE content */ - nvmet_fc_abort_op(tgtport, fod->fcpreq); + if (!ret) { /* bad SQE content or invalid ctrl state */ + nvmet_fc_abort_op(tgtport, fod); return; } @@ -2098,7 +2143,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, return; transport_error: - nvmet_fc_abort_op(tgtport, fod->fcpreq); + nvmet_fc_abort_op(tgtport, fod); } /* @@ -2151,7 +2196,6 @@ nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *target_port, (be16_to_cpu(cmdiu->iu_len) != (sizeof(*cmdiu)/4))) return -EIO; - queue = nvmet_fc_find_target_queue(tgtport, be64_to_cpu(cmdiu->connection_id)); if (!queue) @@ -2190,6 +2234,59 @@ nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *target_port, } EXPORT_SYMBOL_GPL(nvmet_fc_rcv_fcp_req); +/** + * nvmet_fc_rcv_fcp_abort - transport entry point called by an LLDD + * upon the reception of an ABTS for a FCP command + * + * Notify the transport that an ABTS has been received for a FCP command + * that had been given to the transport via nvmet_fc_rcv_fcp_req(). The + * LLDD believes the command is still being worked on + * (template_ops->fcp_req_release() has not been called). 
+ * + * The transport will wait for any outstanding work (an op to the LLDD, + * which the lldd should complete with error due to the ABTS; or the + * completion from the nvmet layer of the nvme command), then will + * stop processing and call the fcp_req_release() callback to + * return the i/o context to the LLDD. The LLDD may send the BA_ACC + * to the ABTS either after return from this function (assuming any + * outstanding op work has been terminated) or upon the callback being + * called. + * + * @target_port: pointer to the (registered) target port the FCP CMD IU + * was received on. + * @fcpreq: pointer to the fcpreq request structure that corresponds + * to the exchange that received the ABTS. + */ +void +nvmet_fc_rcv_fcp_abort(struct nvmet_fc_target_port *target_port, + struct nvmefc_tgt_fcp_req *fcpreq) +{ + struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private; + struct nvmet_fc_tgt_queue *queue; + unsigned long flags; + + if (!fod || fod->fcpreq != fcpreq) + /* job appears to have already completed, ignore abort */ + return; + + queue = fod->queue; + + spin_lock_irqsave(&queue->qlock, flags); + if (fod->active) { + /* + * mark as abort. The abort handler, invoked upon completion + * of any work, will detect the aborted status and do the + * callback. + */ + spin_lock(&fod->flock); + fod->abort = true; + fod->aborted = true; + spin_unlock(&fod->flock); + } + spin_unlock_irqrestore(&queue->qlock, flags); +} +EXPORT_SYMBOL_GPL(nvmet_fc_rcv_fcp_abort); + enum { FCT_TRADDR_ERR = 0, FCT_TRADDR_WWNN = 1 << 0, diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index dbcafd30dd9b..aaa3dbe22bd5 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -246,7 +246,10 @@ struct fcloop_lsreq { struct fcloop_fcpreq { struct fcloop_tport *tport; struct nvmefc_fcp_req *fcpreq; + spinlock_t reqlock; u16 status; + bool active; + bool aborted; struct work_struct work; struct nvmefc_tgt_fcp_req tgt_fcp_req; }; @@ -254,6 +257,7 @@ struct fcloop_fcpreq { struct fcloop_ini_fcpreq { struct nvmefc_fcp_req *fcpreq; struct fcloop_fcpreq *tfcp_req; + struct work_struct iniwork; }; static inline struct fcloop_lsreq * @@ -345,7 +349,21 @@ fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport, } /* - * FCP IO operation done. call back up initiator "done" flows. + * FCP IO operation done by initiator abort. + * call back up initiator "done" flows. + */ +static void +fcloop_tgt_fcprqst_ini_done_work(struct work_struct *work) +{ + struct fcloop_ini_fcpreq *inireq = + container_of(work, struct fcloop_ini_fcpreq, iniwork); + + inireq->fcpreq->done(inireq->fcpreq); +} + +/* + * FCP IO operation done by target completion. + * call back up initiator "done" flows. 
*/ static void fcloop_tgt_fcprqst_done_work(struct work_struct *work) @@ -353,9 +371,13 @@ fcloop_tgt_fcprqst_done_work(struct work_struct *work) struct fcloop_fcpreq *tfcp_req = container_of(work, struct fcloop_fcpreq, work); struct fcloop_tport *tport = tfcp_req->tport; - struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq; + struct nvmefc_fcp_req *fcpreq; + + spin_lock(&tfcp_req->reqlock); + fcpreq = tfcp_req->fcpreq; + spin_unlock(&tfcp_req->reqlock); - if (tport->remoteport) { + if (tport->remoteport && fcpreq) { fcpreq->status = tfcp_req->status; fcpreq->done(fcpreq); } @@ -384,8 +406,10 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport, inireq->fcpreq = fcpreq; inireq->tfcp_req = tfcp_req; + INIT_WORK(&inireq->iniwork, fcloop_tgt_fcprqst_ini_done_work); tfcp_req->fcpreq = fcpreq; tfcp_req->tport = rport->targetport->private; + spin_lock_init(&tfcp_req->reqlock); INIT_WORK(&tfcp_req->work, fcloop_tgt_fcprqst_done_work); ret = nvmet_fc_rcv_fcp_req(rport->targetport, &tfcp_req->tgt_fcp_req, @@ -453,50 +477,86 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *tgt_fcpreq) { struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq); - struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq; + struct nvmefc_fcp_req *fcpreq; u32 rsplen = 0, xfrlen = 0; - int fcp_err = 0; + int fcp_err = 0, active, aborted; u8 op = tgt_fcpreq->op; + spin_lock(&tfcp_req->reqlock); + fcpreq = tfcp_req->fcpreq; + active = tfcp_req->active; + aborted = tfcp_req->aborted; + tfcp_req->active = true; + spin_unlock(&tfcp_req->reqlock); + + if (unlikely(active)) + /* illegal - call while i/o active */ + return -EALREADY; + + if (unlikely(aborted)) { + /* target transport has aborted i/o prior */ + spin_lock(&tfcp_req->reqlock); + tfcp_req->active = false; + spin_unlock(&tfcp_req->reqlock); + tgt_fcpreq->transferred_length = 0; + tgt_fcpreq->fcp_error = -ECANCELED; + tgt_fcpreq->done(tgt_fcpreq); + return 0; + } + + /* + * if fcpreq is NULL, the I/O has been aborted (from + * initiator side). For the target side, act as if all is well + * but don't actually move data. + */ + switch (op) { case NVMET_FCOP_WRITEDATA: xfrlen = tgt_fcpreq->transfer_length; - fcloop_fcp_copy_data(op, tgt_fcpreq->sg, fcpreq->first_sgl, - tgt_fcpreq->offset, xfrlen); - fcpreq->transferred_length += xfrlen; + if (fcpreq) { + fcloop_fcp_copy_data(op, tgt_fcpreq->sg, + fcpreq->first_sgl, tgt_fcpreq->offset, + xfrlen); + fcpreq->transferred_length += xfrlen; + } break; case NVMET_FCOP_READDATA: case NVMET_FCOP_READDATA_RSP: xfrlen = tgt_fcpreq->transfer_length; - fcloop_fcp_copy_data(op, tgt_fcpreq->sg, fcpreq->first_sgl, - tgt_fcpreq->offset, xfrlen); - fcpreq->transferred_length += xfrlen; + if (fcpreq) { + fcloop_fcp_copy_data(op, tgt_fcpreq->sg, + fcpreq->first_sgl, tgt_fcpreq->offset, + xfrlen); + fcpreq->transferred_length += xfrlen; + } if (op == NVMET_FCOP_READDATA) break; /* Fall-Thru to RSP handling */ case NVMET_FCOP_RSP: - rsplen = ((fcpreq->rsplen < tgt_fcpreq->rsplen) ? - fcpreq->rsplen : tgt_fcpreq->rsplen); - memcpy(fcpreq->rspaddr, tgt_fcpreq->rspaddr, rsplen); - if (rsplen < tgt_fcpreq->rsplen) - fcp_err = -E2BIG; - fcpreq->rcv_rsplen = rsplen; - fcpreq->status = 0; + if (fcpreq) { + rsplen = ((fcpreq->rsplen < tgt_fcpreq->rsplen) ? 
+ fcpreq->rsplen : tgt_fcpreq->rsplen); + memcpy(fcpreq->rspaddr, tgt_fcpreq->rspaddr, rsplen); + if (rsplen < tgt_fcpreq->rsplen) + fcp_err = -E2BIG; + fcpreq->rcv_rsplen = rsplen; + fcpreq->status = 0; + } tfcp_req->status = 0; break; - case NVMET_FCOP_ABORT: - tfcp_req->status = NVME_SC_FC_TRANSPORT_ABORTED; - break; - default: fcp_err = -EINVAL; break; } + spin_lock(&tfcp_req->reqlock); + tfcp_req->active = false; + spin_unlock(&tfcp_req->reqlock); + tgt_fcpreq->transferred_length = xfrlen; tgt_fcpreq->fcp_error = fcp_err; tgt_fcpreq->done(tgt_fcpreq); @@ -504,6 +564,32 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport, return 0; } +static void +fcloop_tgt_fcp_abort(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *tgt_fcpreq) +{ + struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq); + int active; + + /* + * mark aborted only in case there were 2 threads in transport + * (one doing io, other doing abort) and only kills ops posted + * after the abort request + */ + spin_lock(&tfcp_req->reqlock); + active = tfcp_req->active; + tfcp_req->aborted = true; + spin_unlock(&tfcp_req->reqlock); + + tfcp_req->status = NVME_SC_FC_TRANSPORT_ABORTED; + + /* + * nothing more to do. If io wasn't active, the transport should + * immediately call the req_release. If it was active, the op + * will complete, and the lldd should call req_release. + */ +} + static void fcloop_fcp_req_release(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *tgt_fcpreq) @@ -526,6 +612,27 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport, void *hw_queue_handle, struct nvmefc_fcp_req *fcpreq) { + struct fcloop_rport *rport = remoteport->private; + struct fcloop_ini_fcpreq *inireq = fcpreq->private; + struct fcloop_fcpreq *tfcp_req = inireq->tfcp_req; + + if (!tfcp_req) + /* abort has already been called */ + return; + + if (rport->targetport) + nvmet_fc_rcv_fcp_abort(rport->targetport, + &tfcp_req->tgt_fcp_req); + + /* break initiator/target relationship for io */ + spin_lock(&tfcp_req->reqlock); + inireq->tfcp_req = NULL; + tfcp_req->fcpreq = NULL; + spin_unlock(&tfcp_req->reqlock); + + /* post the aborted io completion */ + fcpreq->status = -ECANCELED; + schedule_work(&inireq->iniwork); } static void @@ -583,6 +690,7 @@ struct nvmet_fc_target_template tgttemplate = { .targetport_delete = fcloop_targetport_delete, .xmt_ls_rsp = fcloop_xmt_ls_rsp, .fcp_op = fcloop_fcp_op, + .fcp_abort = fcloop_tgt_fcp_abort, .fcp_req_release = fcloop_fcp_req_release, .max_hw_queues = FCLOOP_HW_QUEUES, .max_sgl_segments = FCLOOP_SGL_SEGS, diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c index 1846c7eb086a..d488c3318d4b 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.c +++ b/drivers/scsi/lpfc/lpfc_nvmet.c @@ -542,27 +542,6 @@ lpfc_nvmet_xmt_fcp_op(struct nvmet_fc_target_port *tgtport, } #endif - if (rsp->op == NVMET_FCOP_ABORT) { - lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS, - "6103 Abort op: oxri x%x %d cnt %d\n", - ctxp->oxid, ctxp->state, ctxp->entry_cnt); - - lpfc_nvmeio_data(phba, "NVMET FCP ABRT: " - "xri x%x state x%x cnt x%x\n", - ctxp->oxid, ctxp->state, ctxp->entry_cnt); - - atomic_inc(&lpfc_nvmep->xmt_fcp_abort); - ctxp->entry_cnt++; - ctxp->flag |= LPFC_NVMET_ABORT_OP; - if (ctxp->flag & LPFC_NVMET_IO_INP) - lpfc_nvmet_sol_fcp_issue_abort(phba, ctxp, ctxp->sid, - ctxp->oxid); - else - lpfc_nvmet_unsol_fcp_issue_abort(phba, ctxp, ctxp->sid, - ctxp->oxid); - return 0; - } - /* Sanity check */ if (ctxp->state == LPFC_NVMET_STE_ABORT) { 
atomic_inc(&lpfc_nvmep->xmt_fcp_drop); @@ -632,6 +611,33 @@ lpfc_nvmet_targetport_delete(struct nvmet_fc_target_port *targetport) complete(&tport->tport_unreg_done); } +static void +lpfc_nvmet_xmt_fcp_abort(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *req) +{ + struct lpfc_nvmet_tgtport *lpfc_nvmep = tgtport->private; + struct lpfc_nvmet_rcv_ctx *ctxp = + container_of(req, struct lpfc_nvmet_rcv_ctx, ctx.fcp_req); + struct lpfc_hba *phba = ctxp->phba; + + lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS, + "6103 Abort op: oxri x%x %d cnt %d\n", + ctxp->oxid, ctxp->state, ctxp->entry_cnt); + + lpfc_nvmeio_data(phba, "NVMET FCP ABRT: xri x%x state x%x cnt x%x\n", + ctxp->oxid, ctxp->state, ctxp->entry_cnt); + + atomic_inc(&lpfc_nvmep->xmt_fcp_abort); + ctxp->entry_cnt++; + ctxp->flag |= LPFC_NVMET_ABORT_OP; + if (ctxp->flag & LPFC_NVMET_IO_INP) + lpfc_nvmet_sol_fcp_issue_abort(phba, ctxp, ctxp->sid, + ctxp->oxid); + else + lpfc_nvmet_unsol_fcp_issue_abort(phba, ctxp, ctxp->sid, + ctxp->oxid); +} + static void lpfc_nvmet_xmt_fcp_release(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *rsp) @@ -672,6 +678,7 @@ static struct nvmet_fc_target_template lpfc_tgttemplate = { .targetport_delete = lpfc_nvmet_targetport_delete, .xmt_ls_rsp = lpfc_nvmet_xmt_ls_rsp, .fcp_op = lpfc_nvmet_xmt_fcp_op, + .fcp_abort = lpfc_nvmet_xmt_fcp_abort, .fcp_req_release = lpfc_nvmet_xmt_fcp_release, .max_hw_queues = 1, diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index d98ddb2feabc..0db37158a61d 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -533,9 +533,6 @@ enum { * rsp as well */ NVMET_FCOP_RSP = 4, /* send rsp frame */ - NVMET_FCOP_ABORT = 5, /* abort exchange via ABTS */ - NVMET_FCOP_BA_ACC = 6, /* send BA_ACC */ - NVMET_FCOP_BA_RJT = 7, /* send BA_RJT */ }; /** @@ -572,8 +569,6 @@ enum { * upon compeletion of the operation. The nvmet-fc layer will also set a * private pointer for its own use in the done routine. * - * Note: the LLDD must never fail a NVMET_FCOP_ABORT request !! - * * Values set by the NVMET-FC layer prior to calling the LLDD fcp_op * entrypoint. * @op: Indicates the FCP IU operation to perform (see NVMET_FCOP_xxx) @@ -784,10 +779,6 @@ struct nvmet_fc_target_port { * or upon success/failure of FCP_CONF if it is supported, the * LLDD is to set the nvmefc_tgt_fcp_req fcp_error field and * consider the operation complete. - * NVMET_FCOP_ABORT: the LLDD is to terminate the exchange - * corresponding to the fcp operation. The LLDD shall send - * ABTS and follow FC exchange abort-multi rules, including - * ABTS retries and possible logout. * Upon completing the indicated operation, the LLDD is to set the * status fields for the operation (tranferred_length and fcp_error * status) in the request, then call the "done" routine @@ -808,6 +799,17 @@ struct nvmet_fc_target_port { * Returns 0 on success, - on failure (Ex: -EIO) * Entrypoint is Mandatory. * + * @fcp_abort: Called by the transport to abort an active command. + * The command may be in-between operations (nothing active in LLDD) + * or may have an active WRITEDATA operation pending. The LLDD is to + * initiate the ABTS process for the command and return from the + * callback. The ABTS does not need to be complete on the command. + * The fcp_abort callback inherently cannot fail. 
After the + * fcp_abort() callback completes, the transport will wait for any + * outstanding operation (if there was one) to complete, then will + * call the fcp_req_release() callback to return the command's + * exchange context back to the LLDD. + * * @fcp_req_release: Called by the transport to return a nvmefc_tgt_fcp_req * to the LLDD after all operations on the fcp operation are complete. * This may be due to the command completing or upon completion of @@ -848,6 +850,8 @@ struct nvmet_fc_target_template { struct nvmefc_tgt_ls_req *tls_req); int (*fcp_op)(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *fcpreq); + void (*fcp_abort)(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *fcpreq); void (*fcp_req_release)(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *fcpreq); @@ -877,4 +881,7 @@ int nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *fcpreq, void *cmdiubuf, u32 cmdiubuf_len); +void nvmet_fc_rcv_fcp_abort(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *fcpreq); + #endif /* _NVME_FC_DRIVER_H */ -- cgit v1.2.3 From 50f2112cf7a3e62a8d33838eb205d5fef306457a Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Tue, 11 Apr 2017 12:50:09 -0400 Subject: locks: Set FL_CLOSE when removing flock locks on close() Set FL_CLOSE in fl_flags as in locks_remove_posix() when clearing locks. NFS will check for this flag to ensure an unlock is sent in a following patch. Fuse handles flock and posix locks differently for FL_CLOSE, and so requires a fixup to retain the existing behavior for flock. Signed-off-by: Benjamin Coddington Reviewed-by: Jeff Layton Acked-by: Miklos Szeredi Signed-off-by: Trond Myklebust --- fs/fuse/file.c | 2 +- fs/locks.c | 2 +- include/linux/fs.h | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ec238fb5a584..995da8957f6f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2168,7 +2168,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) } /* Unlock on close is handled by the flush method */ - if (fl->fl_flags & FL_CLOSE) + if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) return 0; fuse_lk_fill(&args, file, fl, opcode, pid, flock, &inarg); diff --git a/fs/locks.c b/fs/locks.c index 26811321d39b..af2031a1fcff 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2504,7 +2504,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) .fl_owner = filp, .fl_pid = current->tgid, .fl_file = filp, - .fl_flags = FL_FLOCK, + .fl_flags = FL_FLOCK | FL_CLOSE, .fl_type = F_UNLCK, .fl_end = OFFSET_MAX, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 7251f7bb45e8..72061aa65405 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -909,6 +909,8 @@ static inline struct file *get_file(struct file *f) #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ #define FL_LAYOUT 2048 /* outstanding pNFS layout */ +#define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) + /* * Special return value from posix_lock_file() and vfs_lock_file() for * asynchronous locking. -- cgit v1.2.3 From 7d6ddf88c4db372689c8aa65ea652d0514d66c06 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Tue, 11 Apr 2017 12:50:10 -0400 Subject: NFS: Add an iocounter wait function for async RPC tasks By sleeping on a new NFS Unlock-On-Close waitqueue, rpc tasks may wait for a lock context's iocounter to reach zero. 
The rpc waitqueue is only woken when the open_context has the NFS_CONTEXT_UNLOCK flag set in order to mitigate spurious wake-ups for any iocounter reaching zero. Signed-off-by: Benjamin Coddington Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 1 + fs/nfs/pagelist.c | 34 +++++++++++++++++++++++++++++++++- include/linux/nfs_fs.h | 1 + include/linux/nfs_fs_sb.h | 1 + include/linux/nfs_page.h | 1 + 5 files changed, 37 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 3ffbffe8f39f..3e7b2e6a7cfb 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -218,6 +218,7 @@ static void nfs_cb_idr_remove_locked(struct nfs_client *clp) static void pnfs_init_server(struct nfs_server *server) { rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); + rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); } #else diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index f53610672f03..ad92b401326c 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -102,6 +102,35 @@ nfs_iocounter_wait(struct nfs_lock_context *l_ctx) TASK_KILLABLE); } +/** + * nfs_async_iocounter_wait - wait on a rpc_waitqueue for I/O + * to complete + * @task: the rpc_task that should wait + * @l_ctx: nfs_lock_context with io_counter to check + * + * Returns true if there is outstanding I/O to wait on and the + * task has been put to sleep. + */ +bool +nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx) +{ + struct inode *inode = d_inode(l_ctx->open_context->dentry); + bool ret = false; + + if (atomic_read(&l_ctx->io_count) > 0) { + rpc_sleep_on(&NFS_SERVER(inode)->uoc_rpcwaitq, task, NULL); + ret = true; + } + + if (atomic_read(&l_ctx->io_count) == 0) { + rpc_wake_up_queued_task(&NFS_SERVER(inode)->uoc_rpcwaitq, task); + ret = false; + } + + return ret; +} +EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait); + /* * nfs_page_group_lock - lock the head of the page group * @req - request in group that is to be locked @@ -385,8 +414,11 @@ static void nfs_clear_request(struct nfs_page *req) req->wb_page = NULL; } if (l_ctx != NULL) { - if (atomic_dec_and_test(&l_ctx->io_count)) + if (atomic_dec_and_test(&l_ctx->io_count)) { wake_up_atomic_t(&l_ctx->io_count); + if (test_bit(NFS_CONTEXT_UNLOCK, &ctx->flags)) + rpc_wake_up(&NFS_SERVER(d_inode(ctx->dentry))->uoc_rpcwaitq); + } nfs_put_lock_context(l_ctx); req->wb_lock_context = NULL; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 1b29915247b2..9aa044e76820 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -76,6 +76,7 @@ struct nfs_open_context { #define NFS_CONTEXT_ERROR_WRITE (0) #define NFS_CONTEXT_RESEND_WRITES (1) #define NFS_CONTEXT_BAD (2) +#define NFS_CONTEXT_UNLOCK (3) int error; struct list_head list; diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index b34097c67848..2a70f34dffe8 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -222,6 +222,7 @@ struct nfs_server { u32 mountd_version; unsigned short mountd_port; unsigned short mountd_protocol; + struct rpc_wait_queue uoc_rpcwaitq; }; /* Server capabilities */ diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 6f01e28bba27..247cc3d3498f 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -141,6 +141,7 @@ extern int nfs_page_group_lock(struct nfs_page *, bool); extern void nfs_page_group_lock_wait(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); extern bool 
nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); +extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *); /* * Lock the page of an asynchronous request -- cgit v1.2.3 From b1ece737f44f91dca8f4829cf0b442e752e406db Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Tue, 11 Apr 2017 12:50:11 -0400 Subject: lockd: Introduce nlmclnt_operations NFS would enjoy the ability to modify the behavior of the NLM client's unlock RPC task in order to delay the transmission of the unlock until IO that was submitted under that lock has completed. This ability can ensure that the NLM client will always complete the transmission of an unlock even if the waiting caller has been interrupted with fatal signal. For this purpose, a pointer to a struct nlmclnt_operations can be assigned in a nfs_module's nfs_rpc_ops that will install those nlmclnt_operations on the nlm_host. The struct nlmclnt_operations defines three callback operations that will be used in a following patch: nlmclnt_alloc_call - used to call back after a successful allocation of a struct nlm_rqst in nlmclnt_proc(). nlmclnt_unlock_prepare - used to call back during NLM unlock's rpc_call_prepare. The NLM client defers calling rpc_call_start() until this callback returns false. nlmclnt_release_call - used to call back when the NLM client's struct nlm_rqst is freed. Signed-off-by: Benjamin Coddington Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/lockd/clntlock.c | 1 + fs/lockd/clntproc.c | 26 +++++++++++++++++++++++++- fs/nfs/client.c | 1 + fs/nfs/nfs3proc.c | 2 +- fs/nfs/proc.c | 2 +- include/linux/lockd/bind.h | 24 ++++++++++++++++++++++-- include/linux/lockd/lockd.h | 2 ++ include/linux/nfs_xdr.h | 1 + 8 files changed, 54 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 41e491b8e5d7..27d577dbe51a 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -69,6 +69,7 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) if (host->h_rpcclnt == NULL && nlm_bind_host(host) == NULL) goto out_nobind; + host->h_nlmclnt_ops = nlm_init->nlmclnt_ops; return host; out_nobind: nlmclnt_release_host(host); diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 112952037933..066ac313ae5c 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -150,17 +150,22 @@ static void nlmclnt_release_lockargs(struct nlm_rqst *req) * @host: address of a valid nlm_host context representing the NLM server * @cmd: fcntl-style file lock operation to perform * @fl: address of arguments for the lock operation + * @data: address of data to be sent to callback operations * */ -int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) +int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, void *data) { struct nlm_rqst *call; int status; + const struct nlmclnt_operations *nlmclnt_ops = host->h_nlmclnt_ops; call = nlm_alloc_call(host); if (call == NULL) return -ENOMEM; + if (nlmclnt_ops && nlmclnt_ops->nlmclnt_alloc_call) + nlmclnt_ops->nlmclnt_alloc_call(data); + nlmclnt_locks_init_private(fl, host); if (!fl->fl_u.nfs_fl.owner) { /* lockowner allocation has failed */ @@ -169,6 +174,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) } /* Set up the argument struct */ nlmclnt_setlockargs(call, fl); + call->a_callback_data = data; if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { if (fl->fl_type != F_UNLCK) { @@ -214,8 +220,12 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host 
*host) void nlmclnt_release_call(struct nlm_rqst *call) { + const struct nlmclnt_operations *nlmclnt_ops = call->a_host->h_nlmclnt_ops; + if (!atomic_dec_and_test(&call->a_count)) return; + if (nlmclnt_ops && nlmclnt_ops->nlmclnt_release_call) + nlmclnt_ops->nlmclnt_release_call(call->a_callback_data); nlmclnt_release_host(call->a_host); nlmclnt_release_lockargs(call); kfree(call); @@ -687,6 +697,19 @@ out: return status; } +static void nlmclnt_unlock_prepare(struct rpc_task *task, void *data) +{ + struct nlm_rqst *req = data; + const struct nlmclnt_operations *nlmclnt_ops = req->a_host->h_nlmclnt_ops; + bool defer_call = false; + + if (nlmclnt_ops && nlmclnt_ops->nlmclnt_unlock_prepare) + defer_call = nlmclnt_ops->nlmclnt_unlock_prepare(task, req->a_callback_data); + + if (!defer_call) + rpc_call_start(task); +} + static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) { struct nlm_rqst *req = data; @@ -720,6 +743,7 @@ die: } static const struct rpc_call_ops nlmclnt_unlock_ops = { + .rpc_call_prepare = nlmclnt_unlock_prepare, .rpc_call_done = nlmclnt_unlock_callback, .rpc_release = nlmclnt_rpc_release, }; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 3e7b2e6a7cfb..e0302101e18a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -546,6 +546,7 @@ static int nfs_start_lockd(struct nfs_server *server) .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? 1 : 0, .net = clp->cl_net, + .nlmclnt_ops = clp->cl_nfs_mod->rpc_ops->nlmclnt_ops, }; if (nlm_init.nfs_version > 3) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index dc925b531f32..03b3c3de28f1 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -870,7 +870,7 @@ nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(filp); - return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); + return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl, NULL); } static int nfs3_have_delegation(struct inode *inode, fmode_t flags) diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index b7bca8303989..9872cf676a50 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -638,7 +638,7 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(filp); - return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); + return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl, NULL); } /* Helper functions for NFS lock bounds checking */ diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 140edab64446..05728396a1a1 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -18,6 +18,7 @@ /* Dummy declarations */ struct svc_rqst; +struct rpc_task; /* * This is the set of functions for lockd->nfsd communication @@ -43,6 +44,7 @@ struct nlmclnt_initdata { u32 nfs_version; int noresvport; struct net *net; + const struct nlmclnt_operations *nlmclnt_ops; }; /* @@ -52,8 +54,26 @@ struct nlmclnt_initdata { extern struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init); extern void nlmclnt_done(struct nlm_host *host); -extern int nlmclnt_proc(struct nlm_host *host, int cmd, - struct file_lock *fl); +/* + * NLM client operations provide a means to modify RPC processing of NLM + * requests. Callbacks receive a pointer to data passed into the call to + * nlmclnt_proc(). + */ +struct nlmclnt_operations { + /* Called on successful allocation of nlm_rqst, use for allocation or + * reference counting. */ + void (*nlmclnt_alloc_call)(void *); + + /* Called in rpc_task_prepare for unlock. 
A return value of true + * indicates the callback has put the task to sleep on a waitqueue + * and NLM should not call rpc_call_start(). */ + bool (*nlmclnt_unlock_prepare)(struct rpc_task*, void *); + + /* Called when the nlm_rqst is freed, callbacks should clean up here */ + void (*nlmclnt_release_call)(void *); +}; + +extern int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, void *data); extern int lockd_up(struct net *net); extern void lockd_down(struct net *net); diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index b37dee3acaba..41f7b6a04d69 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -69,6 +69,7 @@ struct nlm_host { char *h_addrbuf; /* address eyecatcher */ struct net *net; /* host net */ char nodename[UNX_MAXNODENAME + 1]; + const struct nlmclnt_operations *h_nlmclnt_ops; /* Callback ops for NLM users */ }; /* @@ -142,6 +143,7 @@ struct nlm_rqst { struct nlm_block * a_block; unsigned int a_retries; /* Retry count */ u8 a_owner[NLMCLNT_OHSIZE]; + void * a_callback_data; /* sent to nlmclnt_operations callbacks */ }; /* diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 51e27f9746ee..677c6b91dfcd 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1551,6 +1551,7 @@ struct nfs_rpc_ops { const struct inode_operations *dir_inode_ops; const struct inode_operations *file_inode_ops; const struct file_operations *file_ops; + const struct nlmclnt_operations *nlmclnt_ops; int (*getroot) (struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); -- cgit v1.2.3 From b62ea4112ce3746664dcc2f232d03461f0e6f3c7 Mon Sep 17 00:00:00 2001 From: Martin Kaiser Date: Fri, 21 Apr 2017 16:47:11 +0200 Subject: video: fbdev: imxfb: support AUS mode Some displays require setting AUS mode in the LCDC AUS Mode Control Register to work with the imxfb driver. Like the value of the Panel Configuration Register, the AUS mode setting depends on the display mode. Allow setting AUS mode from the device tree by adding a boolean property. Make this property optional to keep the DT ABI stable. AUS mode can be set only on imx21 and compatible chipsets. Signed-off-by: Martin Kaiser Cc: Sascha Hauer Cc: Rob Herring Cc: Mark Rutland Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/fbdev/imxfb.c | 17 +++++++++++++++++ include/linux/platform_data/video-imxfb.h | 1 + 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/drivers/video/fbdev/imxfb.c b/drivers/video/fbdev/imxfb.c index 1b0faadb3080..c166e0725be5 100644 --- a/drivers/video/fbdev/imxfb.c +++ b/drivers/video/fbdev/imxfb.c @@ -117,6 +117,9 @@ #define IMXFB_LSCR1_DEFAULT 0x00120300 +#define LCDC_LAUSCR 0x80 +#define LAUSCR_AUS_MODE (1<<31) + /* Used fb-mode. Can be set on kernel command line, therefore file-static. */ static const char *fb_mode; @@ -158,6 +161,7 @@ struct imxfb_info { dma_addr_t dbar2; u_int pcr; + u_int lauscr; u_int pwmr; u_int lscr1; u_int dmacr; @@ -422,6 +426,11 @@ static int imxfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) pcr |= imxfb_mode->pcr & ~(0x3f | (7 << 25)); fbi->pcr = pcr; + /* + * The LCDC AUS Mode Control Register does not exist on imx1. 
+ */ + if (!is_imx1_fb(fbi) && imxfb_mode->aus_mode) + fbi->lauscr = LAUSCR_AUS_MODE; /* * Copy the RGB parameters for this display @@ -638,6 +647,9 @@ static int imxfb_activate_var(struct fb_var_screeninfo *var, struct fb_info *inf if (fbi->dmacr) writel(fbi->dmacr, fbi->regs + LCDC_DMACR); + if (fbi->lauscr) + writel(fbi->lauscr, fbi->regs + LCDC_LAUSCR); + return 0; } @@ -734,6 +746,11 @@ static int imxfb_of_read_mode(struct device *dev, struct device_node *np, imxfb_mode->bpp = bpp; imxfb_mode->pcr = pcr; + /* + * fsl,aus-mode is optional + */ + imxfb_mode->aus_mode = of_property_read_bool(np, "fsl,aus-mode"); + return 0; } diff --git a/include/linux/platform_data/video-imxfb.h b/include/linux/platform_data/video-imxfb.h index a5c0a71ec914..cf9348b376ac 100644 --- a/include/linux/platform_data/video-imxfb.h +++ b/include/linux/platform_data/video-imxfb.h @@ -50,6 +50,7 @@ struct imx_fb_videomode { struct fb_videomode mode; u32 pcr; + bool aus_mode; unsigned char bpp; }; -- cgit v1.2.3 From dd77abf8a03a1ebd4dd3ddebecce312dcb0d1af1 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Sun, 19 Mar 2017 11:01:28 +0200 Subject: IB/mlx4: Support RAW Ethernet when RoCE is disabled In some environments, such as certain SR-IOV VF configurations, RoCE isn't supported for mlx4 Ethernet ports. Currently the driver will not open an IB device on that port. This is problematic since we do want user-space RAW Ethernet QPs functionality to remain in place. To that end, enhance the relevant driver flows such that we do create a device instance in that case. Signed-off-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 26 +++++++++++--------------- include/linux/mlx4/device.h | 3 +-- 2 files changed, 12 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index fba94df28cf1..2cc6f21baea1 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2867,23 +2867,19 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) if (mlx4_ib_init_sriov(ibdev)) goto err_mad; - if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE || - dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) { - if (!iboe->nb.notifier_call) { - iboe->nb.notifier_call = mlx4_ib_netdev_event; - err = register_netdevice_notifier(&iboe->nb); - if (err) { - iboe->nb.notifier_call = NULL; - goto err_notif; - } - } - if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) { - err = mlx4_config_roce_v2_port(dev, ROCE_V2_UDP_DPORT); - if (err) { - goto err_notif; - } + if (!iboe->nb.notifier_call) { + iboe->nb.notifier_call = mlx4_ib_netdev_event; + err = register_netdevice_notifier(&iboe->nb); + if (err) { + iboe->nb.notifier_call = NULL; + goto err_notif; } } + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) { + err = mlx4_config_roce_v2_port(dev, ROCE_V2_UDP_DPORT); + if (err) + goto err_notif; + } for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { if (device_create_file(&ibdev->ib_dev.dev, diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 1beb1ec2fbdf..74b765ce48ab 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1011,8 +1011,7 @@ struct mlx4_mad_ifc { #define mlx4_foreach_ib_transport_port(port, dev) \ for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ if (((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_IB) || \ - ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) || \ - ((dev)->caps.flags2 & 
MLX4_DEV_CAP_FLAG2_ROCE_V1_V2)) + ((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_ETH)) #define MLX4_INVALID_SLAVE_ID 0xFF #define MLX4_SINK_COUNTER_INDEX(dev) (dev->caps.max_counters - 1) -- cgit v1.2.3 From 19cc75249adc61401aa4b21a6654e0a7f7ea8fe2 Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Mon, 3 Apr 2017 13:11:03 +0300 Subject: IB/mlx5: Use IP version matching to classify IP traffic This change adds the ability for flow steering to classify IPv4/6 packets with an MPLS tag (Ethertype 0x8847 and 0x8848) as standard IP packets and hit IPv4/6 classified steering rules. When a user added a flow rule with IP classification, the driver implicitly added ethertype matching to the created rule in order to distinguish between IPv4 and IPv6 protocols. Since IP packets with an MPLS tag header have the MPLS ethertype, they missed the rule and ended up hitting the default filters. Such behavior prevented MPLS packets from undergoing inbound traffic load balancing flows (if such were defined by configuring RSS) to achieve higher throughput, the way that non-MPLS IP packets did. Since our device is able to look past the MPLS tag and identify the next protocol, we introduce this solution, which replaces Ethertype matching with the device's capability to perform IP version parsing and matching in order to distinguish between IPv4 and IPv6. Therefore, whenever a flow with an IP spec is added and the device supports IP version matching, the driver will implicitly add IP version matching to the rule (based on the IP spec type) without Ethertype matching, which will cause relevant MPLS tagged packets to hit this rule as well. Otherwise (the device doesn't support IP version matching), we fall back to setting Ethertype matching. If the user's filters specify an L2 ethertype and an IP spec, the rule will then match both the ethertype and the IP version. The device's support for IP version matching is reported via a dedicated capability bit in query_device_cap, named outer/inner_ip_version. 
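To make the matching difference concrete, here is an illustrative sketch of the two frame layouts involved (not part of the patch; real MPLS label stacks may be deeper):

/*
 *  plain IPv4 frame:  [dmac|smac|ethertype 0x0800][IPv4 hdr][payload]
 *  MPLS-tagged IPv4:  [dmac|smac|ethertype 0x8847][MPLS label(s)][IPv4 hdr][payload]
 *
 * A steering rule keyed on ethertype == ETH_P_IP hits only the first
 * frame; a rule keyed on the parsed ip_version == 4 hits both, because
 * the device looks past the MPLS label stack to the next protocol.
 */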
Signed-off-by: Ariel Levkovich Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 66 +++++++++++++++++++++++++++++---------- include/linux/mlx5/mlx5_ifc.h | 6 ++-- 2 files changed, 52 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 1c85c39d1d03..c28b6952b0ab 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1736,8 +1736,11 @@ static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val) offsetof(typeof(filter), field) -\ sizeof(filter.field)) -static int parse_flow_attr(u32 *match_c, u32 *match_v, - const union ib_flow_spec *ib_spec, u32 *tag_id) +#define IPV4_VERSION 4 +#define IPV6_VERSION 6 +static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, + u32 *match_v, const union ib_flow_spec *ib_spec, + u32 *tag_id) { void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, misc_parameters); @@ -1745,17 +1748,22 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v, misc_parameters); void *headers_c; void *headers_v; + int match_ipv; if (ib_spec->type & IB_FLOW_SPEC_INNER) { headers_c = MLX5_ADDR_OF(fte_match_param, match_c, inner_headers); headers_v = MLX5_ADDR_OF(fte_match_param, match_v, inner_headers); + match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.inner_ip_version); } else { headers_c = MLX5_ADDR_OF(fte_match_param, match_c, outer_headers); headers_v = MLX5_ADDR_OF(fte_match_param, match_v, outer_headers); + match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_ip_version); } switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) { @@ -1811,10 +1819,17 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v, if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD)) return -EOPNOTSUPP; - MLX5_SET(fte_match_set_lyr_2_4, headers_c, - ethertype, 0xffff); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, - ethertype, ETH_P_IP); + if (match_ipv) { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + ip_version, 0xf); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + ip_version, IPV4_VERSION); + } else { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + ethertype, 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + ethertype, ETH_P_IP); + } memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, src_ipv4_src_ipv6.ipv4_layout.ipv4), @@ -1843,10 +1858,17 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v, if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD)) return -EOPNOTSUPP; - MLX5_SET(fte_match_set_lyr_2_4, headers_c, - ethertype, 0xffff); - MLX5_SET(fte_match_set_lyr_2_4, headers_v, - ethertype, ETH_P_IPV6); + if (match_ipv) { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + ip_version, 0xf); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + ip_version, IPV6_VERSION); + } else { + MLX5_SET(fte_match_set_lyr_2_4, headers_c, + ethertype, 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, + ethertype, ETH_P_IPV6); + } memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, src_ipv4_src_ipv6.ipv6_layout.ipv6), @@ -1968,10 +1990,16 @@ static bool flow_is_multicast_only(struct ib_flow_attr *ib_attr) is_multicast_ether_addr(eth_spec->val.dst_mac); } -static bool is_valid_ethertype(const struct ib_flow_attr *flow_attr, +static bool is_valid_ethertype(struct mlx5_core_dev *mdev, + const struct ib_flow_attr *flow_attr, bool check_inner) { union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1); + int match_ipv = check_inner ? 
+ MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.inner_ip_version) : + MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_ip_version); int inner_bit = check_inner ? IB_FLOW_SPEC_INNER : 0; bool ipv4_spec_valid, ipv6_spec_valid; unsigned int ip_spec_type = 0; @@ -2002,16 +2030,20 @@ static bool is_valid_ethertype(const struct ib_flow_attr *flow_attr, (ip_spec_type == (IB_FLOW_SPEC_IPV4 | inner_bit)); ipv6_spec_valid = (eth_type == ETH_P_IPV6) && (ip_spec_type == (IB_FLOW_SPEC_IPV6 | inner_bit)); - type_valid = ipv4_spec_valid || ipv6_spec_valid; + + type_valid = (ipv4_spec_valid) || (ipv6_spec_valid) || + (((eth_type == ETH_P_MPLS_UC) || + (eth_type == ETH_P_MPLS_MC)) && match_ipv); } return type_valid; } -static bool is_valid_attr(const struct ib_flow_attr *flow_attr) +static bool is_valid_attr(struct mlx5_core_dev *mdev, + const struct ib_flow_attr *flow_attr) { - return is_valid_ethertype(flow_attr, false) && - is_valid_ethertype(flow_attr, true); + return is_valid_ethertype(mdev, flow_attr, false) && + is_valid_ethertype(mdev, flow_attr, true); } static void put_flow_table(struct mlx5_ib_dev *dev, @@ -2154,7 +2186,7 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, u32 flow_tag = MLX5_FS_DEFAULT_FLOW_TAG; int err = 0; - if (!is_valid_attr(flow_attr)) + if (!is_valid_attr(dev->mdev, flow_attr)) return ERR_PTR(-EINVAL); spec = mlx5_vzalloc(sizeof(*spec)); @@ -2167,7 +2199,7 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, INIT_LIST_HEAD(&handler->list); for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { - err = parse_flow_attr(spec->match_criteria, + err = parse_flow_attr(dev->mdev, spec->match_criteria, spec->match_value, ib_flow, &flow_tag); if (err < 0) goto free; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 7c50bd39b297..4da6e803b627 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -236,7 +236,7 @@ struct mlx5_ifc_flow_table_fields_supported_bits { u8 outer_dmac[0x1]; u8 outer_smac[0x1]; u8 outer_ether_type[0x1]; - u8 reserved_at_3[0x1]; + u8 outer_ip_version[0x1]; u8 outer_first_prio[0x1]; u8 outer_first_cfi[0x1]; u8 outer_first_vid[0x1]; @@ -265,7 +265,7 @@ struct mlx5_ifc_flow_table_fields_supported_bits { u8 inner_dmac[0x1]; u8 inner_smac[0x1]; u8 inner_ether_type[0x1]; - u8 reserved_at_23[0x1]; + u8 inner_ip_version[0x1]; u8 inner_first_prio[0x1]; u8 inner_first_cfi[0x1]; u8 inner_first_vid[0x1]; @@ -371,7 +371,7 @@ struct mlx5_ifc_fte_match_set_lyr_2_4_bits { u8 cvlan_tag[0x1]; u8 svlan_tag[0x1]; u8 frag[0x1]; - u8 reserved_at_93[0x4]; + u8 ip_version[0x4]; u8 tcp_flags[0x9]; u8 tcp_sport[0x10]; -- cgit v1.2.3 From e1f24a79f424ddb03828de7c0152668c9a30146e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 16 Apr 2017 07:29:29 +0300 Subject: IB/mlx5: Support congestion related counters This patch adds support to query the congestion related hardware counters through new command and links them with other hw counters being available in hw_counters sysfs location. In order to reuse existing infrastructure it renames related q_counter data structures to more generic counters to reflect q_counters and congestion counters and maybe some other counters in the future. 
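A note on the register layout used below: the firmware exposes each congestion counter as two consecutive big-endian 32-bit words, <name>_high followed by <name>_low, so a single big-endian 64-bit load at the _high offset recovers the full value. A minimal sketch (hypothetical helper, not driver code):

static u64 read_cong_counter(const void *out, size_t off_high)
{
	/* _high and _low are adjacent, so one __be64 load covers both */
	return be64_to_cpup((const __be64 *)(out + off_high));
}

This is why INIT_CONG_COUNTER in the patch records only the offset of the _high word.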
New hardware counters: * rp_cnp_handled - CNP packets handled by the reaction point * rp_cnp_ignored - CNP packets ignored by the reaction point * np_cnp_sent - CNP packets sent by notification point to respond to CE marked RoCE packets * np_ecn_marked_roce_packets - CE marked RoCE packets received by notification point It also avoids returning ENOSYS which is specific for invalid system call and produces the following checkpatch.pl warning. WARNING: ENOSYS means 'invalid syscall nr' and nothing else + return -ENOSYS; Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/cmd.c | 11 +++ drivers/infiniband/hw/mlx5/cmd.h | 2 + drivers/infiniband/hw/mlx5/main.c | 172 ++++++++++++++++++++++++----------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 7 +- drivers/infiniband/hw/mlx5/qp.c | 7 +- include/linux/mlx5/mlx5_ifc.h | 18 ++-- 6 files changed, 150 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index cdc2d3017da7..18d5e1db93ed 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -46,3 +46,14 @@ int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey) null_mkey); return err; } + +int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev, + bool reset, void *out, int out_size) +{ + u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = { }; + + MLX5_SET(query_cong_statistics_in, in, opcode, + MLX5_CMD_OP_QUERY_CONG_STATISTICS); + MLX5_SET(query_cong_statistics_in, in, clear, reset); + return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size); +} diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index 7ca8a7b6434d..fa09228193a6 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -37,4 +37,6 @@ #include int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey); +int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev, + bool reset, void *out, int out_size); #endif /* MLX5_IB_CMD_H */ diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index cca3848a2671..e81391901260 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -57,6 +57,7 @@ #include #include #include "mlx5_ib.h" +#include "cmd.h" #define DRIVER_NAME "mlx5_ib" #define DRIVER_VERSION "2.2-1" @@ -3213,7 +3214,7 @@ static void mlx5_disable_eth(struct mlx5_ib_dev *dev) mlx5_nic_vport_disable_roce(dev->mdev); } -struct mlx5_ib_q_counter { +struct mlx5_ib_counter { const char *name; size_t offset; }; @@ -3221,18 +3222,18 @@ struct mlx5_ib_q_counter { #define INIT_Q_COUNTER(_name) \ { .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)} -static const struct mlx5_ib_q_counter basic_q_cnts[] = { +static const struct mlx5_ib_counter basic_q_cnts[] = { INIT_Q_COUNTER(rx_write_requests), INIT_Q_COUNTER(rx_read_requests), INIT_Q_COUNTER(rx_atomic_requests), INIT_Q_COUNTER(out_of_buffer), }; -static const struct mlx5_ib_q_counter out_of_seq_q_cnts[] = { +static const struct mlx5_ib_counter out_of_seq_q_cnts[] = { INIT_Q_COUNTER(out_of_sequence), }; -static const struct mlx5_ib_q_counter retrans_q_cnts[] = { +static const struct mlx5_ib_counter retrans_q_cnts[] = { INIT_Q_COUNTER(duplicate_request), INIT_Q_COUNTER(rnr_nak_retry_err), INIT_Q_COUNTER(packet_seq_err), @@ -3240,22 +3241,31 @@ static const struct mlx5_ib_q_counter retrans_q_cnts[] = { 
INIT_Q_COUNTER(local_ack_timeout_err), }; -static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev) +#define INIT_CONG_COUNTER(_name) \ + { .name = #_name, .offset = \ + MLX5_BYTE_OFF(query_cong_statistics_out, _name ## _high)} + +static const struct mlx5_ib_counter cong_cnts[] = { + INIT_CONG_COUNTER(rp_cnp_ignored), + INIT_CONG_COUNTER(rp_cnp_handled), + INIT_CONG_COUNTER(np_ecn_marked_roce_packets), + INIT_CONG_COUNTER(np_cnp_sent), +}; + +static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) { unsigned int i; for (i = 0; i < dev->num_ports; i++) { mlx5_core_dealloc_q_counter(dev->mdev, - dev->port[i].q_cnts.set_id); - kfree(dev->port[i].q_cnts.names); - kfree(dev->port[i].q_cnts.offsets); + dev->port[i].cnts.set_id); + kfree(dev->port[i].cnts.names); + kfree(dev->port[i].cnts.offsets); } } -static int __mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev, - const char ***names, - size_t **offsets, - u32 *num) +static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev, + struct mlx5_ib_counters *cnts) { u32 num_counters; @@ -3266,27 +3276,32 @@ static int __mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev, if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) num_counters += ARRAY_SIZE(retrans_q_cnts); + cnts->num_q_counters = num_counters; - *names = kcalloc(num_counters, sizeof(**names), GFP_KERNEL); - if (!*names) + if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { + cnts->num_cong_counters = ARRAY_SIZE(cong_cnts); + num_counters += ARRAY_SIZE(cong_cnts); + } + + cnts->names = kcalloc(num_counters, sizeof(cnts->names), GFP_KERNEL); + if (!cnts->names) return -ENOMEM; - *offsets = kcalloc(num_counters, sizeof(**offsets), GFP_KERNEL); - if (!*offsets) + cnts->offsets = kcalloc(num_counters, + sizeof(cnts->offsets), GFP_KERNEL); + if (!cnts->offsets) goto err_names; - *num = num_counters; - return 0; err_names: - kfree(*names); + kfree(cnts->names); return -ENOMEM; } -static void mlx5_ib_fill_q_counters(struct mlx5_ib_dev *dev, - const char **names, - size_t *offsets) +static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, + const char **names, + size_t *offsets) { int i; int j = 0; @@ -3309,9 +3324,16 @@ static void mlx5_ib_fill_q_counters(struct mlx5_ib_dev *dev, offsets[j] = retrans_q_cnts[i].offset; } } + + if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { + for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) { + names[j] = cong_cnts[i].name; + offsets[j] = cong_cnts[i].offset; + } + } } -static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev) +static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev) { int i; int ret; @@ -3320,7 +3342,7 @@ static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev) struct mlx5_ib_port *port = &dev->port[i]; ret = mlx5_core_alloc_q_counter(dev->mdev, - &port->q_cnts.set_id); + &port->cnts.set_id); if (ret) { mlx5_ib_warn(dev, "couldn't allocate queue counter for port %d, err %d\n", @@ -3328,15 +3350,12 @@ static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev) goto dealloc_counters; } - ret = __mlx5_ib_alloc_q_counters(dev, - &port->q_cnts.names, - &port->q_cnts.offsets, - &port->q_cnts.num_counters); + ret = __mlx5_ib_alloc_counters(dev, &port->cnts); if (ret) goto dealloc_counters; - mlx5_ib_fill_q_counters(dev, port->q_cnts.names, - port->q_cnts.offsets); + mlx5_ib_fill_counters(dev, port->cnts.names, + port->cnts.offsets); } return 0; @@ -3344,7 +3363,7 @@ static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev) dealloc_counters: while (--i >= 0) mlx5_core_dealloc_q_counter(dev->mdev, - dev->port[i].q_cnts.set_id); 
+ dev->port[i].cnts.set_id); return ret; } @@ -3359,44 +3378,93 @@ static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev, if (port_num == 0) return NULL; - return rdma_alloc_hw_stats_struct(port->q_cnts.names, - port->q_cnts.num_counters, + return rdma_alloc_hw_stats_struct(port->cnts.names, + port->cnts.num_q_counters + + port->cnts.num_cong_counters, RDMA_HW_STATS_DEFAULT_LIFESPAN); } -static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, - struct rdma_hw_stats *stats, - u8 port_num, int index) +static int mlx5_ib_query_q_counters(struct mlx5_ib_dev *dev, + struct mlx5_ib_port *port, + struct rdma_hw_stats *stats) { - struct mlx5_ib_dev *dev = to_mdev(ibdev); - struct mlx5_ib_port *port = &dev->port[port_num - 1]; int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out); void *out; __be32 val; - int ret; - int i; - - if (!stats) - return -ENOSYS; + int ret, i; out = mlx5_vzalloc(outlen); if (!out) return -ENOMEM; ret = mlx5_core_query_q_counter(dev->mdev, - port->q_cnts.set_id, 0, + port->cnts.set_id, 0, out, outlen); if (ret) goto free; - for (i = 0; i < port->q_cnts.num_counters; i++) { - val = *(__be32 *)(out + port->q_cnts.offsets[i]); + for (i = 0; i < port->cnts.num_q_counters; i++) { + val = *(__be32 *)(out + port->cnts.offsets[i]); stats->value[i] = (u64)be32_to_cpu(val); } free: kvfree(out); - return port->q_cnts.num_counters; + return ret; +} + +static int mlx5_ib_query_cong_counters(struct mlx5_ib_dev *dev, + struct mlx5_ib_port *port, + struct rdma_hw_stats *stats) +{ + int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out); + void *out; + int ret, i; + int offset = port->cnts.num_q_counters; + + out = mlx5_vzalloc(outlen); + if (!out) + return -ENOMEM; + + ret = mlx5_cmd_query_cong_counter(dev->mdev, false, out, outlen); + if (ret) + goto free; + + for (i = 0; i < port->cnts.num_cong_counters; i++) { + stats->value[i + offset] = + be64_to_cpup((__be64 *)(out + + port->cnts.offsets[i + offset])); + } + +free: + kvfree(out); + return ret; +} + +static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u8 port_num, int index) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_port *port = &dev->port[port_num - 1]; + int ret, num_counters; + + if (!stats) + return -EINVAL; + + ret = mlx5_ib_query_q_counters(dev, port, stats); + if (ret) + return ret; + num_counters = port->cnts.num_q_counters; + + if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { + ret = mlx5_ib_query_cong_counters(dev, port, stats); + if (ret) + return ret; + num_counters += port->cnts.num_cong_counters; + } + + return num_counters; } static void *mlx5_ib_add(struct mlx5_core_dev *mdev) @@ -3603,14 +3671,14 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) goto err_rsrc; if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) { - err = mlx5_ib_alloc_q_counters(dev); + err = mlx5_ib_alloc_counters(dev); if (err) goto err_odp; } dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev); if (!dev->mdev->priv.uar) - goto err_q_cnt; + goto err_cnt; err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false); if (err) @@ -3654,9 +3722,9 @@ err_bfreg: err_uar_page: mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar); -err_q_cnt: +err_cnt: if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) - mlx5_ib_dealloc_q_counters(dev); + mlx5_ib_dealloc_counters(dev); err_odp: mlx5_ib_odp_remove_one(dev); @@ -3690,7 +3758,7 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) mlx5_free_bfreg(dev->mdev, &dev->bfreg); mlx5_put_uars_page(dev->mdev, mdev->priv.uar); if 
(MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) - mlx5_ib_dealloc_q_counters(dev); + mlx5_ib_dealloc_counters(dev); destroy_umrc_res(dev); mlx5_ib_odp_remove_one(dev); destroy_dev_resources(&dev->devr); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index ce8ba617d46e..191b82b038d7 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -595,15 +595,16 @@ struct mlx5_ib_resources { struct mutex mutex; }; -struct mlx5_ib_q_counters { +struct mlx5_ib_counters { const char **names; size_t *offsets; - u32 num_counters; + u32 num_q_counters; + u32 num_cong_counters; u16 set_id; }; struct mlx5_ib_port { - struct mlx5_ib_q_counters q_cnts; + struct mlx5_ib_counters cnts; }; struct mlx5_roce { diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index ed6320186f89..4e5a811d33c7 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2799,7 +2799,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, qp->port) - 1; mibport = &dev->port[port_num]; context->qp_counter_set_usr_page |= - cpu_to_be32((u32)(mibport->q_cnts.set_id) << 24); + cpu_to_be32((u32)(mibport->cnts.set_id) << 24); } if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) @@ -2827,7 +2827,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, raw_qp_param.operation = op; if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { - raw_qp_param.rq_q_ctr_id = mibport->q_cnts.set_id; + raw_qp_param.rq_q_ctr_id = mibport->cnts.set_id; raw_qp_param.set_mask |= MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID; } @@ -4965,7 +4965,8 @@ int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, if (MLX5_CAP_GEN(dev->mdev, modify_rq_counter_set_id)) { MLX5_SET64(modify_rq_in, in, modify_bitmask, MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID); - MLX5_SET(rqc, rqc, counter_set_id, dev->port->q_cnts.set_id); + MLX5_SET(rqc, rqc, counter_set_id, + dev->port->cnts.set_id); } else pr_info_once("%s: Receive WQ counters are not supported on current FW\n", dev->ib_dev.name); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4da6e803b627..954f42c268a4 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4735,17 +4735,17 @@ struct mlx5_ifc_query_cong_statistics_out_bits { u8 reserved_at_40[0x40]; - u8 cur_flows[0x20]; + u8 rp_cur_flows[0x20]; u8 sum_flows[0x20]; - u8 cnp_ignored_high[0x20]; + u8 rp_cnp_ignored_high[0x20]; - u8 cnp_ignored_low[0x20]; + u8 rp_cnp_ignored_low[0x20]; - u8 cnp_handled_high[0x20]; + u8 rp_cnp_handled_high[0x20]; - u8 cnp_handled_low[0x20]; + u8 rp_cnp_handled_low[0x20]; u8 reserved_at_140[0x100]; @@ -4755,13 +4755,13 @@ struct mlx5_ifc_query_cong_statistics_out_bits { u8 accumulators_period[0x20]; - u8 ecn_marked_roce_packets_high[0x20]; + u8 np_ecn_marked_roce_packets_high[0x20]; - u8 ecn_marked_roce_packets_low[0x20]; + u8 np_ecn_marked_roce_packets_low[0x20]; - u8 cnps_sent_high[0x20]; + u8 np_cnp_sent_high[0x20]; - u8 cnps_sent_low[0x20]; + u8 np_cnp_sent_low[0x20]; u8 reserved_at_320[0x560]; }; -- cgit v1.2.3 From 2bca34455b257d75080d87e800ae14afe49001bf Mon Sep 17 00:00:00 2001 From: Vignesh R Date: Tue, 11 Apr 2017 17:22:24 +0530 Subject: spi: Add can_dma like interface for spi_flash_read Add an interface analogous to ->can_dma() for spi_flash_read() interface. This will enable SPI controller drivers to inform SPI core when not to do DMA mappings. 
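A controller driver would wire this up roughly as follows (a minimal sketch; the driver name and size threshold are invented for illustration):

/* Skip the DMA mapping for reads too short to be worth it. */
static bool foo_spi_flash_can_dma(struct spi_device *spi,
				  struct spi_flash_read_message *msg)
{
	return msg->len >= 32;	/* use PIO below this size */
}

	...
	master->spi_flash_can_dma = foo_spi_flash_can_dma;

Note that the spi.c hunk below calls the hook whenever dma_rx is set, so a controller that provides both dma_rx and spi_flash_read is expected to supply it.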
Signed-off-by: Vignesh R Signed-off-by: Mark Brown --- drivers/spi/spi.c | 2 +- include/linux/spi/spi.h | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 656dd3e3220c..5e1bb43b8a8f 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2811,7 +2811,7 @@ int spi_flash_read(struct spi_device *spi, mutex_lock(&master->bus_lock_mutex); mutex_lock(&master->io_mutex); - if (master->dma_rx) { + if (master->dma_rx && master->spi_flash_can_dma(spi, msg)) { rx_dev = master->dma_rx->device->dev; ret = spi_map_buf(master, rx_dev, &msg->rx_sg, msg->buf, msg->len, diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 75c6bd0ac605..cd8ae65568e3 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -375,6 +375,8 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * @unprepare_message: undo any work done by prepare_message(). * @spi_flash_read: to support spi-controller hardwares that provide * accelerated interface to read from flash devices. + * @spi_flash_can_dma: analogous to can_dma() interface, but for + * controllers implementing spi_flash_read. * @flash_read_supported: spi device supports flash read * @cs_gpios: Array of GPIOs to use as chip select lines; one per CS * number. Any individual value may be -ENOENT for CS lines that @@ -538,6 +540,8 @@ struct spi_master { struct spi_message *message); int (*spi_flash_read)(struct spi_device *spi, struct spi_flash_read_message *msg); + bool (*spi_flash_can_dma)(struct spi_device *spi, + struct spi_flash_read_message *msg); bool (*flash_read_supported)(struct spi_device *spi); /* -- cgit v1.2.3 From 7acf8a1e8a28b3d7407a8d8061a7d0766cfac2f4 Mon Sep 17 00:00:00 2001 From: Matthew Whitehead Date: Wed, 19 Apr 2017 12:37:10 -0400 Subject: Replace 2 jiffies with sysctl netdev_budget_usecs to enable softirq tuning Constants used for tuning are generally a bad idea, especially as hardware changes over time. Replace the constant 2 jiffies with sysctl variable netdev_budget_usecs to enable sysadmins to tune the softirq processing. Also document the variable. For example, a very fast machine might tune this to 1000 microseconds, while my regression testing 486DX-25 needs it to be 4000 microseconds on a nearly idle network to prevent time_squeeze from being incremented. Version 2: changed jiffies to microseconds for predictable units. Signed-off-by: Matthew Whitehead Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 11 ++++++++++- include/linux/netdevice.h | 1 + include/uapi/linux/sysctl.h | 1 + kernel/sysctl_binary.c | 1 + net/core/dev.c | 4 +++- net/core/sysctl_net_core.c | 8 ++++++++ 6 files changed, 24 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index 2ebabc93014a..14db18c970b1 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -188,7 +188,16 @@ netdev_budget Maximum number of packets taken from all interfaces in one polling cycle (NAPI poll). In one polling cycle interfaces which are registered to polling are -probed in a round-robin manner. +probed in a round-robin manner. Also, a polling cycle may not exceed +netdev_budget_usecs microseconds, even if netdev_budget has not been +exhausted. + +netdev_budget_usecs +--------------------- + +Maximum number of microseconds in one NAPI polling cycle. 
Polling +will exit when either netdev_budget_usecs have elapsed during the +poll cycle or the number of packets processed reaches netdev_budget. netdev_max_backlog ------------------ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0f3c38ce5417..c49cf21f2b31 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3296,6 +3296,7 @@ static __always_inline int ____dev_forward_skb(struct net_device *dev, void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); extern int netdev_budget; +extern unsigned int netdev_budget_usecs; /* Called by rtnetlink.c:rtnl_unlock() */ void netdev_run_todo(void); diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index e13d48058b8d..177f5f139b36 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -274,6 +274,7 @@ enum NET_CORE_AEVENT_ETIME=20, NET_CORE_AEVENT_RSEQTH=21, NET_CORE_WARNINGS=22, + NET_CORE_BUDGET_USECS=23, }; /* /proc/sys/net/ethernet */ diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index ece4b177052b..4ee3e49530d2 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -197,6 +197,7 @@ static const struct bin_table bin_net_core_table[] = { { CTL_INT, NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" }, { CTL_INT, NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" }, { CTL_INT, NET_CORE_WARNINGS, "warnings" }, + { CTL_INT, NET_CORE_BUDGET_USECS, "netdev_budget_usecs" }, {}, }; diff --git a/net/core/dev.c b/net/core/dev.c index 5d33e2baab2b..1c53c055b197 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3441,6 +3441,7 @@ EXPORT_SYMBOL(netdev_max_backlog); int netdev_tstamp_prequeue __read_mostly = 1; int netdev_budget __read_mostly = 300; +unsigned int __read_mostly netdev_budget_usecs = 2000; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ @@ -5307,7 +5308,8 @@ out_unlock: static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); - unsigned long time_limit = jiffies + 2; + unsigned long time_limit = jiffies + + usecs_to_jiffies(netdev_budget_usecs); int budget = netdev_budget; LIST_HEAD(list); LIST_HEAD(repoll); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 7f9cc400eca0..ea23254b2457 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -452,6 +452,14 @@ static struct ctl_table net_core_table[] = { .extra1 = &one, .extra2 = &max_skb_frags, }, + { + .procname = "netdev_budget_usecs", + .data = &netdev_budget_usecs, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, { } }; -- cgit v1.2.3 From 3073f070a137e140e3faefa87f2446a8deffc07f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Feb 2017 23:13:25 -0500 Subject: switch memcpy_from_msg() to copy_from_iter_full() Signed-off-by: Al Viro --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c776abd86937..53383bce27f1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3113,7 +3113,7 @@ struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy, static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len) { - return copy_from_iter(data, len, &msg->msg_iter) == len ? 
0 : -EFAULT; } static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len) -- cgit v1.2.3 From 4f757f3cbf54edef7b75c68d6d6d2f1a0ca08d2e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Apr 2017 17:31:22 -0400 Subject: make sure that mntns_install() doesn't end up with referral for root new flag: LOOKUP_DOWN. If the starting point is overmounted, cross into whatever's mounted on top, triggering referrals et al. Use that instead of follow_down_one() loop in mntns_install(), handle errors properly. Signed-off-by: Al Viro --- fs/namei.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/namespace.c | 18 +++++++++++------- include/linux/namei.h | 1 + 3 files changed, 50 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index 60c0a78ebca7..646db9cf2579 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2252,6 +2252,35 @@ static inline int lookup_last(struct nameidata *nd) return walk_component(nd, 0); } +static int handle_lookup_down(struct nameidata *nd) +{ + struct path path = nd->path; + struct inode *inode = nd->inode; + unsigned seq = nd->seq; + int err; + + if (nd->flags & LOOKUP_RCU) { + /* + * don't bother with unlazy_walk on failure - we are + * at the very beginning of walk, so we lose nothing + * if we simply redo everything in non-RCU mode + */ + if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq))) + return -ECHILD; + } else { + dget(path.dentry); + err = follow_managed(&path, nd); + if (unlikely(err < 0)) + return err; + inode = d_backing_inode(path.dentry); + seq = 0; + } + path_to_nameidata(&path, nd); + nd->inode = inode; + nd->seq = seq; + return 0; +} + /* Returns 0 and nd will be valid on success; Returns error, otherwise. 
*/ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { @@ -2260,6 +2289,15 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path if (IS_ERR(s)) return PTR_ERR(s); + + if (unlikely(flags & LOOKUP_DOWN)) { + err = handle_lookup_down(nd); + if (unlikely(err < 0)) { + terminate_walk(nd); + return err; + } + } + while (!(err = link_path_walk(s, nd)) && ((err = lookup_last(nd)) > 0)) { s = trailing_symlink(nd); diff --git a/fs/namespace.c b/fs/namespace.c index cc1375eff88c..0886ef28bff6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3465,8 +3465,9 @@ static void mntns_put(struct ns_common *ns) static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) { struct fs_struct *fs = current->fs; - struct mnt_namespace *mnt_ns = to_mnt_ns(ns); + struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns; struct path root; + int err; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || @@ -3477,15 +3478,18 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) return -EINVAL; get_mnt_ns(mnt_ns); - put_mnt_ns(nsproxy->mnt_ns); + old_mnt_ns = nsproxy->mnt_ns; nsproxy->mnt_ns = mnt_ns; /* Find the root */ - root.mnt = &mnt_ns->root->mnt; - root.dentry = mnt_ns->root->mnt.mnt_root; - path_get(&root); - while(d_mountpoint(root.dentry) && follow_down_one(&root)) - ; + err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt, + "/", LOOKUP_DOWN, &root); + if (err) { + /* revert to old namespace */ + nsproxy->mnt_ns = old_mnt_ns; + put_mnt_ns(mnt_ns); + return err; + } /* Update the pwd and root */ set_fs_pwd(fs, &root); diff --git a/include/linux/namei.h b/include/linux/namei.h index f29abda31e6d..8b4794e83196 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -44,6 +44,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_JUMPED 0x1000 #define LOOKUP_ROOT 0x2000 #define LOOKUP_EMPTY 0x4000 +#define LOOKUP_DOWN 0x8000 extern int path_pts(struct path *path); -- cgit v1.2.3 From 19b7ccf8651df09d274671b53039c672a52ad84d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 18 Apr 2017 18:43:20 +0200 Subject: block: get rid of blk_integrity_revalidate() Commit 25520d55cdb6 ("block: Inline blk_integrity in struct gendisk") introduced blk_integrity_revalidate(), which seems to assume ownership of the stable pages flag and unilaterally clears it if no blk_integrity profile is registered: if (bi->profile) disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; else disk->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES; It's called from revalidate_disk() and rescan_partitions(), making it impossible to enable stable pages for drivers that support partitions and don't use blk_integrity: while the call in revalidate_disk() can be trivially worked around (see zram, which doesn't support partitions and hence gets away with zram_revalidate_disk()), rescan_partitions() can be triggered from userspace at any time. This breaks rbd, where the ceph messenger is responsible for generating/verifying CRCs. Since blk_integrity_{un,}register() "must" be used for (un)registering the integrity profile with the block layer, move BDI_CAP_STABLE_WRITES setting there. This way drivers that call blk_integrity_register() and use integrity infrastructure won't interfere with drivers that don't but still want stable pages. Fixes: 25520d55cdb6 ("block: Inline blk_integrity in struct gendisk") Cc: "Martin K. 
Petersen" Cc: Christoph Hellwig Cc: Mike Snitzer Cc: stable@vger.kernel.org # 4.4+, needs backporting Tested-by: Dan Williams Signed-off-by: Ilya Dryomov Signed-off-by: Jens Axboe --- block/blk-integrity.c | 19 ++----------------- block/partition-generic.c | 1 - fs/block_dev.c | 1 - include/linux/genhd.h | 2 -- 4 files changed, 2 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index b3622cb00fc2..ce43a8214d3e 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -417,7 +417,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template bi->tuple_size = template->tuple_size; bi->tag_size = template->tag_size; - blk_integrity_revalidate(disk); + disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; } EXPORT_SYMBOL(blk_integrity_register); @@ -430,26 +430,11 @@ EXPORT_SYMBOL(blk_integrity_register); */ void blk_integrity_unregister(struct gendisk *disk) { - blk_integrity_revalidate(disk); + disk->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES; memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity)); } EXPORT_SYMBOL(blk_integrity_unregister); -void blk_integrity_revalidate(struct gendisk *disk) -{ - struct blk_integrity *bi = &disk->queue->integrity; - - if (!(disk->flags & GENHD_FL_UP)) - return; - - if (bi->profile) - disk->queue->backing_dev_info->capabilities |= - BDI_CAP_STABLE_WRITES; - else - disk->queue->backing_dev_info->capabilities &= - ~BDI_CAP_STABLE_WRITES; -} - void blk_integrity_add(struct gendisk *disk) { if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, diff --git a/block/partition-generic.c b/block/partition-generic.c index 7afb9907821f..0171a2faad68 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -497,7 +497,6 @@ rescan: if (disk->fops->revalidate_disk) disk->fops->revalidate_disk(disk); - blk_integrity_revalidate(disk); check_disk_size_change(disk, bdev); bdev->bd_invalidated = 0; if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) diff --git a/fs/block_dev.c b/fs/block_dev.c index e405d8e58e31..9ccabe3bb7de 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1453,7 +1453,6 @@ int revalidate_disk(struct gendisk *disk) if (disk->fops->revalidate_disk) ret = disk->fops->revalidate_disk(disk); - blk_integrity_revalidate(disk); bdev = bdget_disk(disk, 0); if (!bdev) return ret; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 9e11082c7f9b..acff9437e5c3 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -722,11 +722,9 @@ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) #if defined(CONFIG_BLK_DEV_INTEGRITY) extern void blk_integrity_add(struct gendisk *); extern void blk_integrity_del(struct gendisk *); -extern void blk_integrity_revalidate(struct gendisk *); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline void blk_integrity_add(struct gendisk *disk) { } static inline void blk_integrity_del(struct gendisk *disk) { } -static inline void blk_integrity_revalidate(struct gendisk *disk) { } #endif /* CONFIG_BLK_DEV_INTEGRITY */ #else /* CONFIG_BLOCK */ -- cgit v1.2.3 From 6c478ae9204b489f6228e4b535c6ac72851e06d8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 17 Apr 2017 22:10:04 -0500 Subject: signal: Make kill_proc_info static There are no users outside of signal.c so make the function static so the compiler and other developers have that information. Signed-off-by: "Eric W. 
Biederman" --- include/linux/sched/signal.h | 1 - kernel/signal.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 2cf446704cd4..c06d63b3a583 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -293,7 +293,6 @@ extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *, const struct cred *, u32); extern int kill_pgrp(struct pid *pid, int sig, int priv); extern int kill_pid(struct pid *pid, int sig, int priv); -extern int kill_proc_info(int, struct siginfo *, pid_t); extern __must_check bool do_notify_parent(struct task_struct *, int); extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int, struct task_struct *); diff --git a/kernel/signal.c b/kernel/signal.c index 7e59ebc2c25e..a8c54f384553 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1318,7 +1318,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) } } -int kill_proc_info(int sig, struct siginfo *info, pid_t pid) +static int kill_proc_info(int sig, struct siginfo *info, pid_t pid) { int error; rcu_read_lock(); -- cgit v1.2.3 From cf0c3e68aa81f992b0301f62e341b710d385bf68 Mon Sep 17 00:00:00 2001 From: Jeroen Hofstee Date: Fri, 21 Apr 2017 15:21:11 +0900 Subject: kbuild: fix asm-offset generation to work with clang KBuild abuses the asm statement to write to a file and clang chokes on these invalid asm statements. Hack it even more by fooling it into believing this is actual valid asm code. [masahiro: Import Jeroen's work for U-Boot: http://patchwork.ozlabs.org/patch/375026/ Tweak sed script a little to avoid garbage '#' for GCC case, like #define NR_PAGEFLAGS 23 /* __NR_PAGEFLAGS # */ ] Signed-off-by: Jeroen Hofstee Signed-off-by: Masahiro Yamada Reviewed-by: Matthias Kaehlcke Tested-by: Matthias Kaehlcke --- include/linux/kbuild.h | 6 +++--- scripts/Makefile.lib | 8 ++++++-- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kbuild.h b/include/linux/kbuild.h index 22a72198c14b..4e80f3a9ad58 100644 --- a/include/linux/kbuild.h +++ b/include/linux/kbuild.h @@ -2,14 +2,14 @@ #define __LINUX_KBUILD_H #define DEFINE(sym, val) \ - asm volatile("\n->" #sym " %0 " #val : : "i" (val)) + asm volatile("\n.ascii \"->" #sym " %0 " #val "\"" : : "i" (val)) -#define BLANK() asm volatile("\n->" : : ) +#define BLANK() asm volatile("\n.ascii \"->\"" : : ) #define OFFSET(sym, str, mem) \ DEFINE(sym, offsetof(struct str, mem)) #define COMMENT(x) \ - asm volatile("\n->#" x) + asm volatile("\n.ascii \"->#" x "\"") #endif diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 8109c133887d..2209ed696fb4 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -413,10 +413,14 @@ cmd_xzmisc = (cat $(filter-out FORCE,$^) | \ # --------------------------------------------------------------------------- # Default sed regexp - multiline due to syntax constraints +# +# Use [:space:] because LLVM's integrated assembler inserts a tab around +# the .ascii directive whereas GCC keeps the tab as-is. 
define sed-offsets - "/^->/{s:->#\(.*\):/* \1 */:; \ 's:^[[:space:]]*\.ascii[[:space:]]*"\(.*\)".*:\1:; \ + /^->/{s:->#\(.*\):/* \1 */:; \ s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \ - s:->::; p;}" + s:->::; p;}' endef # Use filechk to avoid rebuilds when a header changes, but the resulting file -- cgit v1.2.3 From 8ba4fcdf0f4068407e98cd9cc0f230c2dd8d56de Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Wed, 19 Apr 2017 09:47:22 +0800 Subject: module: Unify the return value type of try_module_get The prototypes of try_module_get differ under the different config macros: with module unloading enabled it returns bool, but the other variants return int. Make the return type for try_module_get consistent across all module config options. Signed-off-by: Gao Feng [jeyu: slightly amended changelog to make it clearer] Signed-off-by: Jessica Yu --- include/linux/module.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 0297c5cd7cdf..6b79eb76a523 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -582,7 +582,7 @@ extern bool try_module_get(struct module *module); extern void module_put(struct module *module); #else /*!CONFIG_MODULE_UNLOAD*/ -static inline int try_module_get(struct module *module) +static inline bool try_module_get(struct module *module) { return !module || module_is_live(module); } @@ -674,9 +674,9 @@ static inline void __module_get(struct module *module) { } -static inline int try_module_get(struct module *module) +static inline bool try_module_get(struct module *module) { - return 1; + return true; } static inline void module_put(struct module *module) -- cgit v1.2.3 From e2460f2a4bc740fae9e23f14d653cf53e90b3f9a Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 18 Apr 2017 16:51:48 -0400 Subject: dm: mark targets that pass integrity data A dm-crypt device stacked on a dm-integrity device incorrectly advertises an integrity profile on the DM crypt device. It can be seen in the files "/sys/block/dm-*/integrity/*" that both the dm-integrity and dm-crypt targets advertise the integrity profile. That is incorrect; only the dm-integrity target should advertise the integrity profile. A general problem in DM is that if we have a DM device that depends on another device with an integrity profile, the upper device will always advertise the integrity profile, even when the target driver doesn't support handling integrity data. Most targets don't support integrity data, so we provide a whitelist of targets that support it (linear, delay and striped). The targets that support passing integrity data to the lower device are marked with the flag DM_TARGET_PASSES_INTEGRITY. The DM core will now advertise integrity data on a DM device only if all the targets support the integrity data. 
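For a pass-through target, opting in is a one-line addition to its target_type, as the linear/delay/striped hunks below illustrate; a minimal sketch with an invented target name:

static struct target_type example_target = {
	.name     = "example",
	.version  = {1, 0, 0},
	/* passes bios through unmodified, so integrity data survives */
	.features = DM_TARGET_PASSES_INTEGRITY,
	.module   = THIS_MODULE,
	/* .ctr, .dtr, .map, ... as usual */
};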
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 1 + drivers/md/dm-linear.c | 1 + drivers/md/dm-stripe.c | 1 + drivers/md/dm-table.c | 7 +++++++ drivers/md/dm.c | 16 +++++++++++++--- include/linux/device-mapper.h | 6 ++++++ 6 files changed, 29 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index cc70871a6d29..ae3158795d26 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -340,6 +340,7 @@ out: static struct target_type delay_target = { .name = "delay", .version = {1, 2, 1}, + .features = DM_TARGET_PASSES_INTEGRITY, .module = THIS_MODULE, .ctr = delay_ctr, .dtr = delay_dtr, diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 4788b0b989a9..ffa0c9c5968a 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -162,6 +162,7 @@ static long linear_direct_access(struct dm_target *ti, sector_t sector, static struct target_type linear_target = { .name = "linear", .version = {1, 3, 0}, + .features = DM_TARGET_PASSES_INTEGRITY, .module = THIS_MODULE, .ctr = linear_ctr, .dtr = linear_dtr, diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 28193a57bf47..d7e1b86e7570 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -440,6 +440,7 @@ static void stripe_io_hints(struct dm_target *ti, static struct target_type stripe_target = { .name = "striped", .version = {1, 6, 0}, + .features = DM_TARGET_PASSES_INTEGRITY, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 7fb29db478cd..c68757f94e16 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1135,6 +1135,13 @@ static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t) struct list_head *devices = dm_table_get_devices(t); struct dm_dev_internal *dd = NULL; struct gendisk *prev_disk = NULL, *template_disk = NULL; + unsigned i; + + for (i = 0; i < dm_table_get_num_targets(t); i++) { + struct dm_target *ti = dm_table_get_target(t, i); + if (!dm_target_passes_integrity(ti->type)) + goto no_integrity; + } list_for_each_entry(dd, devices, list) { template_disk = dd->dm_dev->bdev->bd_disk; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f4ffd1eb8f44..e602ae0d5d75 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1089,8 +1089,18 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio, __bio_clone_fast(clone, bio); - if (bio_integrity(bio)) { - int r = bio_integrity_clone(clone, bio, GFP_NOIO); + if (unlikely(bio_integrity(bio) != NULL)) { + int r; + + if (unlikely(!dm_target_has_integrity(tio->ti->type) && + !dm_target_passes_integrity(tio->ti->type))) { + DMWARN("%s: the target %s doesn't support integrity data.", + dm_device_name(tio->io->md), + tio->ti->type->name); + return -EIO; + } + + r = bio_integrity_clone(clone, bio, GFP_NOIO); if (r < 0) return r; } @@ -1098,7 +1108,7 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio, bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); clone->bi_iter.bi_size = to_bytes(len); - if (bio_integrity(bio)) + if (unlikely(bio_integrity(bio) != NULL)) bio_integrity_trim(clone, 0, len); return 0; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 874462153f14..98f981026e4e 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -227,6 +227,12 @@ typedef unsigned (*dm_num_write_bios_fn) (struct dm_target *ti, struct bio *bio) #define 
DM_TARGET_INTEGRITY 0x00000010 #define dm_target_has_integrity(type) ((type)->features & DM_TARGET_INTEGRITY) +/* + * A target passes integrity data to the lower device. + */ +#define DM_TARGET_PASSES_INTEGRITY 0x00000020 +#define dm_target_passes_integrity(type) ((type)->features & DM_TARGET_PASSES_INTEGRITY) + struct dm_target { struct dm_table *table; struct target_type *type; -- cgit v1.2.3 From 49632b5822ea2af0e9531f8d20dcd5fb786093a9 Mon Sep 17 00:00:00 2001 From: "sudarsana.kalluru@cavium.com" Date: Thu, 20 Apr 2017 22:31:20 -0700 Subject: qed: Add support for static dcbx. The patch adds driver support for static/local dcbx mode. In this mode the adapter brings up the dcbx link with locally configured parameters instead of performing the dcbx negotiation with the peer. The feature is useful when the peer device/switch doesn't support dcbx. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_dcbx.c | 24 +++++++++++++++++++----- include/linux/qed/qed_if.h | 1 + 2 files changed, 20 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c index 40a4975443e4..233ca9727388 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c @@ -680,8 +680,14 @@ qed_dcbx_get_operational_params(struct qed_hwfn *p_hwfn, DCBX_CONFIG_VERSION_CEE); p_operational->cee = val; - DP_VERBOSE(p_hwfn, QED_MSG_DCB, "Version support: ieee %d, cee %d\n", - p_operational->ieee, p_operational->cee); + val = !!(QED_MFW_GET_FIELD(flags, DCBX_CONFIG_VERSION) == + DCBX_CONFIG_VERSION_STATIC); + p_operational->local = val; + + DP_VERBOSE(p_hwfn, QED_MSG_DCB, + "Version support: ieee %d, cee %d, static %d\n", + p_operational->ieee, p_operational->cee, + p_operational->local); qed_dcbx_get_common_params(p_hwfn, &p_feat->app, p_feat->app.app_pri_tbl, &p_feat->ets, @@ -1235,6 +1241,8 @@ int qed_dcbx_get_config_params(struct qed_hwfn *p_hwfn, p_hwfn->p_dcbx_info->set.ver_num |= DCBX_CONFIG_VERSION_CEE; if (dcbx_info->operational.ieee) p_hwfn->p_dcbx_info->set.ver_num |= DCBX_CONFIG_VERSION_IEEE; + if (dcbx_info->operational.local) + p_hwfn->p_dcbx_info->set.ver_num |= DCBX_CONFIG_VERSION_STATIC; p_hwfn->p_dcbx_info->set.enabled = dcbx_info->operational.enabled; memcpy(&p_hwfn->p_dcbx_info->set.config.params, @@ -1784,8 +1792,9 @@ static u8 qed_dcbnl_setdcbx(struct qed_dev *cdev, u8 mode) DP_VERBOSE(hwfn, QED_MSG_DCB, "new mode = %x\n", mode); - if (!(mode & DCB_CAP_DCBX_VER_IEEE) && !(mode & DCB_CAP_DCBX_VER_CEE)) { - DP_INFO(hwfn, "Allowed mode is cee, ieee or both\n"); + if (!(mode & DCB_CAP_DCBX_VER_IEEE) && + !(mode & DCB_CAP_DCBX_VER_CEE) && !(mode & DCB_CAP_DCBX_STATIC)) { + DP_INFO(hwfn, "Allowed modes are cee, ieee or static\n"); return 1; } @@ -1805,6 +1814,11 @@ static u8 qed_dcbnl_setdcbx(struct qed_dev *cdev, u8 mode) dcbx_set.enabled = true; } + if (mode & DCB_CAP_DCBX_STATIC) { + dcbx_set.ver_num |= DCBX_CONFIG_VERSION_STATIC; + dcbx_set.enabled = true; + } + ptt = qed_ptt_acquire(hwfn); if (!ptt) return 1; @@ -1813,7 +1827,7 @@ static u8 qed_dcbnl_setdcbx(struct qed_dev *cdev, u8 mode) qed_ptt_release(hwfn, ptt); - return 0; + return rc; } static u8 qed_dcbnl_getfeatcfg(struct qed_dev *cdev, int featid, u8 *flags) diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index d44933a058ee..9f966be89510 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ 
-144,6 +144,7 @@ struct qed_dcbx_operational_params { bool enabled; bool ieee; bool cee; + bool local; u32 err; }; -- cgit v1.2.3 From 82dfb540aeb277d945bf646ff780493b8a520d8a Mon Sep 17 00:00:00 2001 From: Gerard Garcia Date: Fri, 21 Apr 2017 10:10:46 +0100 Subject: VSOCK: Add virtio vsock vsockmon hooks The virtio drivers deal with struct virtio_vsock_pkt. Add virtio_transport_deliver_tap_pkt(pkt) for handing packets to the vsockmon device. We call virtio_transport_deliver_tap_pkt(pkt) from net/vmw_vsock/virtio_transport.c and drivers/vhost/vsock.c instead of common code. This is because the drivers may drop packets before handing them to common code - we still want to capture them. Signed-off-by: Gerard Garcia Signed-off-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: David S. Miller --- drivers/vhost/vsock.c | 8 +++++ include/linux/virtio_vsock.h | 1 + net/vmw_vsock/virtio_transport.c | 3 ++ net/vmw_vsock/virtio_transport_common.c | 64 +++++++++++++++++++++++++++++++++ 4 files changed, 76 insertions(+) (limited to 'include/linux') diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 44eed8eb0725..d939ac1a4997 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -176,6 +176,11 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, restart_tx = true; } + /* Deliver to monitoring devices all correctly transmitted + * packets. + */ + virtio_transport_deliver_tap_pkt(pkt); + virtio_transport_free_pkt(pkt); } if (added) @@ -383,6 +388,9 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work) len = pkt->len; + /* Deliver to monitoring devices all received packets */ + virtio_transport_deliver_tap_pkt(pkt); + /* Only accept correctly addressed packets */ if (le64_to_cpu(pkt->hdr.src_cid) == vsock->guest_cid) virtio_transport_recv_pkt(pkt); diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 584f9a647ad4..ab13f0743da8 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -153,5 +153,6 @@ void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt); void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt); u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted); void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit); +void virtio_transport_deliver_tap_pkt(struct virtio_vsock_pkt *pkt); #endif /* _LINUX_VIRTIO_VSOCK_H */ diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 68675a151f22..9dffe0282ad4 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -144,6 +144,8 @@ virtio_transport_send_pkt_work(struct work_struct *work) list_del_init(&pkt->list); spin_unlock_bh(&vsock->send_pkt_list_lock); + virtio_transport_deliver_tap_pkt(pkt); + reply = pkt->reply; sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); @@ -370,6 +372,7 @@ static void virtio_transport_rx_work(struct work_struct *work) } pkt->len = len - sizeof(pkt->hdr); + virtio_transport_deliver_tap_pkt(pkt); virtio_transport_recv_pkt(pkt); } } while (!virtqueue_enable_cb(vq)); diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index af087b44ceea..18e24793659f 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -85,6 +86,69 @@ out_pkt: return NULL; } +/* Packet capture */ +static struct sk_buff *virtio_transport_build_skb(void 
*opaque) +{ + struct virtio_vsock_pkt *pkt = opaque; + unsigned char *t_hdr, *payload; + struct af_vsockmon_hdr *hdr; + struct sk_buff *skb; + + skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len, + GFP_ATOMIC); + if (!skb) + return NULL; + + hdr = (struct af_vsockmon_hdr *)skb_put(skb, sizeof(*hdr)); + + /* pkt->hdr is little-endian so no need to byteswap here */ + hdr->src_cid = pkt->hdr.src_cid; + hdr->src_port = pkt->hdr.src_port; + hdr->dst_cid = pkt->hdr.dst_cid; + hdr->dst_port = pkt->hdr.dst_port; + + hdr->transport = cpu_to_le16(AF_VSOCK_TRANSPORT_VIRTIO); + hdr->len = cpu_to_le16(sizeof(pkt->hdr)); + memset(hdr->reserved, 0, sizeof(hdr->reserved)); + + switch (le16_to_cpu(pkt->hdr.op)) { + case VIRTIO_VSOCK_OP_REQUEST: + case VIRTIO_VSOCK_OP_RESPONSE: + hdr->op = cpu_to_le16(AF_VSOCK_OP_CONNECT); + break; + case VIRTIO_VSOCK_OP_RST: + case VIRTIO_VSOCK_OP_SHUTDOWN: + hdr->op = cpu_to_le16(AF_VSOCK_OP_DISCONNECT); + break; + case VIRTIO_VSOCK_OP_RW: + hdr->op = cpu_to_le16(AF_VSOCK_OP_PAYLOAD); + break; + case VIRTIO_VSOCK_OP_CREDIT_UPDATE: + case VIRTIO_VSOCK_OP_CREDIT_REQUEST: + hdr->op = cpu_to_le16(AF_VSOCK_OP_CONTROL); + break; + default: + hdr->op = cpu_to_le16(AF_VSOCK_OP_UNKNOWN); + break; + } + + t_hdr = skb_put(skb, sizeof(pkt->hdr)); + memcpy(t_hdr, &pkt->hdr, sizeof(pkt->hdr)); + + if (pkt->len) { + payload = skb_put(skb, pkt->len); + memcpy(payload, pkt->buf, pkt->len); + } + + return skb; +} + +void virtio_transport_deliver_tap_pkt(struct virtio_vsock_pkt *pkt) +{ + vsock_deliver_tap(virtio_transport_build_skb, pkt); +} +EXPORT_SYMBOL_GPL(virtio_transport_deliver_tap_pkt); + static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, struct virtio_vsock_pkt_info *info) {
-- cgit v1.2.3

From 69226896ad636b94f6d2e55d75ff21a29c4de83b Mon Sep 17 00:00:00 2001
From: Roger Quadros
Date: Fri, 21 Apr 2017 16:15:38 +0300
Subject: mdio_bus: Issue GPIO RESET to PHYs.

Some boards [1] leave the PHYs in an invalid state during system power-up or reset, causing reliability issues with the PHY that manifest as the PHY not being detected or the link not being functional. To fix this, these PHYs need to be reset via a GPIO connected to the PHY's RESET pin.

Some boards have a single GPIO controlling the RESET pin of all PHYs on the bus, whereas others have separate GPIOs controlling individual PHY RESETs. In either case, the RESET de-assertion cannot be done in the PHY driver, as the PHY will not probe until its reset is de-asserted. So do the RESET de-assertion in the MDIO bus driver.

[1] - am572x-idk, am571x-idk, a437x-idk

Signed-off-by: Roger Quadros
Signed-off-by: David S. Miller
---
 Documentation/devicetree/bindings/net/mdio.txt | 33 ++++++++++++++++++ drivers/net/phy/mdio_bus.c | 47 ++++++++++++++++++++++++++ drivers/of/of_mdio.c | 7 ++++ include/linux/phy.h | 7 ++++ 4 files changed, 94 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/mdio.txt

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/net/mdio.txt b/Documentation/devicetree/bindings/net/mdio.txt new file mode 100644 index 000000000000..4ffbbacebda1 --- /dev/null +++ b/Documentation/devicetree/bindings/net/mdio.txt
@@ -0,0 +1,33 @@ +Common MDIO bus properties. + +These are generic properties that can apply to any MDIO bus. + +Optional properties: +- reset-gpios: List of one or more GPIOs that control the RESET lines + of the PHYs on that MDIO bus. +- reset-delay-us: RESET pulse width in microseconds as per PHY datasheet.
+ +A list of child nodes, one per device on the bus is expected. These +should follow the generic phy.txt, or a device specific binding document. + +Example : +This example shows these optional properties, plus other properties +required for the TI Davinci MDIO driver. + + davinci_mdio: ethernet@0x5c030000 { + compatible = "ti,davinci_mdio"; + reg = <0x5c030000 0x1000>; + #address-cells = <1>; + #size-cells = <0>; + + reset-gpios = <&gpio2 5 GPIO_ACTIVE_LOW>; + reset-delay-us = <2>; /* PHY datasheet states 1us min */ + + ethphy0: ethernet-phy@1 { + reg = <1>; + }; + + ethphy1: ethernet-phy@3 { + reg = <3>; + }; + };
diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 5a214f3b8671..a898e5c4ef1b 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c
@@ -22,8 +22,11 @@ #include #include #include +#include +#include #include #include +#include #include #include #include
@@ -337,6 +340,7 @@ int __mdiobus_register(struct mii_bus *bus, struct module *owner) { struct mdio_device *mdiodev; int i, err; + struct gpio_desc *gpiod; if (NULL == bus || NULL == bus->name || NULL == bus->read || NULL == bus->write)
@@ -363,6 +367,35 @@ int __mdiobus_register(struct mii_bus *bus, struct module *owner) if (bus->reset) bus->reset(bus); + /* de-assert bus level PHY GPIO resets */ + if (bus->num_reset_gpios > 0) { + bus->reset_gpiod = devm_kcalloc(&bus->dev, + bus->num_reset_gpios, + sizeof(struct gpio_desc *), + GFP_KERNEL); + if (!bus->reset_gpiod) + return -ENOMEM; + } + + for (i = 0; i < bus->num_reset_gpios; i++) { + gpiod = devm_gpiod_get_index(&bus->dev, "reset", i, + GPIOD_OUT_LOW); + if (IS_ERR(gpiod)) { + err = PTR_ERR(gpiod); + if (err != -ENOENT) { + dev_err(&bus->dev, + "mii_bus %s couldn't get reset GPIO\n", + bus->id); + return err; + } + } else { + bus->reset_gpiod[i] = gpiod; + gpiod_set_value_cansleep(gpiod, 1); + udelay(bus->reset_delay_us); + gpiod_set_value_cansleep(gpiod, 0); + } + } + for (i = 0; i < PHY_MAX_ADDR; i++) { if ((bus->phy_mask & (1 << i)) == 0) { struct phy_device *phydev;
@@ -390,6 +423,13 @@ error: mdiodev->device_remove(mdiodev); mdiodev->device_free(mdiodev); } + + /* Put PHYs in RESET to save power */ + for (i = 0; i < bus->num_reset_gpios; i++) { + if (bus->reset_gpiod[i]) + gpiod_set_value_cansleep(bus->reset_gpiod[i], 1); + } + device_del(&bus->dev); return err; }
@@ -411,6 +451,13 @@ void mdiobus_unregister(struct mii_bus *bus) mdiodev->device_remove(mdiodev); mdiodev->device_free(mdiodev); } + + /* Put PHYs in RESET to save power */ + for (i = 0; i < bus->num_reset_gpios; i++) { + if (bus->reset_gpiod[i]) + gpiod_set_value_cansleep(bus->reset_gpiod[i], 1); + } + device_del(&bus->dev); } EXPORT_SYMBOL(mdiobus_unregister);
diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index 0b2979816dbf..7e4c80f9b6cd 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c
@@ -22,6 +22,8 @@ #include #include +#define DEFAULT_GPIO_RESET_DELAY 10 /* in microseconds */ + MODULE_AUTHOR("Grant Likely "); MODULE_LICENSE("GPL");
@@ -221,6 +223,11 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np) mdio->dev.of_node = np; + /* Get bus level PHY reset GPIO details */ + mdio->reset_delay_us = DEFAULT_GPIO_RESET_DELAY; + of_property_read_u32(np, "reset-delay-us", &mdio->reset_delay_us); + mdio->num_reset_gpios = of_gpio_named_count(np, "reset-gpios"); + /* Register the MDIO bus */ rc = mdiobus_register(mdio); if (rc)
diff --git a/include/linux/phy.h b/include/linux/phy.h index 624cecf69c28..37ca77d86983 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h
@@ -217,6 +217,13 @@ struct mii_bus { * matching its address */ int irq[PHY_MAX_ADDR]; + + /* GPIO reset pulse width in microseconds */ + int reset_delay_us; + /* Number of reset GPIOs */ + int num_reset_gpios; + /* Array of RESET GPIO descriptors */ + struct gpio_desc **reset_gpiod; }; #define to_mii_bus(d) container_of(d, struct mii_bus, dev)
-- cgit v1.2.3
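A note on polarity in the sequence above: gpiod_set_value_cansleep() takes a logical value, so writing 1 always means "assert RESET"; the active-low/active-high translation comes from the GPIO_ACTIVE_* flag in the device tree. A minimal sketch of the same pulse pattern for a single reset line follows (hypothetical driver code under those assumptions, not part of the patch):

#include <linux/delay.h>
#include <linux/err.h>
#include <linux/gpio/consumer.h>

/* Hypothetical helper: pulse a PHY reset line for delay_us microseconds.
 * gpiod polarity (active-low vs. active-high) comes from the DT flags,
 * so the logical value 1 always means "reset asserted".
 */
static int example_phy_hw_reset(struct device *dev, unsigned int delay_us)
{
	struct gpio_desc *gpiod;

	/* Request the line already de-asserted (logical 0) */
	gpiod = devm_gpiod_get(dev, "reset", GPIOD_OUT_LOW);
	if (IS_ERR(gpiod))
		return PTR_ERR(gpiod);

	gpiod_set_value_cansleep(gpiod, 1);	/* assert RESET */
	udelay(delay_us);			/* minimum pulse width */
	gpiod_set_value_cansleep(gpiod, 0);	/* de-assert; PHY can probe */
	return 0;
}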
From cf1ef3f0719b4dcb74810ed507e2a2540f9811b4 Mon Sep 17 00:00:00 2001
From: Wei Wang
Date: Thu, 20 Apr 2017 14:45:46 -0700
Subject: net/tcp_fastopen: Disable active side TFO in certain scenarios

Middlebox firewall issues can potentially cause a server's data to be blackholed after a successful 3WHS using TFO. The following are the related reports from Apple:

https://www.nanog.org/sites/default/files/Paasch_Network_Support.pdf
Slide 31 identifies an issue where the client ACK to the server's data sent during a TFO'd handshake is dropped.
C ---> syn-data ---> S
C <--- syn/ack ----- S
C (accept & write)
C <---- data ------- S
C ----- ACK -> X     S  [retry and timeout]

https://www.ietf.org/proceedings/94/slides/slides-94-tcpm-13.pdf
Slide 5 shows a similar situation in which the server's data gets dropped after the 3WHS.
C ---- syn-data ---> S
C <--- syn/ack ----- S
C ---- ack --------> S
S (accept & write)
C?  X <- data ------ S  [retry and timeout]

This is the worst failure because the client cannot detect such behavior to mitigate the situation (such as by disabling TFO). Failing to proceed, the application (e.g., an SSL library) may simply time out and retry with TFO again, and the process repeats indefinitely.

The proposed solution is to disable active TFO globally under the following circumstances:

1. a client-side TFO socket detects an out-of-order FIN
2. a client-side TFO socket receives an out-of-order RST

We disable active-side TFO globally for 1hr at first. Then if it happens again, we disable it for 2h, then 4h, 8h, ... And we reset the timeout to 1hr if a client-side TFO socket not opened on loopback has successfully received data segments from the server. We examine this condition during close().

The rationale behind it is that when such a firewall issue happens, the application running on the client should eventually close the socket as it is not able to get the data it is expecting, or the application running on the server should close the socket as it is not able to receive any response from the client. In both cases, an out-of-order FIN or RST will be received on the client, given that the firewall will not block them as no data are in those frames. We want to disable active TFO globally because that helps most when the middlebox is very close to the client and most of the connections are likely to fail.

Also, add a debug sysctl, tcp_fastopen_blackhole_detect_timeout_sec: the initial timeout to use when a firewall blackhole issue happens. It can be set and read; setting it to 0 disables the active-disable logic.

Signed-off-by: Wei Wang
Acked-by: Yuchung Cheng
Acked-by: Neal Cardwell
Signed-off-by: David S. Miller
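The backoff arithmetic described above (1h, 2h, 4h, ..., capped) is implemented by tcp_fastopen_active_should_disable() in the diff below; here is a small self-contained sketch of the same computation for checking the numbers by hand (plain userspace C with HZ folded out, illustrative only):

#include <stdio.h>

/* Mirror of the patch's backoff: timeout = 2^min(times - 1, 6) * base.
 * "times" counts how often the blackhole condition was detected, and
 * "base" is tcp_fastopen_blackhole_timeout_sec (default 3600s = 1hr).
 */
static unsigned long tfo_disable_timeout(int times, unsigned long base)
{
	int shift = times - 1 < 6 ? times - 1 : 6;

	return (1UL << shift) * base;
}

int main(void)
{
	for (int times = 1; times <= 8; times++)
		printf("detection #%d -> disabled for %lus\n",
		       times, tfo_disable_timeout(times, 3600));
	/* prints 3600, 7200, 14400, ... capped at 230400 (64h) */
	return 0;
}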
---
 Documentation/networking/ip-sysctl.txt | 8 +++ include/linux/tcp.h | 1 + include/net/tcp.h | 6 ++ net/ipv4/sysctl_net_ipv4.c | 21 +++++++ net/ipv4/tcp.c | 1 + net/ipv4/tcp_fastopen.c | 101 +++++++++++++++++++++++++++++++++ net/ipv4/tcp_input.c | 23 ++++++-- net/ipv4/tcp_ipv4.c | 3 + 8 files changed, 160 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index b1c6500e7a8d..974ab47ae53a 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt
@@ -602,6 +602,14 @@ tcp_fastopen - INTEGER Note that that additional client or server features are only effective if the basic support (0x1 and 0x2) are enabled respectively. +tcp_fastopen_blackhole_timeout_sec - INTEGER + Initial time period in second to disable Fastopen on active TCP sockets + when a TFO firewall blackhole issue happens. + This time period will grow exponentially when more blackhole issues + get detected right after Fastopen is re-enabled and will reset to + initial value when the blackhole issue goes away. + By default, it is set to 1hr. + tcp_syn_retries - INTEGER Number of times initial SYNs for an active TCP connection attempt will be retransmitted. Should not be higher than 127. Default value
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index cfc2d9506ce8..cbe5b602a2d3 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h
@@ -233,6 +233,7 @@ struct tcp_sock { u8 syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ + syn_fastopen_ch:1, /* Active TFO re-enabling probe */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ save_syn:1, /* Save headers of SYN packet */ is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
diff --git a/include/net/tcp.h b/include/net/tcp.h index cc6ae0a95201..c1abc2abbdcb 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h
@@ -1506,6 +1506,12 @@ struct tcp_fastopen_context { struct rcu_head rcu; }; +extern unsigned int sysctl_tcp_fastopen_blackhole_timeout; +void tcp_fastopen_active_disable(void); +bool tcp_fastopen_active_should_disable(struct sock *sk); +void tcp_fastopen_active_disable_ofo_check(struct sock *sk); +void tcp_fastopen_active_timeout_reset(void); + /* Latencies incurred by various limits for a sender. They are * chronograph-like stats that are mutually exclusive.
*/ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ddac9e64b702..86957e9cd6c6 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -350,6 +350,19 @@ static int proc_udp_early_demux(struct ctl_table *table, int write, return ret; } +static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, + int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + tcp_fastopen_active_timeout_reset(); + return ret; +} + static struct ctl_table ipv4_table[] = { { .procname = "tcp_timestamps", @@ -399,6 +412,14 @@ static struct ctl_table ipv4_table[] = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), .proc_handler = proc_tcp_fastopen_key, }, + { + .procname = "tcp_fastopen_blackhole_timeout_sec", + .data = &sysctl_tcp_fastopen_blackhole_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_tfo_blackhole_detect_timeout, + .extra1 = &zero, + }, { .procname = "tcp_abort_on_overflow", .data = &sysctl_tcp_abort_on_overflow, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 04843ae77b9e..efc976ae66ae 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2296,6 +2296,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); tcp_write_queue_purge(sk); + tcp_fastopen_active_disable_ofo_check(sk); skb_rbtree_purge(&tp->out_of_order_queue); inet->inet_dport = 0; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 8ea4e9787f82..ff2d30ffc6f3 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -341,6 +341,13 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, cookie->len = -1; return false; } + + /* Firewall blackhole issue check */ + if (tcp_fastopen_active_should_disable(sk)) { + cookie->len = -1; + return false; + } + if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { cookie->len = -1; return true; @@ -380,3 +387,97 @@ bool tcp_fastopen_defer_connect(struct sock *sk, int *err) return false; } EXPORT_SYMBOL(tcp_fastopen_defer_connect); + +/* + * The following code block is to deal with middle box issues with TFO: + * Middlebox firewall issues can potentially cause server's data being + * blackholed after a successful 3WHS using TFO. + * The proposed solution is to disable active TFO globally under the + * following circumstances: + * 1. client side TFO socket receives out of order FIN + * 2. client side TFO socket receives out of order RST + * We disable active side TFO globally for 1hr at first. Then if it + * happens again, we disable it for 2h, then 4h, 8h, ... + * And we reset the timeout back to 1hr when we see a successful active + * TFO connection with data exchanges. 
+ */ + +/* Default to 1hr */ +unsigned int sysctl_tcp_fastopen_blackhole_timeout __read_mostly = 60 * 60; +static atomic_t tfo_active_disable_times __read_mostly = ATOMIC_INIT(0); +static unsigned long tfo_active_disable_stamp __read_mostly; + +/* Disable active TFO and record current jiffies and + * tfo_active_disable_times + */ +void tcp_fastopen_active_disable(void) +{ + atomic_inc(&tfo_active_disable_times); + tfo_active_disable_stamp = jiffies; +} + +/* Reset tfo_active_disable_times to 0 */ +void tcp_fastopen_active_timeout_reset(void) +{ + atomic_set(&tfo_active_disable_times, 0); +} + +/* Calculate timeout for tfo active disable + * Return true if we are still in the active TFO disable period + * Return false if timeout already expired and we should use active TFO + */ +bool tcp_fastopen_active_should_disable(struct sock *sk) +{ + int tfo_da_times = atomic_read(&tfo_active_disable_times); + int multiplier; + unsigned long timeout; + + if (!tfo_da_times) + return false; + + /* Limit timout to max: 2^6 * initial timeout */ + multiplier = 1 << min(tfo_da_times - 1, 6); + timeout = multiplier * sysctl_tcp_fastopen_blackhole_timeout * HZ; + if (time_before(jiffies, tfo_active_disable_stamp + timeout)) + return true; + + /* Mark check bit so we can check for successful active TFO + * condition and reset tfo_active_disable_times + */ + tcp_sk(sk)->syn_fastopen_ch = 1; + return false; +} + +/* Disable active TFO if FIN is the only packet in the ofo queue + * and no data is received. + * Also check if we can reset tfo_active_disable_times if data is + * received successfully on a marked active TFO sockets opened on + * a non-loopback interface + */ +void tcp_fastopen_active_disable_ofo_check(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct rb_node *p; + struct sk_buff *skb; + struct dst_entry *dst; + + if (!tp->syn_fastopen) + return; + + if (!tp->data_segs_in) { + p = rb_first(&tp->out_of_order_queue); + if (p && !rb_next(p)) { + skb = rb_entry(p, struct sk_buff, rbnode); + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { + tcp_fastopen_active_disable(); + return; + } + } + } else if (tp->syn_fastopen_ch && + atomic_read(&tfo_active_disable_times)) { + dst = sk_dst_get(sk); + if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))) + tcp_fastopen_active_timeout_reset(); + dst_release(dst); + } +} diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 341f021f02a2..9f342a67dc74 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5300,8 +5300,16 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (rst_seq_match) tcp_reset(sk); - else + else { + /* Disable TFO if RST is out-of-order + * and no data has been received + * for current active TFO socket + */ + if (tp->syn_fastopen && !tp->data_segs_in && + sk->sk_state == TCP_ESTABLISHED) + tcp_fastopen_active_disable(); tcp_send_challenge_ack(sk, skb); + } goto discard; } @@ -6044,9 +6052,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) break; } - if (tp->linger2 < 0 || - (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && - after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { + if (tp->linger2 < 0) { + tcp_done(sk); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); + return 1; + } + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + /* Receive out of order FIN after close() */ + if (tp->syn_fastopen && th->fin) + tcp_fastopen_active_disable(); tcp_done(sk); 
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); return 1;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 20cbd2f07f28..cbbafe546c0f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c
@@ -1855,6 +1855,9 @@ void tcp_v4_destroy_sock(struct sock *sk) /* Cleanup up the write buffer. */ tcp_write_queue_purge(sk); + /* Check if we want to disable active TFO */ + tcp_fastopen_active_disable_ofo_check(sk); + /* Cleans up our, hopefully empty, out_of_order_queue. */ skb_rbtree_purge(&tp->out_of_order_queue);
-- cgit v1.2.3

From 029c1ecbb2429cf08c7bd2de81e929f81feea914 Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise
Date: Sat, 22 Apr 2017 16:52:46 -0400
Subject: flow_dissector: add mpls support (v2)

Add support for parsing MPLS flows to the flow dissector in preparation for adding MPLS match support to cls_flower.

Signed-off-by: Benjamin LaHaise
Signed-off-by: Benjamin LaHaise
Reviewed-by: Jakub Kicinski
Cc: "David S. Miller"
Cc: Simon Horman
Cc: Jamal Hadi Salim
Cc: Cong Wang
Cc: Jiri Pirko
Cc: Eric Dumazet
Cc: Hadar Hen Zion
Cc: Gao Feng
Acked-by: Jiri Pirko
Signed-off-by: David S. Miller
---
 include/linux/mpls.h | 5 +++++ include/net/flow_dissector.h | 8 ++++++++ net/core/flow_dissector.c | 25 ++++++++++++++++++++++--- 3 files changed, 35 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mpls.h b/include/linux/mpls.h index 9999145bc190..384fb22b6c43 100644 --- a/include/linux/mpls.h +++ b/include/linux/mpls.h
@@ -3,4 +3,9 @@ #include +#define MPLS_TTL_MASK (MPLS_LS_TTL_MASK >> MPLS_LS_TTL_SHIFT) +#define MPLS_BOS_MASK (MPLS_LS_S_MASK >> MPLS_LS_S_SHIFT) +#define MPLS_TC_MASK (MPLS_LS_TC_MASK >> MPLS_LS_TC_SHIFT) +#define MPLS_LABEL_MASK (MPLS_LS_LABEL_MASK >> MPLS_LS_LABEL_SHIFT) + #endif /* _LINUX_MPLS_H */
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index ac9703018a3a..8d21d448daa9 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h
@@ -41,6 +41,13 @@ struct flow_dissector_key_vlan { u16 padding; }; +struct flow_dissector_key_mpls { + u32 mpls_ttl:8, + mpls_bos:1, + mpls_tc:3, + mpls_label:20; +}; + struct flow_dissector_key_keyid { __be32 keyid; };
@@ -169,6 +176,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_ENC_PORTS, /* struct flow_dissector_key_ports */ + FLOW_DISSECTOR_KEY_MPLS, /* struct flow_dissector_key_mpls */ FLOW_DISSECTOR_KEY_MAX, };
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index c9cf425303f8..28d94bce4df8 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c
@@ -126,9 +126,11 @@ __skb_flow_dissect_mpls(const struct sk_buff *skb, { struct flow_dissector_key_keyid *key_keyid; struct mpls_label *hdr, _hdr[2]; + u32 entry, label; if (!dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) + FLOW_DISSECTOR_KEY_MPLS_ENTROPY) && + !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) return FLOW_DISSECT_RET_OUT_GOOD; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data,
@@ -136,8 +138,25 @@ __skb_flow_dissect_mpls(const struct sk_buff *skb, if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; - if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) >> - MPLS_LS_LABEL_SHIFT == MPLS_LABEL_ENTROPY) { + entry = ntohl(hdr[0].entry); + label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) { + struct flow_dissector_key_mpls *key_mpls; + + key_mpls = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS, + target_container); + key_mpls->mpls_label = label; + key_mpls->mpls_ttl = (entry & MPLS_LS_TTL_MASK) + >> MPLS_LS_TTL_SHIFT; + key_mpls->mpls_tc = (entry & MPLS_LS_TC_MASK) + >> MPLS_LS_TC_SHIFT; + key_mpls->mpls_bos = (entry & MPLS_LS_S_MASK) + >> MPLS_LS_S_SHIFT; + } + + if (label == MPLS_LABEL_ENTROPY) { key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY, target_container);
-- cgit v1.2.3
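For reference, an MPLS label-stack entry is one packed 32-bit big-endian word, and the masks and shifts used above extract contiguous bit ranges from it. A small standalone sketch of the same extraction on a sample entry (mask/shift values copied from the UAPI mpls.h; everything else is illustrative):

#include <stdio.h>
#include <stdint.h>

/* Constants as in include/uapi/linux/mpls.h */
#define MPLS_LS_LABEL_MASK  0xFFFFF000u
#define MPLS_LS_LABEL_SHIFT 12
#define MPLS_LS_TC_MASK     0x00000E00u
#define MPLS_LS_TC_SHIFT    9
#define MPLS_LS_S_MASK      0x00000100u
#define MPLS_LS_S_SHIFT     8
#define MPLS_LS_TTL_MASK    0x000000FFu
#define MPLS_LS_TTL_SHIFT   0

int main(void)
{
	/* Sample entry, already converted to host order (the dissector does
	 * ntohl()): label 16, TC 5, bottom-of-stack set, TTL 64.
	 */
	uint32_t entry = (16u << MPLS_LS_LABEL_SHIFT) |
			 (5u << MPLS_LS_TC_SHIFT) |
			 (1u << MPLS_LS_S_SHIFT) | 64u;

	printf("label=%u tc=%u bos=%u ttl=%u\n",
	       (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT,
	       (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT,
	       (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT,
	       (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT);
	/* -> label=16 tc=5 bos=1 ttl=64 */
	return 0;
}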
From 490cb6ddb17df5ef5f5eb33c9a34f3033b31c204 Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi
Date: Wed, 19 Apr 2017 17:48:55 +0100
Subject: PCI: Implement devm_pci_remap_cfgspace()

The introduction of the pci_remap_cfgspace() interface allows PCI host controller drivers to map PCI config space through a dedicated kernel interface. Current PCI host controller drivers use the devm_ioremap_*() devres interfaces to map PCI configuration space regions; in order to move them over to the new pci_remap_cfgspace() mapping interface, a matching set of devres interfaces has to be implemented.

Introduce two new functions in the PCI kernel layer and in the devres documentation:

- devm_pci_remap_cfgspace()
- devm_pci_remap_cfg_resource()

so that PCI host controller drivers can use them to map PCI configuration space regions.

Signed-off-by: Lorenzo Pieralisi
Signed-off-by: Bjorn Helgaas
Cc: Jonathan Corbet
---
 Documentation/driver-model/devres.txt | 6 ++- drivers/pci/pci.c | 82 +++++++++++++++++++++++++++++++++++ include/linux/pci.h | 5 +++ 3 files changed, 91 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt index bf34d5b3a733..e72587fe477d 100644 --- a/Documentation/driver-model/devres.txt +++ b/Documentation/driver-model/devres.txt
@@ -342,8 +342,10 @@ PER-CPU MEM devm_free_percpu() PCI - pcim_enable_device() : after success, all PCI ops become managed - pcim_pin_device() : keep PCI device enabled after release + devm_pci_remap_cfgspace() : ioremap PCI configuration space + devm_pci_remap_cfg_resource() : ioremap PCI configuration space resource + pcim_enable_device() : after success, all PCI ops become managed + pcim_pin_device() : keep PCI device enabled after release PHY devm_usb_get_phy()
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index bd98674a0419..4129f9453861 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c
@@ -3401,6 +3401,88 @@ void pci_unmap_iospace(struct resource *res) #endif } +/** + * devm_pci_remap_cfgspace - Managed pci_remap_cfgspace() + * @dev: Generic device to remap IO address for + * @offset: Resource address to map + * @size: Size of map + * + * Managed pci_remap_cfgspace(). Map is automatically unmapped on driver + * detach.
+ */ +void __iomem *devm_pci_remap_cfgspace(struct device *dev, + resource_size_t offset, + resource_size_t size) +{ + void __iomem **ptr, *addr; + + ptr = devres_alloc(devm_ioremap_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return NULL; + + addr = pci_remap_cfgspace(offset, size); + if (addr) { + *ptr = addr; + devres_add(dev, ptr); + } else + devres_free(ptr); + + return addr; +} +EXPORT_SYMBOL(devm_pci_remap_cfgspace); + +/** + * devm_pci_remap_cfg_resource - check, request region and ioremap cfg resource + * @dev: generic device to handle the resource for + * @res: configuration space resource to be handled + * + * Checks that a resource is a valid memory region, requests the memory + * region and ioremaps with pci_remap_cfgspace() API that ensures the + * proper PCI configuration space memory attributes are guaranteed. + * + * All operations are managed and will be undone on driver detach. + * + * Returns a pointer to the remapped memory or an ERR_PTR() encoded error code + * on failure. Usage example: + * + * res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + * base = devm_pci_remap_cfg_resource(&pdev->dev, res); + * if (IS_ERR(base)) + * return PTR_ERR(base); + */ +void __iomem *devm_pci_remap_cfg_resource(struct device *dev, + struct resource *res) +{ + resource_size_t size; + const char *name; + void __iomem *dest_ptr; + + BUG_ON(!dev); + + if (!res || resource_type(res) != IORESOURCE_MEM) { + dev_err(dev, "invalid resource\n"); + return IOMEM_ERR_PTR(-EINVAL); + } + + size = resource_size(res); + name = res->name ?: dev_name(dev); + + if (!devm_request_mem_region(dev, res->start, size, name)) { + dev_err(dev, "can't request region for resource %pR\n", res); + return IOMEM_ERR_PTR(-EBUSY); + } + + dest_ptr = devm_pci_remap_cfgspace(dev, res->start, size); + if (!dest_ptr) { + dev_err(dev, "ioremap failed for resource %pR\n", res); + devm_release_mem_region(dev, res->start, size); + dest_ptr = IOMEM_ERR_PTR(-ENOMEM); + } + + return dest_ptr; +} +EXPORT_SYMBOL(devm_pci_remap_cfg_resource); + static void __pci_set_master(struct pci_dev *dev, bool enable) { u16 old_cmd, cmd; diff --git a/include/linux/pci.h b/include/linux/pci.h index eb3da1a04e6c..70534d66d18a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1199,6 +1199,11 @@ unsigned long pci_address_to_pio(phys_addr_t addr); phys_addr_t pci_pio_to_address(unsigned long pio); int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr); void pci_unmap_iospace(struct resource *res); +void __iomem *devm_pci_remap_cfgspace(struct device *dev, + resource_size_t offset, + resource_size_t size); +void __iomem *devm_pci_remap_cfg_resource(struct device *dev, + struct resource *res); static inline pci_bus_addr_t pci_bus_address(struct pci_dev *pdev, int bar) { -- cgit v1.2.3 From 6335d68349a85382cc55a5260d5bfda85f8e24a8 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 26 Mar 2017 20:41:32 +0200 Subject: mmc: core: add mmc_get_dma_dir Add function for determining DMA direction to core. 
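In practice this helper replaces the open-coded direction test around dma_map_sg()/dma_unmap_sg() in host drivers. A hedged sketch of the intended call pattern — the surrounding function is hypothetical; only mmc_get_dma_dir(), mmc_dev() and the mmc_data fields come from the MMC core:

/* Illustrative host-driver snippet: map the request's scatterlist for
 * DMA, letting the core derive the direction from data->flags.
 */
static int example_host_pre_dma(struct mmc_host *mmc, struct mmc_data *data)
{
	int sg_count = dma_map_sg(mmc_dev(mmc), data->sg, data->sg_len,
				  mmc_get_dma_dir(data));

	if (!sg_count)
		return -EINVAL;	/* nothing mapped, bail out */
	return sg_count;	/* number of mapped segments */
}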
Signed-off-by: Heiner Kallweit Reviewed-by: Shawn Lin Signed-off-by: Ulf Hansson --- include/linux/mmc/host.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 83f1c4a9f03b..21385ac0c9b1 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -17,6 +17,7 @@ #include #include #include +#include struct mmc_ios { unsigned int clock; /* clock rate */ @@ -499,6 +500,11 @@ static inline bool mmc_can_retune(struct mmc_host *host) return host->can_retune == 1; } +static inline enum dma_data_direction mmc_get_dma_dir(struct mmc_data *data) +{ + return data->flags & MMC_DATA_WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE; +} + int mmc_send_tuning(struct mmc_host *host, u32 opcode, int *cmd_error); int mmc_abort_tuning(struct mmc_host *host, u32 opcode); -- cgit v1.2.3 From 7b410d074b253a44624497a18e73f666a9574f37 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 13 Mar 2017 14:36:36 +0200 Subject: mmc: queue: Share mmc request array between partitions eMMC can have multiple internal partitions that are represented as separate disks / queues. However switching between partitions is only done when the queue is empty. Consequently the array of mmc requests that are queued can be shared between partitions saving memory. Keep a pointer to the mmc request queue on the card, and use that instead of allocating a new one for each partition. Signed-off-by: Adrian Hunter Reviewed-by: Linus Walleij Signed-off-by: Ulf Hansson --- drivers/mmc/core/block.c | 11 ++- drivers/mmc/core/queue.c | 234 ++++++++++++++++++++++++++++------------------- drivers/mmc/core/queue.h | 2 + include/linux/mmc/card.h | 5 + 4 files changed, 156 insertions(+), 96 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 16c313a62129..018488e7f194 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -2123,6 +2123,7 @@ static int mmc_blk_probe(struct mmc_card *card) { struct mmc_blk_data *md, *part_md; char cap_str[10]; + int ret; /* * Check that the card supports the command class(es) we need. 
@@ -2132,9 +2133,15 @@ static int mmc_blk_probe(struct mmc_card *card) mmc_fixup_device(card, mmc_blk_fixups); + ret = mmc_queue_alloc_shared_queue(card); + if (ret) + return ret; + md = mmc_blk_alloc(card); - if (IS_ERR(md)) + if (IS_ERR(md)) { + mmc_queue_free_shared_queue(card); return PTR_ERR(md); + } string_get_size((u64)get_capacity(md->disk), 512, STRING_UNITS_2, cap_str, sizeof(cap_str)); @@ -2172,6 +2179,7 @@ static int mmc_blk_probe(struct mmc_card *card) out: mmc_blk_remove_parts(card, md); mmc_blk_remove_req(md); + mmc_queue_free_shared_queue(card); return 0; } @@ -2189,6 +2197,7 @@ static void mmc_blk_remove(struct mmc_card *card) pm_runtime_put_noidle(&card->dev); mmc_blk_remove_req(md); dev_set_drvdata(&card->dev, NULL); + mmc_queue_free_shared_queue(card); } static int _mmc_blk_suspend(struct mmc_card *card) diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 4a2045527b62..3423b7acf744 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -149,17 +149,13 @@ static void mmc_request_fn(struct request_queue *q) wake_up_process(mq->thread); } -static struct scatterlist *mmc_alloc_sg(int sg_len, int *err) +static struct scatterlist *mmc_alloc_sg(int sg_len) { struct scatterlist *sg; sg = kmalloc_array(sg_len, sizeof(*sg), GFP_KERNEL); - if (!sg) - *err = -ENOMEM; - else { - *err = 0; + if (sg) sg_init_table(sg, sg_len); - } return sg; } @@ -185,6 +181,32 @@ static void mmc_queue_setup_discard(struct request_queue *q, queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, q); } +static void mmc_queue_req_free_bufs(struct mmc_queue_req *mqrq) +{ + kfree(mqrq->bounce_sg); + mqrq->bounce_sg = NULL; + + kfree(mqrq->sg); + mqrq->sg = NULL; + + kfree(mqrq->bounce_buf); + mqrq->bounce_buf = NULL; +} + +static void mmc_queue_reqs_free_bufs(struct mmc_queue_req *mqrq, int qdepth) +{ + int i; + + for (i = 0; i < qdepth; i++) + mmc_queue_req_free_bufs(&mqrq[i]); +} + +static void mmc_queue_free_mqrqs(struct mmc_queue_req *mqrq, int qdepth) +{ + mmc_queue_reqs_free_bufs(mqrq, qdepth); + kfree(mqrq); +} + static struct mmc_queue_req *mmc_queue_alloc_mqrqs(int qdepth) { struct mmc_queue_req *mqrq; @@ -200,79 +222,137 @@ static struct mmc_queue_req *mmc_queue_alloc_mqrqs(int qdepth) } #ifdef CONFIG_MMC_BLOCK_BOUNCE -static bool mmc_queue_alloc_bounce_bufs(struct mmc_queue *mq, - unsigned int bouncesz) +static int mmc_queue_alloc_bounce_bufs(struct mmc_queue_req *mqrq, int qdepth, + unsigned int bouncesz) { int i; - for (i = 0; i < mq->qdepth; i++) { - mq->mqrq[i].bounce_buf = kmalloc(bouncesz, GFP_KERNEL); - if (!mq->mqrq[i].bounce_buf) - goto out_err; - } + for (i = 0; i < qdepth; i++) { + mqrq[i].bounce_buf = kmalloc(bouncesz, GFP_KERNEL); + if (!mqrq[i].bounce_buf) + return -ENOMEM; - return true; + mqrq[i].sg = mmc_alloc_sg(1); + if (!mqrq[i].sg) + return -ENOMEM; -out_err: - while (--i >= 0) { - kfree(mq->mqrq[i].bounce_buf); - mq->mqrq[i].bounce_buf = NULL; + mqrq[i].bounce_sg = mmc_alloc_sg(bouncesz / 512); + if (!mqrq[i].bounce_sg) + return -ENOMEM; } - pr_warn("%s: unable to allocate bounce buffers\n", - mmc_card_name(mq->card)); - return false; + + return 0; } -static int mmc_queue_alloc_bounce_sgs(struct mmc_queue *mq, - unsigned int bouncesz) +static bool mmc_queue_alloc_bounce(struct mmc_queue_req *mqrq, int qdepth, + unsigned int bouncesz) { - int i, ret; + int ret; - for (i = 0; i < mq->qdepth; i++) { - mq->mqrq[i].sg = mmc_alloc_sg(1, &ret); - if (ret) - return ret; + ret = mmc_queue_alloc_bounce_bufs(mqrq, qdepth, bouncesz); + if (ret) + 
mmc_queue_reqs_free_bufs(mqrq, qdepth); - mq->mqrq[i].bounce_sg = mmc_alloc_sg(bouncesz / 512, &ret); - if (ret) - return ret; - } + return !ret; +} + +static unsigned int mmc_queue_calc_bouncesz(struct mmc_host *host) +{ + unsigned int bouncesz = MMC_QUEUE_BOUNCESZ; + + if (host->max_segs != 1) + return 0; + + if (bouncesz > host->max_req_size) + bouncesz = host->max_req_size; + if (bouncesz > host->max_seg_size) + bouncesz = host->max_seg_size; + if (bouncesz > host->max_blk_count * 512) + bouncesz = host->max_blk_count * 512; + + if (bouncesz <= 512) + return 0; + + return bouncesz; +} +#else +static inline bool mmc_queue_alloc_bounce(struct mmc_queue_req *mqrq, + int qdepth, unsigned int bouncesz) +{ + return false; +} +static unsigned int mmc_queue_calc_bouncesz(struct mmc_host *host) +{ return 0; } #endif -static int mmc_queue_alloc_sgs(struct mmc_queue *mq, int max_segs) +static int mmc_queue_alloc_sgs(struct mmc_queue_req *mqrq, int qdepth, + int max_segs) { - int i, ret; + int i; - for (i = 0; i < mq->qdepth; i++) { - mq->mqrq[i].sg = mmc_alloc_sg(max_segs, &ret); - if (ret) - return ret; + for (i = 0; i < qdepth; i++) { + mqrq[i].sg = mmc_alloc_sg(max_segs); + if (!mqrq[i].sg) + return -ENOMEM; } return 0; } -static void mmc_queue_req_free_bufs(struct mmc_queue_req *mqrq) +void mmc_queue_free_shared_queue(struct mmc_card *card) { - kfree(mqrq->bounce_sg); - mqrq->bounce_sg = NULL; + if (card->mqrq) { + mmc_queue_free_mqrqs(card->mqrq, card->qdepth); + card->mqrq = NULL; + } +} - kfree(mqrq->sg); - mqrq->sg = NULL; +static int __mmc_queue_alloc_shared_queue(struct mmc_card *card, int qdepth) +{ + struct mmc_host *host = card->host; + struct mmc_queue_req *mqrq; + unsigned int bouncesz; + int ret = 0; - kfree(mqrq->bounce_buf); - mqrq->bounce_buf = NULL; + if (card->mqrq) + return -EINVAL; + + mqrq = mmc_queue_alloc_mqrqs(qdepth); + if (!mqrq) + return -ENOMEM; + + card->mqrq = mqrq; + card->qdepth = qdepth; + + bouncesz = mmc_queue_calc_bouncesz(host); + + if (bouncesz && !mmc_queue_alloc_bounce(mqrq, qdepth, bouncesz)) { + bouncesz = 0; + pr_warn("%s: unable to allocate bounce buffers\n", + mmc_card_name(card)); + } + + card->bouncesz = bouncesz; + + if (!bouncesz) { + ret = mmc_queue_alloc_sgs(mqrq, qdepth, host->max_segs); + if (ret) + goto out_err; + } + + return ret; + +out_err: + mmc_queue_free_shared_queue(card); + return ret; } -static void mmc_queue_reqs_free_bufs(struct mmc_queue *mq) +int mmc_queue_alloc_shared_queue(struct mmc_card *card) { - int i; - - for (i = 0; i < mq->qdepth; i++) - mmc_queue_req_free_bufs(&mq->mqrq[i]); + return __mmc_queue_alloc_shared_queue(card, 2); } /** @@ -289,7 +369,6 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, { struct mmc_host *host = card->host; u64 limit = BLK_BOUNCE_HIGH; - bool bounce = false; int ret = -ENOMEM; if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask) @@ -300,10 +379,8 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, if (!mq->queue) return -ENOMEM; - mq->qdepth = 2; - mq->mqrq = mmc_queue_alloc_mqrqs(mq->qdepth); - if (!mq->mqrq) - goto blk_cleanup; + mq->mqrq = card->mqrq; + mq->qdepth = card->qdepth; mq->queue->queuedata = mq; blk_queue_prep_rq(mq->queue, mmc_prep_request); @@ -312,44 +389,17 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, if (mmc_can_erase(card)) mmc_queue_setup_discard(mq->queue, card); -#ifdef CONFIG_MMC_BLOCK_BOUNCE - if (host->max_segs == 1) { - unsigned int bouncesz; - - bouncesz = MMC_QUEUE_BOUNCESZ; - - if (bouncesz > 
host->max_req_size) - bouncesz = host->max_req_size; - if (bouncesz > host->max_seg_size) - bouncesz = host->max_seg_size; - if (bouncesz > (host->max_blk_count * 512)) - bouncesz = host->max_blk_count * 512; - - if (bouncesz > 512 && - mmc_queue_alloc_bounce_bufs(mq, bouncesz)) { - blk_queue_bounce_limit(mq->queue, BLK_BOUNCE_ANY); - blk_queue_max_hw_sectors(mq->queue, bouncesz / 512); - blk_queue_max_segments(mq->queue, bouncesz / 512); - blk_queue_max_segment_size(mq->queue, bouncesz); - - ret = mmc_queue_alloc_bounce_sgs(mq, bouncesz); - if (ret) - goto cleanup_queue; - bounce = true; - } - } -#endif - - if (!bounce) { + if (card->bouncesz) { + blk_queue_bounce_limit(mq->queue, BLK_BOUNCE_ANY); + blk_queue_max_hw_sectors(mq->queue, card->bouncesz / 512); + blk_queue_max_segments(mq->queue, card->bouncesz / 512); + blk_queue_max_segment_size(mq->queue, card->bouncesz); + } else { blk_queue_bounce_limit(mq->queue, limit); blk_queue_max_hw_sectors(mq->queue, min(host->max_blk_count, host->max_req_size / 512)); blk_queue_max_segments(mq->queue, host->max_segs); blk_queue_max_segment_size(mq->queue, host->max_seg_size); - - ret = mmc_queue_alloc_sgs(mq, host->max_segs); - if (ret) - goto cleanup_queue; } sema_init(&mq->thread_sem, 1); @@ -364,11 +414,8 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, return 0; - cleanup_queue: - mmc_queue_reqs_free_bufs(mq); - kfree(mq->mqrq); +cleanup_queue: mq->mqrq = NULL; -blk_cleanup: blk_cleanup_queue(mq->queue); return ret; } @@ -390,10 +437,7 @@ void mmc_cleanup_queue(struct mmc_queue *mq) blk_start_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); - mmc_queue_reqs_free_bufs(mq); - kfree(mq->mqrq); mq->mqrq = NULL; - mq->card = NULL; } EXPORT_SYMBOL(mmc_cleanup_queue); diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h index 967808df45b8..871796c3f406 100644 --- a/drivers/mmc/core/queue.h +++ b/drivers/mmc/core/queue.h @@ -51,6 +51,8 @@ struct mmc_queue { unsigned long qslots; }; +extern int mmc_queue_alloc_shared_queue(struct mmc_card *card); +extern void mmc_queue_free_shared_queue(struct mmc_card *card); extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *, const char *); extern void mmc_cleanup_queue(struct mmc_queue *); diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 77e61e0a216a..119ef8f0155c 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -208,6 +208,7 @@ struct sdio_cis { struct mmc_host; struct sdio_func; struct sdio_func_tuple; +struct mmc_queue_req; #define SDIO_MAX_FUNCS 7 @@ -300,6 +301,10 @@ struct mmc_card { struct dentry *debugfs_root; struct mmc_part part[MMC_NUM_PHY_PARTITION]; /* physical partitions */ unsigned int nr_parts; + + struct mmc_queue_req *mqrq; /* Shared queue structure */ + unsigned int bouncesz; /* Bounce buffer size */ + int qdepth; /* Shared queue depth */ }; static inline bool mmc_large_sector(struct mmc_card *card) -- cgit v1.2.3 From b658af718465cd1e8011c8da281befdfc2debefd Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 13 Mar 2017 14:36:37 +0200 Subject: mmc: mmc: Add functions to enable / disable the Command Queue Add helper functions to enable or disable the Command Queue. 
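The intended usage pattern (visible in the mmc_test patch further down) is: claim the host, switch the queue off, do the non-CQ work, and switch it back on. A minimal sketch under those assumptions — the wrapper function itself is hypothetical:

/* Illustrative: temporarily drop out of command-queue mode, assuming
 * the caller tracks whether CQ was enabled to begin with.
 */
static int example_do_legacy_io(struct mmc_card *card)
{
	bool reenable = card->ext_csd.cmdq_en;
	int err = 0;

	mmc_claim_host(card->host);
	if (reenable)
		err = mmc_cmdq_disable(card);
	mmc_release_host(card->host);
	if (err)
		return err;

	/* ... issue normal read/write commands here ... */

	if (reenable) {
		mmc_claim_host(card->host);
		err = mmc_cmdq_enable(card);
		mmc_release_host(card->host);
	}
	return err;
}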
Signed-off-by: Adrian Hunter Reviewed-by: Linus Walleij Signed-off-by: Ulf Hansson --- Documentation/mmc/mmc-dev-attrs.txt | 1 + drivers/mmc/core/mmc.c | 2 ++ drivers/mmc/core/mmc_ops.c | 28 ++++++++++++++++++++++++++++ drivers/mmc/core/mmc_ops.h | 2 ++ include/linux/mmc/card.h | 1 + 5 files changed, 34 insertions(+) (limited to 'include/linux') diff --git a/Documentation/mmc/mmc-dev-attrs.txt b/Documentation/mmc/mmc-dev-attrs.txt index 404a0e9e92b0..dcd1252877fb 100644 --- a/Documentation/mmc/mmc-dev-attrs.txt +++ b/Documentation/mmc/mmc-dev-attrs.txt @@ -30,6 +30,7 @@ All attributes are read-only. rel_sectors Reliable write sector count ocr Operation Conditions Register dsr Driver Stage Register + cmdq_en Command Queue enabled: 1 => enabled, 0 => not enabled Note on Erase Size and Preferred Erase Size: diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index b502601df228..35b2d0f767ca 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -790,6 +790,7 @@ MMC_DEV_ATTR(enhanced_area_size, "%u\n", card->ext_csd.enhanced_area_size); MMC_DEV_ATTR(raw_rpmb_size_mult, "%#x\n", card->ext_csd.raw_rpmb_size_mult); MMC_DEV_ATTR(rel_sectors, "%#x\n", card->ext_csd.rel_sectors); MMC_DEV_ATTR(ocr, "%08x\n", card->ocr); +MMC_DEV_ATTR(cmdq_en, "%d\n", card->ext_csd.cmdq_en); static ssize_t mmc_fwrev_show(struct device *dev, struct device_attribute *attr, @@ -845,6 +846,7 @@ static struct attribute *mmc_std_attrs[] = { &dev_attr_rel_sectors.attr, &dev_attr_ocr.attr, &dev_attr_dsr.attr, + &dev_attr_cmdq_en.attr, NULL, }; ATTRIBUTE_GROUPS(mmc_std); diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c index fe80f26d6971..24c58d24c19a 100644 --- a/drivers/mmc/core/mmc_ops.c +++ b/drivers/mmc/core/mmc_ops.c @@ -838,3 +838,31 @@ int mmc_can_ext_csd(struct mmc_card *card) { return (card && card->csd.mmca_vsn > CSD_SPEC_VER_3); } + +static int mmc_cmdq_switch(struct mmc_card *card, bool enable) +{ + u8 val = enable ? 
EXT_CSD_CMDQ_MODE_ENABLED : 0; + int err; + + if (!card->ext_csd.cmdq_support) + return -EOPNOTSUPP; + + err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL, EXT_CSD_CMDQ_MODE_EN, + val, card->ext_csd.generic_cmd6_time); + if (!err) + card->ext_csd.cmdq_en = enable; + + return err; +} + +int mmc_cmdq_enable(struct mmc_card *card) +{ + return mmc_cmdq_switch(card, true); +} +EXPORT_SYMBOL_GPL(mmc_cmdq_enable); + +int mmc_cmdq_disable(struct mmc_card *card) +{ + return mmc_cmdq_switch(card, false); +} +EXPORT_SYMBOL_GPL(mmc_cmdq_disable); diff --git a/drivers/mmc/core/mmc_ops.h b/drivers/mmc/core/mmc_ops.h index 74beea8a9c7e..978bd2e60f8a 100644 --- a/drivers/mmc/core/mmc_ops.h +++ b/drivers/mmc/core/mmc_ops.h @@ -46,6 +46,8 @@ int mmc_read_bkops_status(struct mmc_card *card); void mmc_start_bkops(struct mmc_card *card, bool from_exception); int mmc_can_reset(struct mmc_card *card); int mmc_flush_cache(struct mmc_card *card); +int mmc_cmdq_enable(struct mmc_card *card); +int mmc_cmdq_disable(struct mmc_card *card); #endif diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 119ef8f0155c..94637796b99c 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -89,6 +89,7 @@ struct mmc_ext_csd { unsigned int boot_ro_lock; /* ro lock support */ bool boot_ro_lockable; bool ffu_capable; /* Firmware upgrade support */ + bool cmdq_en; /* Command Queue enabled */ bool cmdq_support; /* Command Queue supported */ unsigned int cmdq_depth; /* Command Queue depth */ #define MMC_FIRMWARE_LEN 8 -- cgit v1.2.3 From 9d4579a85c84340044b10ffa6cd576397f59dc93 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 13 Mar 2017 14:36:38 +0200 Subject: mmc: mmc_test: Disable Command Queue while mmc_test is used Normal read and write commands may not be used while the command queue is enabled. Disable the Command Queue when mmc_test is probed and re-enable it when it is removed. Signed-off-by: Adrian Hunter Reviewed-by: Harjani Ritesh Reviewed-by: Linus Walleij Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc.c | 7 +++++++ drivers/mmc/core/mmc_test.c | 14 ++++++++++++++ include/linux/mmc/card.h | 2 ++ 3 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 35b2d0f767ca..2c87dede5841 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1789,6 +1789,13 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, } } + /* + * In some cases (e.g. RPMB or mmc_test), the Command Queue must be + * disabled for a time, so a flag is needed to indicate to re-enable the + * Command Queue. + */ + card->reenable_cmdq = card->ext_csd.cmdq_en; + /* * The mandatory minimum values are defined for packed command. 
* read: 5, write: 3
diff --git a/drivers/mmc/core/mmc_test.c b/drivers/mmc/core/mmc_test.c index f99ac3123fd2..fd1b4b8510b9 100644 --- a/drivers/mmc/core/mmc_test.c +++ b/drivers/mmc/core/mmc_test.c
@@ -26,6 +26,7 @@ #include "card.h" #include "host.h" #include "bus.h" +#include "mmc_ops.h" #define RESULT_OK 0 #define RESULT_FAIL 1
@@ -3264,6 +3265,14 @@ static int mmc_test_probe(struct mmc_card *card) if (ret) return ret; + if (card->ext_csd.cmdq_en) { + mmc_claim_host(card->host); + ret = mmc_cmdq_disable(card); + mmc_release_host(card->host); + if (ret) + return ret; + } + dev_info(&card->dev, "Card claimed for testing.\n"); return 0;
@@ -3271,6 +3280,11 @@ static void mmc_test_remove(struct mmc_card *card) { + if (card->reenable_cmdq) { + mmc_claim_host(card->host); + mmc_cmdq_enable(card); + mmc_release_host(card->host); + } mmc_test_free_result(card); mmc_test_free_dbgfs_file(card); }
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 119ef8f0155c..85b5f2bc8bb9 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h
@@ -269,6 +269,8 @@ struct mmc_card { #define MMC_QUIRK_TRIM_BROKEN (1<<12) /* Skip trim */ #define MMC_QUIRK_BROKEN_HPI (1<<13) /* Disable broken HPI support */ + bool reenable_cmdq; /* Re-enable Command Queue */ + unsigned int erase_size; /* erase size in sectors */ unsigned int erase_shift; /* if erase unit is power 2 */ unsigned int pref_erase; /* in sectors */
-- cgit v1.2.3

From 33e6d74d65c358270f00d228877178964aab84b3 Mon Sep 17 00:00:00 2001
From: Ulf Hansson
Date: Mon, 24 Apr 2017 13:41:55 -0500
Subject: mmc: core: Export API to allow hosts to get the card address

Some host controllers, like Cavium's, need to know whether the card operates in byte- or block-address mode. Therefore export a new API, mmc_card_is_blockaddr(), which provides this information.

Signed-off-by: Ulf Hansson
Signed-off-by: Steven J. Hill
Acked-by: David Daney
---
 drivers/mmc/core/core.c | 6 ++++++ include/linux/mmc/card.h | 2 ++ 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index e5f6bbfa73dd..82c45ddfa202 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c
@@ -2560,6 +2560,12 @@ unsigned int mmc_calc_max_discard(struct mmc_card *card) } EXPORT_SYMBOL(mmc_calc_max_discard); +bool mmc_card_is_blockaddr(struct mmc_card *card) +{ + return card ? mmc_card_blockaddr(card) : false; +} +EXPORT_SYMBOL(mmc_card_is_blockaddr); + int mmc_set_blocklen(struct mmc_card *card, unsigned int blocklen) { struct mmc_command cmd = {};
diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 85b5f2bc8bb9..aad015e0152b 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h
@@ -315,6 +315,8 @@ static inline bool mmc_large_sector(struct mmc_card *card) return card->ext_csd.data_sector_size == 4096; } +bool mmc_card_is_blockaddr(struct mmc_card *card); + #define mmc_card_mmc(c) ((c)->type == MMC_TYPE_MMC) #define mmc_card_sd(c) ((c)->type == MMC_TYPE_SD) #define mmc_card_sdio(c) ((c)->type == MMC_TYPE_SDIO)
-- cgit v1.2.3

From 29fa6c567855eb92d21122162029c5709892106b Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim
Date: Wed, 19 Apr 2017 17:36:38 -0700
Subject: f2fs: add parentheses for macro variables more

This patch adds more parentheses around macro arguments in include/linux/f2fs_fs.h.
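The extra parentheses are standard C preprocessor hygiene: without them, an argument that is itself an expression re-associates with the operators inside the macro body after expansion. A tiny standalone illustration, using hypothetical macros shaped like the GET_DENTRY_SLOTS() change in the diff below:

#include <stdio.h>

/* Hypothetical macros, shaped like GET_DENTRY_SLOTS() in the patch:
 * round up to a multiple of 8, then divide by 8.
 */
#define SLOTS_BAD(x)  ((x + 7) >> 3)	/* argument not parenthesized */
#define SLOTS_GOOD(x) (((x) + 7) >> 3)	/* what the patch enforces */

int main(void)
{
	int base = 8, extra = 1;

	/* SLOTS_BAD expands to ((base | extra + 7) >> 3): '+' binds more
	 * tightly than '|', giving (8 | 8) >> 3 = 1 instead of 2.
	 */
	printf("bad:  %d\n", SLOTS_BAD(base | extra));	/* prints 1 */
	printf("good: %d\n", SLOTS_GOOD(base | extra));	/* prints 2 */
	return 0;
}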
Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index e2d239ed4c60..639cbdf65e2b 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -32,9 +32,9 @@ /* 0, 1(node nid), 2(meta nid) are reserved node id */ #define F2FS_RESERVED_NODE_NUM 3 -#define F2FS_ROOT_INO(sbi) (sbi->root_ino_num) -#define F2FS_NODE_INO(sbi) (sbi->node_ino_num) -#define F2FS_META_INO(sbi) (sbi->meta_ino_num) +#define F2FS_ROOT_INO(sbi) ((sbi)->root_ino_num) +#define F2FS_NODE_INO(sbi) ((sbi)->node_ino_num) +#define F2FS_META_INO(sbi) ((sbi)->meta_ino_num) #define F2FS_IO_SIZE(sbi) (1 << (sbi)->write_io_size_bits) /* Blocks */ #define F2FS_IO_SIZE_KB(sbi) (1 << ((sbi)->write_io_size_bits + 2)) /* KB */ @@ -161,7 +161,7 @@ struct f2fs_checkpoint { */ #define F2FS_ORPHANS_PER_BLOCK 1020 -#define GET_ORPHAN_BLOCKS(n) ((n + F2FS_ORPHANS_PER_BLOCK - 1) / \ +#define GET_ORPHAN_BLOCKS(n) (((n) + F2FS_ORPHANS_PER_BLOCK - 1) / \ F2FS_ORPHANS_PER_BLOCK) struct f2fs_orphan_block { @@ -449,7 +449,7 @@ typedef __le32 f2fs_hash_t; #define F2FS_SLOT_LEN 8 #define F2FS_SLOT_LEN_BITS 3 -#define GET_DENTRY_SLOTS(x) ((x + F2FS_SLOT_LEN - 1) >> F2FS_SLOT_LEN_BITS) +#define GET_DENTRY_SLOTS(x) (((x) + F2FS_SLOT_LEN - 1) >> F2FS_SLOT_LEN_BITS) /* MAX level for dir lookup */ #define MAX_DIR_HASH_DEPTH 63 -- cgit v1.2.3 From e390b55d5aefe2b51569068b2a505d19d72afbf1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 24 Apr 2017 22:14:35 +0200 Subject: bpf: make bpf_xdp_adjust_head support mandatory Now that also the last in-tree user of the xdp_adjust_head bit has been removed, we can remove the flag from struct bpf_prog altogether. This, at the same time, also makes sure that any future driver for XDP comes with bpf_xdp_adjust_head() support right away. A rejection based on this flag would also mean that tail calls couldn't be used with such driver as per c2002f983767 ("bpf: fix checking xdp_adjust_head on tail calls") fix, thus lets not allow for it in the first place. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 3 +-- kernel/bpf/verifier.c | 3 --- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 511fe910bf1d..9a7786db14fa 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -413,8 +413,7 @@ struct bpf_prog { locked:1, /* Program image locked? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ - dst_needed:1, /* Do we need dst entry? */ - xdp_adjust_head:1; /* Adjusting pkt head? */ + dst_needed:1; /* Do we need dst entry? 
*/ kmemcheck_bitfield_end(meta); enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ca15cf2b85bb..6f8b6ed690be 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c
@@ -3346,8 +3346,6 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); - if (insn->imm == BPF_FUNC_xdp_adjust_head) - prog->xdp_adjust_head = 1; if (insn->imm == BPF_FUNC_tail_call) { /* If we tail call into other programs, we * cannot make any assumptions since they can
@@ -3355,7 +3353,6 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) * the program array. */ prog->cb_access = 1; - prog->xdp_adjust_head = 1; /* mark bpf_tail_call as different opcode to avoid * conditional branch in the interpeter for every normal
-- cgit v1.2.3

From c3df7c5755ee1a53cd56a4efcf3426334ab9eea4 Mon Sep 17 00:00:00 2001
From: Stephane Grosjean
Date: Thu, 19 Jan 2017 16:31:06 +0100
Subject: can: peak: move header file to new can common subdir

The CAN-FD IP from PEAK-System is used in several kinds of PC CAN-FD interfaces. Up to now, only the USB CAN-FD adapters were supported by the kernel. In order to prepare for the addition of new non-USB CAN-FD interfaces, this patch moves - and renames - the IP definitions file from its private (usb) sub-directory into a newly created, CAN-specific one.

Signed-off-by: Stephane Grosjean
Signed-off-by: Marc Kleine-Budde
---
 drivers/net/can/usb/peak_usb/pcan_ucan.h | 243 ----------------------------- drivers/net/can/usb/peak_usb/pcan_usb_fd.c | 2 +- include/linux/can/dev/peak_canfd.h | 243 +++++++++++++++++++++++++++++ 3 files changed, 244 insertions(+), 244 deletions(-) delete mode 100644 drivers/net/can/usb/peak_usb/pcan_ucan.h create mode 100644 include/linux/can/dev/peak_canfd.h

(limited to 'include/linux')

diff --git a/drivers/net/can/usb/peak_usb/pcan_ucan.h b/drivers/net/can/usb/peak_usb/pcan_ucan.h deleted file mode 100644 index 25e20ef2fef8..000000000000 --- a/drivers/net/can/usb/peak_usb/pcan_ucan.h +++ /dev/null
@@ -1,243 +0,0 @@ -/* - * CAN driver for PEAK System micro-CAN based adapters - * - * Copyright (C) 2003-2011 PEAK System-Technik GmbH - * Copyright (C) 2011-2013 Stephane Grosjean - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published - * by the Free Software Foundation; version 2 of the License. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details.
- */ -#ifndef PUCAN_H -#define PUCAN_H - -/* uCAN commands opcodes list (low-order 10 bits) */ -#define PUCAN_CMD_NOP 0x000 -#define PUCAN_CMD_RESET_MODE 0x001 -#define PUCAN_CMD_NORMAL_MODE 0x002 -#define PUCAN_CMD_LISTEN_ONLY_MODE 0x003 -#define PUCAN_CMD_TIMING_SLOW 0x004 -#define PUCAN_CMD_TIMING_FAST 0x005 -#define PUCAN_CMD_FILTER_STD 0x008 -#define PUCAN_CMD_TX_ABORT 0x009 -#define PUCAN_CMD_WR_ERR_CNT 0x00a -#define PUCAN_CMD_SET_EN_OPTION 0x00b -#define PUCAN_CMD_CLR_DIS_OPTION 0x00c -#define PUCAN_CMD_END_OF_COLLECTION 0x3ff - -/* uCAN received messages list */ -#define PUCAN_MSG_CAN_RX 0x0001 -#define PUCAN_MSG_ERROR 0x0002 -#define PUCAN_MSG_STATUS 0x0003 -#define PUCAN_MSG_BUSLOAD 0x0004 -#define PUCAN_MSG_CAN_TX 0x1000 - -/* uCAN command common header */ -struct __packed pucan_command { - __le16 opcode_channel; - u16 args[3]; -}; - -#define PUCAN_TSLOW_BRP_BITS 10 -#define PUCAN_TSLOW_TSGEG1_BITS 8 -#define PUCAN_TSLOW_TSGEG2_BITS 7 -#define PUCAN_TSLOW_SJW_BITS 7 - -#define PUCAN_TSLOW_BRP_MASK ((1 << PUCAN_TSLOW_BRP_BITS) - 1) -#define PUCAN_TSLOW_TSEG1_MASK ((1 << PUCAN_TSLOW_TSGEG1_BITS) - 1) -#define PUCAN_TSLOW_TSEG2_MASK ((1 << PUCAN_TSLOW_TSGEG2_BITS) - 1) -#define PUCAN_TSLOW_SJW_MASK ((1 << PUCAN_TSLOW_SJW_BITS) - 1) - -/* uCAN TIMING_SLOW command fields */ -#define PUCAN_TSLOW_SJW_T(s, t) (((s) & PUCAN_TSLOW_SJW_MASK) | \ - ((!!(t)) << 7)) -#define PUCAN_TSLOW_TSEG2(t) ((t) & PUCAN_TSLOW_TSEG2_MASK) -#define PUCAN_TSLOW_TSEG1(t) ((t) & PUCAN_TSLOW_TSEG1_MASK) -#define PUCAN_TSLOW_BRP(b) ((b) & PUCAN_TSLOW_BRP_MASK) - -struct __packed pucan_timing_slow { - __le16 opcode_channel; - - u8 ewl; /* Error Warning limit */ - u8 sjw_t; /* Sync Jump Width + Triple sampling */ - u8 tseg2; /* Timing SEGment 2 */ - u8 tseg1; /* Timing SEGment 1 */ - - __le16 brp; /* BaudRate Prescaler */ -}; - -#define PUCAN_TFAST_BRP_BITS 10 -#define PUCAN_TFAST_TSGEG1_BITS 5 -#define PUCAN_TFAST_TSGEG2_BITS 4 -#define PUCAN_TFAST_SJW_BITS 4 - -#define PUCAN_TFAST_BRP_MASK ((1 << PUCAN_TFAST_BRP_BITS) - 1) -#define PUCAN_TFAST_TSEG1_MASK ((1 << PUCAN_TFAST_TSGEG1_BITS) - 1) -#define PUCAN_TFAST_TSEG2_MASK ((1 << PUCAN_TFAST_TSGEG2_BITS) - 1) -#define PUCAN_TFAST_SJW_MASK ((1 << PUCAN_TFAST_SJW_BITS) - 1) - -/* uCAN TIMING_FAST command fields */ -#define PUCAN_TFAST_SJW(s) ((s) & PUCAN_TFAST_SJW_MASK) -#define PUCAN_TFAST_TSEG2(t) ((t) & PUCAN_TFAST_TSEG2_MASK) -#define PUCAN_TFAST_TSEG1(t) ((t) & PUCAN_TFAST_TSEG1_MASK) -#define PUCAN_TFAST_BRP(b) ((b) & PUCAN_TFAST_BRP_MASK) - -struct __packed pucan_timing_fast { - __le16 opcode_channel; - - u8 unused; - u8 sjw; /* Sync Jump Width */ - u8 tseg2; /* Timing SEGment 2 */ - u8 tseg1; /* Timing SEGment 1 */ - - __le16 brp; /* BaudRate Prescaler */ -}; - -/* uCAN FILTER_STD command fields */ -#define PUCAN_FLTSTD_ROW_IDX_BITS 6 - -struct __packed pucan_filter_std { - __le16 opcode_channel; - - __le16 idx; - __le32 mask; /* CAN-ID bitmask in idx range */ -}; - -/* uCAN WR_ERR_CNT command fields */ -#define PUCAN_WRERRCNT_TE 0x4000 /* Tx error cntr write Enable */ -#define PUCAN_WRERRCNT_RE 0x8000 /* Rx error cntr write Enable */ - -struct __packed pucan_wr_err_cnt { - __le16 opcode_channel; - - __le16 sel_mask; - u8 tx_counter; /* Tx error counter new value */ - u8 rx_counter; /* Rx error counter new value */ - - u16 unused; -}; - -/* uCAN SET_EN/CLR_DIS _OPTION command fields */ -#define PUCAN_OPTION_ERROR 0x0001 -#define PUCAN_OPTION_BUSLOAD 0x0002 -#define PUCAN_OPTION_CANDFDISO 0x0004 - -struct __packed pucan_options { - __le16 opcode_channel; - 
- __le16 options; - u32 unused; -}; - -/* uCAN received messages global format */ -struct __packed pucan_msg { - __le16 size; - __le16 type; - __le32 ts_low; - __le32 ts_high; -}; - -/* uCAN flags for CAN/CANFD messages */ -#define PUCAN_MSG_SELF_RECEIVE 0x80 -#define PUCAN_MSG_ERROR_STATE_IND 0x40 /* error state indicator */ -#define PUCAN_MSG_BITRATE_SWITCH 0x20 /* bitrate switch */ -#define PUCAN_MSG_EXT_DATA_LEN 0x10 /* extended data length */ -#define PUCAN_MSG_SINGLE_SHOT 0x08 -#define PUCAN_MSG_LOOPED_BACK 0x04 -#define PUCAN_MSG_EXT_ID 0x02 -#define PUCAN_MSG_RTR 0x01 - -struct __packed pucan_rx_msg { - __le16 size; - __le16 type; - __le32 ts_low; - __le32 ts_high; - __le32 tag_low; - __le32 tag_high; - u8 channel_dlc; - u8 client; - __le16 flags; - __le32 can_id; - u8 d[0]; -}; - -/* uCAN error types */ -#define PUCAN_ERMSG_BIT_ERROR 0 -#define PUCAN_ERMSG_FORM_ERROR 1 -#define PUCAN_ERMSG_STUFF_ERROR 2 -#define PUCAN_ERMSG_OTHER_ERROR 3 -#define PUCAN_ERMSG_ERR_CNT_DEC 4 - -struct __packed pucan_error_msg { - __le16 size; - __le16 type; - __le32 ts_low; - __le32 ts_high; - u8 channel_type_d; - u8 code_g; - u8 tx_err_cnt; - u8 rx_err_cnt; -}; - -#define PUCAN_BUS_PASSIVE 0x20 -#define PUCAN_BUS_WARNING 0x40 -#define PUCAN_BUS_BUSOFF 0x80 - -struct __packed pucan_status_msg { - __le16 size; - __le16 type; - __le32 ts_low; - __le32 ts_high; - u8 channel_p_w_b; - u8 unused[3]; -}; - -/* uCAN transmitted message format */ -#define PUCAN_MSG_CHANNEL_DLC(c, d) (((c) & 0xf) | ((d) << 4)) - -struct __packed pucan_tx_msg { - __le16 size; - __le16 type; - __le32 tag_low; - __le32 tag_high; - u8 channel_dlc; - u8 client; - __le16 flags; - __le32 can_id; - u8 d[0]; -}; - -/* build the cmd opcode_channel field with respect to the correct endianness */ -static inline __le16 pucan_cmd_opcode_channel(int index, int opcode) -{ - return cpu_to_le16(((index) << 12) | ((opcode) & 0x3ff)); -} - -/* return the channel number part from any received message channel_dlc field */ -static inline int pucan_msg_get_channel(const struct pucan_rx_msg *msg) -{ - return msg->channel_dlc & 0xf; -} - -/* return the dlc value from any received message channel_dlc field */ -static inline int pucan_msg_get_dlc(const struct pucan_rx_msg *msg) -{ - return msg->channel_dlc >> 4; -} - -static inline int pucan_ermsg_get_channel(const struct pucan_error_msg *msg) -{ - return msg->channel_type_d & 0x0f; -} - -static inline int pucan_stmsg_get_channel(const struct pucan_status_msg *msg) -{ - return msg->channel_p_w_b & 0x0f; -} - -#endif diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c index 64cba85230f8..175c205d3f13 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c @@ -19,10 +19,10 @@ #include #include #include +#include #include "pcan_usb_core.h" #include "pcan_usb_pro.h" -#include "pcan_ucan.h" MODULE_SUPPORTED_DEVICE("PEAK-System PCAN-USB FD adapter"); MODULE_SUPPORTED_DEVICE("PEAK-System PCAN-USB Pro FD adapter"); diff --git a/include/linux/can/dev/peak_canfd.h b/include/linux/can/dev/peak_canfd.h new file mode 100644 index 000000000000..25e20ef2fef8 --- /dev/null +++ b/include/linux/can/dev/peak_canfd.h @@ -0,0 +1,243 @@ +/* + * CAN driver for PEAK System micro-CAN based adapters + * + * Copyright (C) 2003-2011 PEAK System-Technik GmbH + * Copyright (C) 2011-2013 Stephane Grosjean + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License 
as published + * by the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef PUCAN_H +#define PUCAN_H + +/* uCAN commands opcodes list (low-order 10 bits) */ +#define PUCAN_CMD_NOP 0x000 +#define PUCAN_CMD_RESET_MODE 0x001 +#define PUCAN_CMD_NORMAL_MODE 0x002 +#define PUCAN_CMD_LISTEN_ONLY_MODE 0x003 +#define PUCAN_CMD_TIMING_SLOW 0x004 +#define PUCAN_CMD_TIMING_FAST 0x005 +#define PUCAN_CMD_FILTER_STD 0x008 +#define PUCAN_CMD_TX_ABORT 0x009 +#define PUCAN_CMD_WR_ERR_CNT 0x00a +#define PUCAN_CMD_SET_EN_OPTION 0x00b +#define PUCAN_CMD_CLR_DIS_OPTION 0x00c +#define PUCAN_CMD_END_OF_COLLECTION 0x3ff + +/* uCAN received messages list */ +#define PUCAN_MSG_CAN_RX 0x0001 +#define PUCAN_MSG_ERROR 0x0002 +#define PUCAN_MSG_STATUS 0x0003 +#define PUCAN_MSG_BUSLOAD 0x0004 +#define PUCAN_MSG_CAN_TX 0x1000 + +/* uCAN command common header */ +struct __packed pucan_command { + __le16 opcode_channel; + u16 args[3]; +}; + +#define PUCAN_TSLOW_BRP_BITS 10 +#define PUCAN_TSLOW_TSGEG1_BITS 8 +#define PUCAN_TSLOW_TSGEG2_BITS 7 +#define PUCAN_TSLOW_SJW_BITS 7 + +#define PUCAN_TSLOW_BRP_MASK ((1 << PUCAN_TSLOW_BRP_BITS) - 1) +#define PUCAN_TSLOW_TSEG1_MASK ((1 << PUCAN_TSLOW_TSGEG1_BITS) - 1) +#define PUCAN_TSLOW_TSEG2_MASK ((1 << PUCAN_TSLOW_TSGEG2_BITS) - 1) +#define PUCAN_TSLOW_SJW_MASK ((1 << PUCAN_TSLOW_SJW_BITS) - 1) + +/* uCAN TIMING_SLOW command fields */ +#define PUCAN_TSLOW_SJW_T(s, t) (((s) & PUCAN_TSLOW_SJW_MASK) | \ + ((!!(t)) << 7)) +#define PUCAN_TSLOW_TSEG2(t) ((t) & PUCAN_TSLOW_TSEG2_MASK) +#define PUCAN_TSLOW_TSEG1(t) ((t) & PUCAN_TSLOW_TSEG1_MASK) +#define PUCAN_TSLOW_BRP(b) ((b) & PUCAN_TSLOW_BRP_MASK) + +struct __packed pucan_timing_slow { + __le16 opcode_channel; + + u8 ewl; /* Error Warning limit */ + u8 sjw_t; /* Sync Jump Width + Triple sampling */ + u8 tseg2; /* Timing SEGment 2 */ + u8 tseg1; /* Timing SEGment 1 */ + + __le16 brp; /* BaudRate Prescaler */ +}; + +#define PUCAN_TFAST_BRP_BITS 10 +#define PUCAN_TFAST_TSGEG1_BITS 5 +#define PUCAN_TFAST_TSGEG2_BITS 4 +#define PUCAN_TFAST_SJW_BITS 4 + +#define PUCAN_TFAST_BRP_MASK ((1 << PUCAN_TFAST_BRP_BITS) - 1) +#define PUCAN_TFAST_TSEG1_MASK ((1 << PUCAN_TFAST_TSGEG1_BITS) - 1) +#define PUCAN_TFAST_TSEG2_MASK ((1 << PUCAN_TFAST_TSGEG2_BITS) - 1) +#define PUCAN_TFAST_SJW_MASK ((1 << PUCAN_TFAST_SJW_BITS) - 1) + +/* uCAN TIMING_FAST command fields */ +#define PUCAN_TFAST_SJW(s) ((s) & PUCAN_TFAST_SJW_MASK) +#define PUCAN_TFAST_TSEG2(t) ((t) & PUCAN_TFAST_TSEG2_MASK) +#define PUCAN_TFAST_TSEG1(t) ((t) & PUCAN_TFAST_TSEG1_MASK) +#define PUCAN_TFAST_BRP(b) ((b) & PUCAN_TFAST_BRP_MASK) + +struct __packed pucan_timing_fast { + __le16 opcode_channel; + + u8 unused; + u8 sjw; /* Sync Jump Width */ + u8 tseg2; /* Timing SEGment 2 */ + u8 tseg1; /* Timing SEGment 1 */ + + __le16 brp; /* BaudRate Prescaler */ +}; + +/* uCAN FILTER_STD command fields */ +#define PUCAN_FLTSTD_ROW_IDX_BITS 6 + +struct __packed pucan_filter_std { + __le16 opcode_channel; + + __le16 idx; + __le32 mask; /* CAN-ID bitmask in idx range */ +}; + +/* uCAN WR_ERR_CNT command fields */ +#define PUCAN_WRERRCNT_TE 0x4000 /* Tx error cntr write Enable */ +#define PUCAN_WRERRCNT_RE 0x8000 /* Rx error cntr write Enable */ + +struct __packed pucan_wr_err_cnt { + __le16 opcode_channel; + + __le16 sel_mask; + u8 
tx_counter; /* Tx error counter new value */ + u8 rx_counter; /* Rx error counter new value */ + + u16 unused; +}; + +/* uCAN SET_EN/CLR_DIS _OPTION command fields */ +#define PUCAN_OPTION_ERROR 0x0001 +#define PUCAN_OPTION_BUSLOAD 0x0002 +#define PUCAN_OPTION_CANDFDISO 0x0004 + +struct __packed pucan_options { + __le16 opcode_channel; + + __le16 options; + u32 unused; +}; + +/* uCAN received messages global format */ +struct __packed pucan_msg { + __le16 size; + __le16 type; + __le32 ts_low; + __le32 ts_high; +}; + +/* uCAN flags for CAN/CANFD messages */ +#define PUCAN_MSG_SELF_RECEIVE 0x80 +#define PUCAN_MSG_ERROR_STATE_IND 0x40 /* error state indicator */ +#define PUCAN_MSG_BITRATE_SWITCH 0x20 /* bitrate switch */ +#define PUCAN_MSG_EXT_DATA_LEN 0x10 /* extended data length */ +#define PUCAN_MSG_SINGLE_SHOT 0x08 +#define PUCAN_MSG_LOOPED_BACK 0x04 +#define PUCAN_MSG_EXT_ID 0x02 +#define PUCAN_MSG_RTR 0x01 + +struct __packed pucan_rx_msg { + __le16 size; + __le16 type; + __le32 ts_low; + __le32 ts_high; + __le32 tag_low; + __le32 tag_high; + u8 channel_dlc; + u8 client; + __le16 flags; + __le32 can_id; + u8 d[0]; +}; + +/* uCAN error types */ +#define PUCAN_ERMSG_BIT_ERROR 0 +#define PUCAN_ERMSG_FORM_ERROR 1 +#define PUCAN_ERMSG_STUFF_ERROR 2 +#define PUCAN_ERMSG_OTHER_ERROR 3 +#define PUCAN_ERMSG_ERR_CNT_DEC 4 + +struct __packed pucan_error_msg { + __le16 size; + __le16 type; + __le32 ts_low; + __le32 ts_high; + u8 channel_type_d; + u8 code_g; + u8 tx_err_cnt; + u8 rx_err_cnt; +}; + +#define PUCAN_BUS_PASSIVE 0x20 +#define PUCAN_BUS_WARNING 0x40 +#define PUCAN_BUS_BUSOFF 0x80 + +struct __packed pucan_status_msg { + __le16 size; + __le16 type; + __le32 ts_low; + __le32 ts_high; + u8 channel_p_w_b; + u8 unused[3]; +}; + +/* uCAN transmitted message format */ +#define PUCAN_MSG_CHANNEL_DLC(c, d) (((c) & 0xf) | ((d) << 4)) + +struct __packed pucan_tx_msg { + __le16 size; + __le16 type; + __le32 tag_low; + __le32 tag_high; + u8 channel_dlc; + u8 client; + __le16 flags; + __le32 can_id; + u8 d[0]; +}; + +/* build the cmd opcode_channel field with respect to the correct endianness */ +static inline __le16 pucan_cmd_opcode_channel(int index, int opcode) +{ + return cpu_to_le16(((index) << 12) | ((opcode) & 0x3ff)); +} + +/* return the channel number part from any received message channel_dlc field */ +static inline int pucan_msg_get_channel(const struct pucan_rx_msg *msg) +{ + return msg->channel_dlc & 0xf; +} + +/* return the dlc value from any received message channel_dlc field */ +static inline int pucan_msg_get_dlc(const struct pucan_rx_msg *msg) +{ + return msg->channel_dlc >> 4; +} + +static inline int pucan_ermsg_get_channel(const struct pucan_error_msg *msg) +{ + return msg->channel_type_d & 0x0f; +} + +static inline int pucan_stmsg_get_channel(const struct pucan_status_msg *msg) +{ + return msg->channel_p_w_b & 0x0f; +} + +#endif -- cgit v1.2.3 From 8ac8321e4a7981312348413b9ec314fd93d71a0c Mon Sep 17 00:00:00 2001 From: Stephane Grosjean Date: Thu, 19 Jan 2017 16:31:07 +0100 Subject: can: peak: add support for PEAK PCAN-PCIe FD CAN-FD boards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds the support of the PCAN-PCI Express FD boards made by PEAK-System, for computers using the PCI Express slot. The PCAN-PCI Express FD has one or two CAN FD channels, depending on the model. A galvanic isolation of the CAN ports protects the electronics of the card and the respective computer against disturbances of up to 500 Volts. 
The PCAN-PCI Express FD can be operated with ambient temperatures in a range of -40 to +85 °C. Such boards run an extended version of the CAN-FD IP running in the USB CAN-FD interfaces from PEAK-System, so this patch adds several new commands and their corresponding data types to the PEAK CAN-FD common definitions header file too. Signed-off-by: Stephane Grosjean Signed-off-by: Marc Kleine-Budde --- drivers/net/can/Kconfig | 1 + drivers/net/can/Makefile | 1 + drivers/net/can/peak_canfd/Kconfig | 13 + drivers/net/can/peak_canfd/Makefile | 5 + drivers/net/can/peak_canfd/peak_canfd.c | 801 ++++++++++++++++++++++++ drivers/net/can/peak_canfd/peak_canfd_user.h | 55 ++ drivers/net/can/peak_canfd/peak_pciefd_main.c | 842 ++++++++++++++++++++++++++ include/linux/can/dev/peak_canfd.h | 65 ++ 8 files changed, 1783 insertions(+) create mode 100644 drivers/net/can/peak_canfd/Kconfig create mode 100644 drivers/net/can/peak_canfd/Makefile create mode 100644 drivers/net/can/peak_canfd/peak_canfd.c create mode 100644 drivers/net/can/peak_canfd/peak_canfd_user.h create mode 100644 drivers/net/can/peak_canfd/peak_pciefd_main.c (limited to 'include/linux') diff --git a/drivers/net/can/Kconfig b/drivers/net/can/Kconfig index 22570ea3a8d2..aa20b69d2a1a 100644 --- a/drivers/net/can/Kconfig +++ b/drivers/net/can/Kconfig @@ -142,6 +142,7 @@ source "drivers/net/can/cc770/Kconfig" source "drivers/net/can/ifi_canfd/Kconfig" source "drivers/net/can/m_can/Kconfig" source "drivers/net/can/mscan/Kconfig" +source "drivers/net/can/peak_canfd/Kconfig" source "drivers/net/can/rcar/Kconfig" source "drivers/net/can/sja1000/Kconfig" source "drivers/net/can/softing/Kconfig" diff --git a/drivers/net/can/Makefile b/drivers/net/can/Makefile index 0da4f2f5c7e3..8581e2b3e87f 100644 --- a/drivers/net/can/Makefile +++ b/drivers/net/can/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_CAN_IFI_CANFD) += ifi_canfd/ obj-$(CONFIG_CAN_JANZ_ICAN3) += janz-ican3.o obj-$(CONFIG_CAN_MSCAN) += mscan/ obj-$(CONFIG_CAN_M_CAN) += m_can/ +obj-$(CONFIG_CAN_PEAK_PCIEFD) += peak_canfd/ obj-$(CONFIG_CAN_SJA1000) += sja1000/ obj-$(CONFIG_CAN_SUN4I) += sun4i_can.o obj-$(CONFIG_CAN_TI_HECC) += ti_hecc.o diff --git a/drivers/net/can/peak_canfd/Kconfig b/drivers/net/can/peak_canfd/Kconfig new file mode 100644 index 000000000000..84b30978a19f --- /dev/null +++ b/drivers/net/can/peak_canfd/Kconfig @@ -0,0 +1,13 @@ +config CAN_PEAK_PCIEFD + depends on PCI + tristate "PEAK-System PCAN-PCIe FD cards" + ---help--- + This driver adds support for the PEAK-System PCI Express FD + CAN-FD card family. + These 1x or 2x CAN-FD channel cards offer CAN 2.0 a/b as well as + CAN-FD access to the CAN bus. Besides the nominal bitrate of up to + 1 Mbit/s, the data bytes of CAN-FD frames can be transmitted with + up to 12 Mbit/s. A galvanic isolation of the CAN ports protects the + electronics of the card and the respective computer against + disturbances of up to 500 Volts. The PCAN-PCI Express FD can be + operated with ambient temperatures in a range of -40 to +85 °C.
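As the commit message notes, the new uCAN commands are plain __packed records pushed through a per-channel command buffer. A minimal sketch of how one of them is composed with the helpers from the new header (the bit-timing values are illustrative, not taken from the patch): /* build a TIMING_SLOW command for channel 0 (illustrative values) */ struct pucan_timing_slow cmd = { .opcode_channel = pucan_cmd_opcode_channel(0, PUCAN_CMD_TIMING_SLOW), .ewl = 96, /* default Error Warning limit */ .sjw_t = PUCAN_TSLOW_SJW_T(1 - 1, 0), /* sjw=1, no triple sampling */ .tseg2 = PUCAN_TSLOW_TSEG2(2 - 1), /* register values are N-1 */ .tseg1 = PUCAN_TSLOW_TSEG1(13 - 1), .brp = cpu_to_le16(PUCAN_TSLOW_BRP(5 - 1)), }; /* the record is then copied into priv->cmd_buffer and flushed with priv->write_cmd(priv), as pucan_set_timing_slow() does below */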
diff --git a/drivers/net/can/peak_canfd/Makefile b/drivers/net/can/peak_canfd/Makefile new file mode 100644 index 000000000000..3dc7a6a0ba59 --- /dev/null +++ b/drivers/net/can/peak_canfd/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the PEAK-System CAN-FD IP module drivers +# +obj-$(CONFIG_CAN_PEAK_PCIEFD) += peak_pciefd.o +peak_pciefd-y := peak_pciefd_main.o peak_canfd.o diff --git a/drivers/net/can/peak_canfd/peak_canfd.c b/drivers/net/can/peak_canfd/peak_canfd.c new file mode 100644 index 000000000000..0d57be5ea97b --- /dev/null +++ b/drivers/net/can/peak_canfd/peak_canfd.c @@ -0,0 +1,801 @@ +/* + * Copyright (C) 2007, 2011 Wolfgang Grandegger + * Copyright (C) 2012 Stephane Grosjean + * + * Copyright (C) 2016 PEAK System-Technik GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the version 2 of the GNU General Public License + * as published by the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include + +#include "peak_canfd_user.h" + +/* internal IP core cache size (used as default echo skbs max number) */ +#define PCANFD_ECHO_SKB_MAX 24 + +/* bittiming ranges of the PEAK-System PC CAN-FD interfaces */ +static const struct can_bittiming_const peak_canfd_nominal_const = { + .name = "peak_canfd", + .tseg1_min = 1, + .tseg1_max = (1 << PUCAN_TSLOW_TSGEG1_BITS), + .tseg2_min = 1, + .tseg2_max = (1 << PUCAN_TSLOW_TSGEG2_BITS), + .sjw_max = (1 << PUCAN_TSLOW_SJW_BITS), + .brp_min = 1, + .brp_max = (1 << PUCAN_TSLOW_BRP_BITS), + .brp_inc = 1, +}; + +static const struct can_bittiming_const peak_canfd_data_const = { + .name = "peak_canfd", + .tseg1_min = 1, + .tseg1_max = (1 << PUCAN_TFAST_TSGEG1_BITS), + .tseg2_min = 1, + .tseg2_max = (1 << PUCAN_TFAST_TSGEG2_BITS), + .sjw_max = (1 << PUCAN_TFAST_SJW_BITS), + .brp_min = 1, + .brp_max = (1 << PUCAN_TFAST_BRP_BITS), + .brp_inc = 1, +}; + +static struct peak_canfd_priv *pucan_init_cmd(struct peak_canfd_priv *priv) +{ + priv->cmd_len = 0; + return priv; +} + +static void *pucan_add_cmd(struct peak_canfd_priv *priv, int cmd_op) +{ + struct pucan_command *cmd; + + if (priv->cmd_len + sizeof(*cmd) > priv->cmd_maxlen) + return NULL; + + cmd = priv->cmd_buffer + priv->cmd_len; + + /* reset all unused bit to default */ + memset(cmd, 0, sizeof(*cmd)); + + cmd->opcode_channel = pucan_cmd_opcode_channel(priv->index, cmd_op); + priv->cmd_len += sizeof(*cmd); + + return cmd; +} + +static int pucan_write_cmd(struct peak_canfd_priv *priv) +{ + int err; + + if (priv->pre_cmd) { + err = priv->pre_cmd(priv); + if (err) + return err; + } + + err = priv->write_cmd(priv); + if (err) + return err; + + if (priv->post_cmd) + err = priv->post_cmd(priv); + + return err; +} + +/* uCAN commands interface functions */ +static int pucan_set_reset_mode(struct peak_canfd_priv *priv) +{ + pucan_add_cmd(pucan_init_cmd(priv), PUCAN_CMD_RESET_MODE); + return pucan_write_cmd(priv); +} + +static int pucan_set_normal_mode(struct peak_canfd_priv *priv) +{ + int err; + + pucan_add_cmd(pucan_init_cmd(priv), PUCAN_CMD_NORMAL_MODE); + err = pucan_write_cmd(priv); + if (!err) + priv->can.state = CAN_STATE_ERROR_ACTIVE; + + return err; +} + +static int pucan_set_listen_only_mode(struct peak_canfd_priv *priv) +{ + int err; + + pucan_add_cmd(pucan_init_cmd(priv), 
PUCAN_CMD_LISTEN_ONLY_MODE); + err = pucan_write_cmd(priv); + if (!err) + priv->can.state = CAN_STATE_ERROR_ACTIVE; + + return err; +} + +static int pucan_set_timing_slow(struct peak_canfd_priv *priv, + const struct can_bittiming *pbt) +{ + struct pucan_timing_slow *cmd; + + cmd = pucan_add_cmd(pucan_init_cmd(priv), PUCAN_CMD_TIMING_SLOW); + + cmd->sjw_t = PUCAN_TSLOW_SJW_T(pbt->sjw - 1, + priv->can.ctrlmode & CAN_CTRLMODE_3_SAMPLES); + cmd->tseg1 = PUCAN_TSLOW_TSEG1(pbt->prop_seg + pbt->phase_seg1 - 1); + cmd->tseg2 = PUCAN_TSLOW_TSEG2(pbt->phase_seg2 - 1); + cmd->brp = cpu_to_le16(PUCAN_TSLOW_BRP(pbt->brp - 1)); + + cmd->ewl = 96; /* default */ + + netdev_dbg(priv->ndev, + "nominal: brp=%u tseg1=%u tseg2=%u sjw=%u\n", + le16_to_cpu(cmd->brp), cmd->tseg1, cmd->tseg2, cmd->sjw_t); + + return pucan_write_cmd(priv); +} + +static int pucan_set_timing_fast(struct peak_canfd_priv *priv, + const struct can_bittiming *pbt) +{ + struct pucan_timing_fast *cmd; + + cmd = pucan_add_cmd(pucan_init_cmd(priv), PUCAN_CMD_TIMING_FAST); + + cmd->sjw = PUCAN_TFAST_SJW(pbt->sjw - 1); + cmd->tseg1 = PUCAN_TFAST_TSEG1(pbt->prop_seg + pbt->phase_seg1 - 1); + cmd->tseg2 = PUCAN_TFAST_TSEG2(pbt->phase_seg2 - 1); + cmd->brp = cpu_to_le16(PUCAN_TFAST_BRP(pbt->brp - 1)); + + netdev_dbg(priv->ndev, + "data: brp=%u tseg1=%u tseg2=%u sjw=%u\n", + le16_to_cpu(cmd->brp), cmd->tseg1, cmd->tseg2, cmd->sjw); + + return pucan_write_cmd(priv); +} + +static int pucan_set_std_filter(struct peak_canfd_priv *priv, u8 row, u32 mask) +{ + struct pucan_std_filter *cmd; + + cmd = pucan_add_cmd(pucan_init_cmd(priv), PUCAN_CMD_SET_STD_FILTER); + + /* all the 11-bits CAN ID values are represented by one bit in a + * 64 rows array of 32 bits: the upper 6 bits of the CAN ID select the + * row while the lowest 5 bits select the bit in that row. 
+ * + * bit filter + * 1 passed + * 0 discarded + */ + + /* select the row */ + cmd->idx = row; + + /* set/unset bits in the row */ + cmd->mask = cpu_to_le32(mask); + + return pucan_write_cmd(priv); +} + +static int pucan_tx_abort(struct peak_canfd_priv *priv, u16 flags) +{ + struct pucan_tx_abort *cmd; + + cmd = pucan_add_cmd(pucan_init_cmd(priv), PUCAN_CMD_TX_ABORT); + + cmd->flags = cpu_to_le16(flags); + + return pucan_write_cmd(priv); +} + +static int pucan_clr_err_counters(struct peak_canfd_priv *priv) +{ + struct pucan_wr_err_cnt *cmd; + + cmd = pucan_add_cmd(pucan_init_cmd(priv), PUCAN_CMD_WR_ERR_CNT); + + cmd->sel_mask = cpu_to_le16(PUCAN_WRERRCNT_TE | PUCAN_WRERRCNT_RE); + cmd->tx_counter = 0; + cmd->rx_counter = 0; + + return pucan_write_cmd(priv); +} + +static int pucan_set_options(struct peak_canfd_priv *priv, u16 opt_mask) +{ + struct pucan_options *cmd; + + cmd = pucan_add_cmd(pucan_init_cmd(priv), PUCAN_CMD_SET_EN_OPTION); + + cmd->options = cpu_to_le16(opt_mask); + + return pucan_write_cmd(priv); +} + +static int pucan_clr_options(struct peak_canfd_priv *priv, u16 opt_mask) +{ + struct pucan_options *cmd; + + cmd = pucan_add_cmd(pucan_init_cmd(priv), PUCAN_CMD_CLR_DIS_OPTION); + + cmd->options = cpu_to_le16(opt_mask); + + return pucan_write_cmd(priv); +} + +static int pucan_setup_rx_barrier(struct peak_canfd_priv *priv) +{ + pucan_add_cmd(pucan_init_cmd(priv), PUCAN_CMD_RX_BARRIER); + + return pucan_write_cmd(priv); +} + +/* handle the reception of one CAN frame */ +static int pucan_handle_can_rx(struct peak_canfd_priv *priv, + struct pucan_rx_msg *msg) +{ + struct net_device_stats *stats = &priv->ndev->stats; + struct canfd_frame *cf; + struct sk_buff *skb; + const u16 rx_msg_flags = le16_to_cpu(msg->flags); + u8 cf_len; + + if (rx_msg_flags & PUCAN_MSG_EXT_DATA_LEN) + cf_len = can_dlc2len(get_canfd_dlc(pucan_msg_get_dlc(msg))); + else + cf_len = get_can_dlc(pucan_msg_get_dlc(msg)); + + /* if this frame is an echo, */ + if ((rx_msg_flags & PUCAN_MSG_LOOPED_BACK) && + !(rx_msg_flags & PUCAN_MSG_SELF_RECEIVE)) { + int n; + unsigned long flags; + + spin_lock_irqsave(&priv->echo_lock, flags); + n = can_get_echo_skb(priv->ndev, msg->client); + spin_unlock_irqrestore(&priv->echo_lock, flags); + + /* count bytes of the echo instead of skb */ + stats->tx_bytes += cf_len; + stats->tx_packets++; + + if (n) { + /* restart tx queue only if a slot is free */ + netif_wake_queue(priv->ndev); + } + + return 0; + } + + /* otherwise, it should be pushed into rx fifo */ + if (rx_msg_flags & PUCAN_MSG_EXT_DATA_LEN) { + /* CANFD frame case */ + skb = alloc_canfd_skb(priv->ndev, &cf); + if (!skb) + return -ENOMEM; + + if (rx_msg_flags & PUCAN_MSG_BITRATE_SWITCH) + cf->flags |= CANFD_BRS; + + if (rx_msg_flags & PUCAN_MSG_ERROR_STATE_IND) + cf->flags |= CANFD_ESI; + } else { + /* CAN 2.0 frame case */ + skb = alloc_can_skb(priv->ndev, (struct can_frame **)&cf); + if (!skb) + return -ENOMEM; + } + + cf->can_id = le32_to_cpu(msg->can_id); + cf->len = cf_len; + + if (rx_msg_flags & PUCAN_MSG_EXT_ID) + cf->can_id |= CAN_EFF_FLAG; + + if (rx_msg_flags & PUCAN_MSG_RTR) + cf->can_id |= CAN_RTR_FLAG; + else + memcpy(cf->data, msg->d, cf->len); + + stats->rx_bytes += cf->len; + stats->rx_packets++; + + netif_rx(skb); + + return 0; +} + +/* handle rx/tx error counters notification */ +static int pucan_handle_error(struct peak_canfd_priv *priv, + struct pucan_error_msg *msg) +{ + priv->bec.txerr = msg->tx_err_cnt; + priv->bec.rxerr = msg->rx_err_cnt; + + return 0; +} + +/* handle status notification */ 
+static int pucan_handle_status(struct peak_canfd_priv *priv, + struct pucan_status_msg *msg) +{ + struct net_device *ndev = priv->ndev; + struct net_device_stats *stats = &ndev->stats; + struct can_frame *cf; + struct sk_buff *skb; + + /* this STATUS is the CNF of the RX_BARRIER: Tx path can be setup */ + if (pucan_status_is_rx_barrier(msg)) { + unsigned long flags; + + if (priv->enable_tx_path) { + int err = priv->enable_tx_path(priv); + + if (err) + return err; + } + + /* restart network queue only if echo skb array is free */ + spin_lock_irqsave(&priv->echo_lock, flags); + + if (!priv->can.echo_skb[priv->echo_idx]) { + spin_unlock_irqrestore(&priv->echo_lock, flags); + + netif_wake_queue(ndev); + } else { + spin_unlock_irqrestore(&priv->echo_lock, flags); + } + + return 0; + } + + skb = alloc_can_err_skb(ndev, &cf); + + /* test state error bits according to their priority */ + if (pucan_status_is_busoff(msg)) { + netdev_dbg(ndev, "Bus-off entry status\n"); + priv->can.state = CAN_STATE_BUS_OFF; + priv->can.can_stats.bus_off++; + can_bus_off(ndev); + if (skb) + cf->can_id |= CAN_ERR_BUSOFF; + + } else if (pucan_status_is_passive(msg)) { + netdev_dbg(ndev, "Error passive status\n"); + priv->can.state = CAN_STATE_ERROR_PASSIVE; + priv->can.can_stats.error_passive++; + if (skb) { + cf->can_id |= CAN_ERR_CRTL; + cf->data[1] = (priv->bec.txerr > priv->bec.rxerr) ? + CAN_ERR_CRTL_TX_PASSIVE : + CAN_ERR_CRTL_RX_PASSIVE; + cf->data[6] = priv->bec.txerr; + cf->data[7] = priv->bec.rxerr; + } + + } else if (pucan_status_is_warning(msg)) { + netdev_dbg(ndev, "Error warning status\n"); + priv->can.state = CAN_STATE_ERROR_WARNING; + priv->can.can_stats.error_warning++; + if (skb) { + cf->can_id |= CAN_ERR_CRTL; + cf->data[1] = (priv->bec.txerr > priv->bec.rxerr) ? 
+ CAN_ERR_CRTL_TX_WARNING : + CAN_ERR_CRTL_RX_WARNING; + cf->data[6] = priv->bec.txerr; + cf->data[7] = priv->bec.rxerr; + } + + } else if (priv->can.state != CAN_STATE_ERROR_ACTIVE) { + /* back to ERROR_ACTIVE */ + netdev_dbg(ndev, "Error active status\n"); + can_change_state(ndev, cf, CAN_STATE_ERROR_ACTIVE, + CAN_STATE_ERROR_ACTIVE); + } else { + dev_kfree_skb(skb); + return 0; + } + + if (!skb) { + stats->rx_dropped++; + return -ENOMEM; + } + + stats->rx_packets++; + stats->rx_bytes += cf->can_dlc; + netif_rx(skb); + + return 0; +} + +/* handle uCAN Rx overflow notification */ +static int pucan_handle_cache_critical(struct peak_canfd_priv *priv) +{ + struct net_device_stats *stats = &priv->ndev->stats; + struct can_frame *cf; + struct sk_buff *skb; + + stats->rx_over_errors++; + stats->rx_errors++; + + skb = alloc_can_err_skb(priv->ndev, &cf); + if (!skb) { + stats->rx_dropped++; + return -ENOMEM; + } + + cf->can_id |= CAN_ERR_CRTL; + cf->data[1] = CAN_ERR_CRTL_RX_OVERFLOW; + + cf->data[6] = priv->bec.txerr; + cf->data[7] = priv->bec.rxerr; + + stats->rx_bytes += cf->can_dlc; + stats->rx_packets++; + netif_rx(skb); + + return 0; +} + +/* handle a single uCAN message */ +int peak_canfd_handle_msg(struct peak_canfd_priv *priv, + struct pucan_rx_msg *msg) +{ + u16 msg_type = le16_to_cpu(msg->type); + int msg_size = le16_to_cpu(msg->size); + int err; + + if (!msg_size || !msg_type) { + /* null packet found: end of list */ + goto exit; + } + + switch (msg_type) { + case PUCAN_MSG_CAN_RX: + err = pucan_handle_can_rx(priv, (struct pucan_rx_msg *)msg); + break; + case PUCAN_MSG_ERROR: + err = pucan_handle_error(priv, (struct pucan_error_msg *)msg); + break; + case PUCAN_MSG_STATUS: + err = pucan_handle_status(priv, (struct pucan_status_msg *)msg); + break; + case PUCAN_MSG_CACHE_CRITICAL: + err = pucan_handle_cache_critical(priv); + break; + default: + err = 0; + } + + if (err < 0) + return err; + +exit: + return msg_size; +} + +/* handle a list of rx_count messages from rx_msg memory address */ +int peak_canfd_handle_msgs_list(struct peak_canfd_priv *priv, + struct pucan_rx_msg *msg_list, int msg_count) +{ + void *msg_ptr = msg_list; + int i, msg_size; + + for (i = 0; i < msg_count; i++) { + msg_size = peak_canfd_handle_msg(priv, msg_ptr); + + /* a null packet can be found at the end of a list */ + if (msg_size <= 0) + break; + + msg_ptr += msg_size; + } + + if (msg_size < 0) + return msg_size; + + return i; +} + +static int peak_canfd_start(struct peak_canfd_priv *priv) +{ + int err; + + err = pucan_clr_err_counters(priv); + if (err) + goto err_exit; + + priv->echo_idx = 0; + + priv->bec.txerr = 0; + priv->bec.rxerr = 0; + + if (priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY) + err = pucan_set_listen_only_mode(priv); + else + err = pucan_set_normal_mode(priv); + +err_exit: + return err; +} + +static void peak_canfd_stop(struct peak_canfd_priv *priv) +{ + int err; + + /* go back to RESET mode */ + err = pucan_set_reset_mode(priv); + if (err) { + netdev_err(priv->ndev, "channel %u reset failed\n", + priv->index); + } else { + /* abort last Tx (MUST be done in RESET mode only!) 
*/ + pucan_tx_abort(priv, PUCAN_TX_ABORT_FLUSH); + } +} + +static int peak_canfd_set_mode(struct net_device *ndev, enum can_mode mode) +{ + struct peak_canfd_priv *priv = netdev_priv(ndev); + + switch (mode) { + case CAN_MODE_START: + peak_canfd_start(priv); + netif_wake_queue(ndev); + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int peak_canfd_get_berr_counter(const struct net_device *ndev, + struct can_berr_counter *bec) +{ + struct peak_canfd_priv *priv = netdev_priv(ndev); + + *bec = priv->bec; + return 0; +} + +static int peak_canfd_open(struct net_device *ndev) +{ + struct peak_canfd_priv *priv = netdev_priv(ndev); + int i, err = 0; + + err = open_candev(ndev); + if (err) { + netdev_err(ndev, "open_candev() failed, error %d\n", err); + goto err_exit; + } + + err = pucan_set_reset_mode(priv); + if (err) + goto err_close; + + if (priv->can.ctrlmode & CAN_CTRLMODE_FD) { + if (priv->can.ctrlmode & CAN_CTRLMODE_FD_NON_ISO) + err = pucan_clr_options(priv, PUCAN_OPTION_CANDFDISO); + else + err = pucan_set_options(priv, PUCAN_OPTION_CANDFDISO); + + if (err) + goto err_close; + } + + /* set option: get rx/tx error counters */ + err = pucan_set_options(priv, PUCAN_OPTION_ERROR); + if (err) + goto err_close; + + /* accept all standard CAN ID */ + for (i = 0; i <= PUCAN_FLTSTD_ROW_IDX_MAX; i++) + pucan_set_std_filter(priv, i, 0xffffffff); + + err = peak_canfd_start(priv); + if (err) + goto err_close; + + /* receiving the RB status says when Tx path is ready */ + err = pucan_setup_rx_barrier(priv); + if (!err) + goto err_exit; + +err_close: + close_candev(ndev); +err_exit: + return err; +} + +static int peak_canfd_set_bittiming(struct net_device *ndev) +{ + struct peak_canfd_priv *priv = netdev_priv(ndev); + + return pucan_set_timing_slow(priv, &priv->can.bittiming); +} + +static int peak_canfd_set_data_bittiming(struct net_device *ndev) +{ + struct peak_canfd_priv *priv = netdev_priv(ndev); + + return pucan_set_timing_fast(priv, &priv->can.data_bittiming); +} + +static int peak_canfd_close(struct net_device *ndev) +{ + struct peak_canfd_priv *priv = netdev_priv(ndev); + + netif_stop_queue(ndev); + peak_canfd_stop(priv); + close_candev(ndev); + + return 0; +} + +static netdev_tx_t peak_canfd_start_xmit(struct sk_buff *skb, + struct net_device *ndev) +{ + struct peak_canfd_priv *priv = netdev_priv(ndev); + struct net_device_stats *stats = &ndev->stats; + struct canfd_frame *cf = (struct canfd_frame *)skb->data; + struct pucan_tx_msg *msg; + u16 msg_size, msg_flags; + unsigned long flags; + bool should_stop_tx_queue; + int room_left; + u8 can_dlc; + + if (can_dropped_invalid_skb(ndev, skb)) + return NETDEV_TX_OK; + + msg_size = ALIGN(sizeof(*msg) + cf->len, 4); + msg = priv->alloc_tx_msg(priv, msg_size, &room_left); + + /* should never happen except under bus-off condition and (auto-)restart + * mechanism + */ + if (!msg) { + stats->tx_dropped++; + netif_stop_queue(ndev); + return NETDEV_TX_BUSY; + } + + msg->size = cpu_to_le16(msg_size); + msg->type = cpu_to_le16(PUCAN_MSG_CAN_TX); + msg_flags = 0; + + if (cf->can_id & CAN_EFF_FLAG) { + msg_flags |= PUCAN_MSG_EXT_ID; + msg->can_id = cpu_to_le32(cf->can_id & CAN_EFF_MASK); + } else { + msg->can_id = cpu_to_le32(cf->can_id & CAN_SFF_MASK); + } + + if (can_is_canfd_skb(skb)) { + /* CAN FD frame format */ + can_dlc = can_len2dlc(cf->len); + + msg_flags |= PUCAN_MSG_EXT_DATA_LEN; + + if (cf->flags & CANFD_BRS) + msg_flags |= PUCAN_MSG_BITRATE_SWITCH; + + if (cf->flags & CANFD_ESI) + msg_flags |= PUCAN_MSG_ERROR_STATE_IND; + } 
else { + /* CAN 2.0 frame format */ + can_dlc = cf->len; + + if (cf->can_id & CAN_RTR_FLAG) + msg_flags |= PUCAN_MSG_RTR; + } + + /* always ask loopback for echo management */ + msg_flags |= PUCAN_MSG_LOOPED_BACK; + + /* set driver specific bit to differentiate with application loopback */ + if (priv->can.ctrlmode & CAN_CTRLMODE_LOOPBACK) + msg_flags |= PUCAN_MSG_SELF_RECEIVE; + + msg->flags = cpu_to_le16(msg_flags); + msg->channel_dlc = PUCAN_MSG_CHANNEL_DLC(priv->index, can_dlc); + memcpy(msg->d, cf->data, cf->len); + + /* struct msg client field is used as an index in the echo skbs ring */ + msg->client = priv->echo_idx; + + spin_lock_irqsave(&priv->echo_lock, flags); + + /* prepare and save echo skb in internal slot */ + can_put_echo_skb(skb, ndev, priv->echo_idx); + + /* move echo index to the next slot */ + priv->echo_idx = (priv->echo_idx + 1) % priv->can.echo_skb_max; + + /* if next slot is not free, stop network queue (no slot free in echo + * skb ring means that the controller did not write these frames on + * the bus: no need to continue). + */ + should_stop_tx_queue = !!(priv->can.echo_skb[priv->echo_idx]); + + spin_unlock_irqrestore(&priv->echo_lock, flags); + + /* write the skb on the interface */ + priv->write_tx_msg(priv, msg); + + /* stop network tx queue if not enough room to save one more msg too */ + if (priv->can.ctrlmode & CAN_CTRLMODE_FD) + should_stop_tx_queue |= (room_left < + (sizeof(*msg) + CANFD_MAX_DLEN)); + else + should_stop_tx_queue |= (room_left < + (sizeof(*msg) + CAN_MAX_DLEN)); + + if (should_stop_tx_queue) + netif_stop_queue(ndev); + + return NETDEV_TX_OK; +} + +static const struct net_device_ops peak_canfd_netdev_ops = { + .ndo_open = peak_canfd_open, + .ndo_stop = peak_canfd_close, + .ndo_start_xmit = peak_canfd_start_xmit, + .ndo_change_mtu = can_change_mtu, +}; + +struct net_device *alloc_peak_canfd_dev(int sizeof_priv, int index, + int echo_skb_max) +{ + struct net_device *ndev; + struct peak_canfd_priv *priv; + + /* we DO support local echo */ + if (echo_skb_max < 0) + echo_skb_max = PCANFD_ECHO_SKB_MAX; + + /* allocate the candev object */ + ndev = alloc_candev(sizeof_priv, echo_skb_max); + if (!ndev) + return NULL; + + priv = netdev_priv(ndev); + + /* complete now socket-can initialization side */ + priv->can.state = CAN_STATE_STOPPED; + priv->can.bittiming_const = &peak_canfd_nominal_const; + priv->can.data_bittiming_const = &peak_canfd_data_const; + + priv->can.do_set_mode = peak_canfd_set_mode; + priv->can.do_get_berr_counter = peak_canfd_get_berr_counter; + priv->can.do_set_bittiming = peak_canfd_set_bittiming; + priv->can.do_set_data_bittiming = peak_canfd_set_data_bittiming; + priv->can.ctrlmode_supported = CAN_CTRLMODE_LOOPBACK | + CAN_CTRLMODE_LISTENONLY | + CAN_CTRLMODE_3_SAMPLES | + CAN_CTRLMODE_FD | + CAN_CTRLMODE_FD_NON_ISO | + CAN_CTRLMODE_BERR_REPORTING; + + priv->ndev = ndev; + priv->index = index; + priv->cmd_len = 0; + spin_lock_init(&priv->echo_lock); + + ndev->flags |= IFF_ECHO; + ndev->netdev_ops = &peak_canfd_netdev_ops; + ndev->dev_id = index; + + return ndev; +} diff --git a/drivers/net/can/peak_canfd/peak_canfd_user.h b/drivers/net/can/peak_canfd/peak_canfd_user.h new file mode 100644 index 000000000000..bf6de47f69c2 --- /dev/null +++ b/drivers/net/can/peak_canfd/peak_canfd_user.h @@ -0,0 +1,55 @@ +/* + * CAN driver for PEAK System micro-CAN based adapters + * + * Copyright (C) 2003-2011 PEAK System-Technik GmbH + * Copyright (C) 2011-2013 Stephane Grosjean + * + * This program is free software; you can redistribute it 
and/or modify it + * under the terms of the GNU General Public License as published + * by the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef PEAK_CANFD_USER_H +#define PEAK_CANFD_USER_H + +#include + +#define PCANFD_ECHO_SKB_DEF -1 + +/* data structure private to each uCAN interface */ +struct peak_canfd_priv { + struct can_priv can; /* socket-can private data */ + struct net_device *ndev; /* network device */ + int index; /* channel index */ + + struct can_berr_counter bec; /* rx/tx err counters */ + + int echo_idx; /* echo skb free slot index */ + spinlock_t echo_lock; + + int cmd_len; + void *cmd_buffer; + int cmd_maxlen; + + int (*pre_cmd)(struct peak_canfd_priv *priv); + int (*write_cmd)(struct peak_canfd_priv *priv); + int (*post_cmd)(struct peak_canfd_priv *priv); + + int (*enable_tx_path)(struct peak_canfd_priv *priv); + void *(*alloc_tx_msg)(struct peak_canfd_priv *priv, u16 msg_size, + int *room_left); + int (*write_tx_msg)(struct peak_canfd_priv *priv, + struct pucan_tx_msg *msg); +}; + +struct net_device *alloc_peak_canfd_dev(int sizeof_priv, int index, + int echo_skb_max); +int peak_canfd_handle_msg(struct peak_canfd_priv *priv, + struct pucan_rx_msg *msg); +int peak_canfd_handle_msgs_list(struct peak_canfd_priv *priv, + struct pucan_rx_msg *rx_msg, int rx_count); +#endif diff --git a/drivers/net/can/peak_canfd/peak_pciefd_main.c b/drivers/net/can/peak_canfd/peak_pciefd_main.c new file mode 100644 index 000000000000..51c2d182a33a --- /dev/null +++ b/drivers/net/can/peak_canfd/peak_pciefd_main.c @@ -0,0 +1,842 @@ +/* + * Copyright (C) 2007, 2011 Wolfgang Grandegger + * Copyright (C) 2012 Stephane Grosjean + * + * Derived from the PCAN project file driver/src/pcan_pci.c: + * + * Copyright (C) 2001-2006 PEAK System-Technik GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the version 2 of the GNU General Public License + * as published by the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "peak_canfd_user.h" + +MODULE_AUTHOR("Stephane Grosjean "); +MODULE_DESCRIPTION("Socket-CAN driver for PEAK PCAN PCIe FD family cards"); +MODULE_SUPPORTED_DEVICE("PEAK PCAN PCIe FD CAN cards"); +MODULE_LICENSE("GPL v2"); + +#define PCIEFD_DRV_NAME "peak_pciefd" + +#define PEAK_PCI_VENDOR_ID 0x001c /* The PCI device and vendor IDs */ +#define PEAK_PCIEFD_ID 0x0013 /* for PCIe slot cards */ + +/* PEAK PCIe board access description */ +#define PCIEFD_BAR0_SIZE (64 * 1024) +#define PCIEFD_RX_DMA_SIZE (4 * 1024) +#define PCIEFD_TX_DMA_SIZE (4 * 1024) + +#define PCIEFD_TX_PAGE_SIZE (2 * 1024) + +/* System Control Registers */ +#define PCIEFD_REG_SYS_CTL_SET 0x0000 /* set bits */ +#define PCIEFD_REG_SYS_CTL_CLR 0x0004 /* clear bits */ + +/* Version info registers */ +#define PCIEFD_REG_SYS_VER1 0x0040 /* version reg #1 */ +#define PCIEFD_REG_SYS_VER2 0x0044 /* version reg #2 */ + +/* System Control Registers Bits */ +#define PCIEFD_SYS_CTL_TS_RST 0x00000001 /* timestamp clock */ +#define PCIEFD_SYS_CTL_CLK_EN 0x00000002 /* system clock */ + +/* CAN-FD channel addresses */ +#define PCIEFD_CANX_OFF(c) (((c) + 1) * 0x1000) + +#define PCIEFD_ECHO_SKB_MAX PCANFD_ECHO_SKB_DEF + +/* CAN-FD channel registers */ +#define PCIEFD_REG_CAN_MISC 0x0000 /* Misc. control */ +#define PCIEFD_REG_CAN_CLK_SEL 0x0008 /* Clock selector */ +#define PCIEFD_REG_CAN_CMD_PORT_L 0x0010 /* 64-bits command port */ +#define PCIEFD_REG_CAN_CMD_PORT_H 0x0014 +#define PCIEFD_REG_CAN_TX_REQ_ACC 0x0020 /* Tx request accumulator */ +#define PCIEFD_REG_CAN_TX_CTL_SET 0x0030 /* Tx control set register */ +#define PCIEFD_REG_CAN_TX_CTL_CLR 0x0038 /* Tx control clear register */ +#define PCIEFD_REG_CAN_TX_DMA_ADDR_L 0x0040 /* 64-bits addr for Tx DMA */ +#define PCIEFD_REG_CAN_TX_DMA_ADDR_H 0x0044 +#define PCIEFD_REG_CAN_RX_CTL_SET 0x0050 /* Rx control set register */ +#define PCIEFD_REG_CAN_RX_CTL_CLR 0x0058 /* Rx control clear register */ +#define PCIEFD_REG_CAN_RX_CTL_WRT 0x0060 /* Rx control write register */ +#define PCIEFD_REG_CAN_RX_CTL_ACK 0x0068 /* Rx control ACK register */ +#define PCIEFD_REG_CAN_RX_DMA_ADDR_L 0x0070 /* 64-bits addr for Rx DMA */ +#define PCIEFD_REG_CAN_RX_DMA_ADDR_H 0x0074 + +/* CAN-FD channel misc register bits */ +#define CANFD_MISC_TS_RST 0x00000001 /* timestamp cnt rst */ + +/* CAN-FD channel Clock SELector Source & DIVider */ +#define CANFD_CLK_SEL_DIV_MASK 0x00000007 +#define CANFD_CLK_SEL_DIV_60MHZ 0x00000000 /* SRC=240MHz only */ +#define CANFD_CLK_SEL_DIV_40MHZ 0x00000001 /* SRC=240MHz only */ +#define CANFD_CLK_SEL_DIV_30MHZ 0x00000002 /* SRC=240MHz only */ +#define CANFD_CLK_SEL_DIV_24MHZ 0x00000003 /* SRC=240MHz only */ +#define CANFD_CLK_SEL_DIV_20MHZ 0x00000004 /* SRC=240MHz only */ + +#define CANFD_CLK_SEL_SRC_MASK 0x00000008 /* 0=80MHz, 1=240MHz */ +#define CANFD_CLK_SEL_SRC_240MHZ 0x00000008 +#define CANFD_CLK_SEL_SRC_80MHZ (~CANFD_CLK_SEL_SRC_240MHZ & \ + CANFD_CLK_SEL_SRC_MASK) + +#define CANFD_CLK_SEL_20MHZ (CANFD_CLK_SEL_SRC_240MHZ |\ + CANFD_CLK_SEL_DIV_20MHZ) +#define CANFD_CLK_SEL_24MHZ (CANFD_CLK_SEL_SRC_240MHZ |\ + CANFD_CLK_SEL_DIV_24MHZ) +#define CANFD_CLK_SEL_30MHZ (CANFD_CLK_SEL_SRC_240MHZ |\ + CANFD_CLK_SEL_DIV_30MHZ) +#define CANFD_CLK_SEL_40MHZ (CANFD_CLK_SEL_SRC_240MHZ |\ + CANFD_CLK_SEL_DIV_40MHZ) +#define CANFD_CLK_SEL_60MHZ (CANFD_CLK_SEL_SRC_240MHZ |\ + CANFD_CLK_SEL_DIV_60MHZ) +#define CANFD_CLK_SEL_80MHZ (CANFD_CLK_SEL_SRC_80MHZ) + +/* CAN-FD channel Rx/Tx 
control register bits */ +#define CANFD_CTL_UNC_BIT 0x00010000 /* Uncached DMA mem */ +#define CANFD_CTL_RST_BIT 0x00020000 /* reset DMA action */ +#define CANFD_CTL_IEN_BIT 0x00040000 /* IRQ enable */ + +/* Rx IRQ Count and Time Limits */ +#define CANFD_CTL_IRQ_CL_DEF 16 /* Rx msg max nb per IRQ in Rx DMA */ +#define CANFD_CTL_IRQ_TL_DEF 10 /* Time before IRQ if < CL (x100 µs) */ + +#define CANFD_OPTIONS_SET (CANFD_OPTION_ERROR | CANFD_OPTION_BUSLOAD) + +/* Tx anticipation window (link logical address should be aligned on 2K + * boundary) + */ +#define PCIEFD_TX_PAGE_COUNT (PCIEFD_TX_DMA_SIZE / PCIEFD_TX_PAGE_SIZE) + +#define CANFD_MSG_LNK_TX 0x1001 /* Tx msgs link */ + +/* 32-bits IRQ status fields, heading Rx DMA area */ +static inline int pciefd_irq_tag(u32 irq_status) +{ + return irq_status & 0x0000000f; +} + +static inline int pciefd_irq_rx_cnt(u32 irq_status) +{ + return (irq_status & 0x000007f0) >> 4; +} + +static inline int pciefd_irq_is_lnk(u32 irq_status) +{ + return irq_status & 0x00010000; +} + +/* Rx record */ +struct pciefd_rx_dma { + __le32 irq_status; + __le32 sys_time_low; + __le32 sys_time_high; + struct pucan_rx_msg msg[0]; +} __packed __aligned(4); + +/* Tx Link record */ +struct pciefd_tx_link { + __le16 size; + __le16 type; + __le32 laddr_lo; + __le32 laddr_hi; +} __packed __aligned(4); + +/* Tx page descriptor */ +struct pciefd_page { + void *vbase; /* page virtual address */ + dma_addr_t lbase; /* page logical address */ + u32 offset; + u32 size; +}; + +#define CANFD_IRQ_SET 0x00000001 +#define CANFD_TX_PATH_SET 0x00000002 + +/* CAN-FD channel object */ +struct pciefd_board; +struct pciefd_can { + struct peak_canfd_priv ucan; /* must be the first member */ + void __iomem *reg_base; /* channel config base addr */ + struct pciefd_board *board; /* reverse link */ + + struct pucan_command pucan_cmd; /* command buffer */ + + dma_addr_t rx_dma_laddr; /* DMA virtual and logical addr */ + void *rx_dma_vaddr; /* for Rx and Tx areas */ + dma_addr_t tx_dma_laddr; + void *tx_dma_vaddr; + + struct pciefd_page tx_pages[PCIEFD_TX_PAGE_COUNT]; + u16 tx_pages_free; /* free Tx pages counter */ + u16 tx_page_index; /* current page used for Tx */ + spinlock_t tx_lock; + + u32 irq_status; + u32 irq_tag; /* next irq tag */ +}; + +/* PEAK-PCIe FD board object */ +struct pciefd_board { + void __iomem *reg_base; + struct pci_dev *pci_dev; + int can_count; + spinlock_t cmd_lock; /* 64-bits cmds must be atomic */ + struct pciefd_can *can[0]; /* array of network devices */ +}; + +/* supported device ids. 
*/ +static const struct pci_device_id peak_pciefd_tbl[] = { + {PEAK_PCI_VENDOR_ID, PEAK_PCIEFD_ID, PCI_ANY_ID, PCI_ANY_ID,}, + {0,} +}; + +MODULE_DEVICE_TABLE(pci, peak_pciefd_tbl); + +/* read a 32 bits value from a SYS block register */ +static inline u32 pciefd_sys_readreg(const struct pciefd_board *priv, u16 reg) +{ + return readl(priv->reg_base + reg); +} + +/* write a 32 bits value into a SYS block register */ +static inline void pciefd_sys_writereg(const struct pciefd_board *priv, + u32 val, u16 reg) +{ + writel(val, priv->reg_base + reg); +} + +/* read a 32 bits value from CAN-FD block register */ +static inline u32 pciefd_can_readreg(const struct pciefd_can *priv, u16 reg) +{ + return readl(priv->reg_base + reg); +} + +/* write a 32 bits value into a CAN-FD block register */ +static inline void pciefd_can_writereg(const struct pciefd_can *priv, + u32 val, u16 reg) +{ + writel(val, priv->reg_base + reg); +} + +/* give a channel logical Rx DMA address to the board */ +static void pciefd_can_setup_rx_dma(struct pciefd_can *priv) +{ +#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT + const u32 dma_addr_h = (u32)(priv->rx_dma_laddr >> 32); +#else + const u32 dma_addr_h = 0; +#endif + + /* (DMA must be reset for Rx) */ + pciefd_can_writereg(priv, CANFD_CTL_RST_BIT, PCIEFD_REG_CAN_RX_CTL_SET); + + /* write the logical address of the Rx DMA area for this channel */ + pciefd_can_writereg(priv, (u32)priv->rx_dma_laddr, + PCIEFD_REG_CAN_RX_DMA_ADDR_L); + pciefd_can_writereg(priv, dma_addr_h, PCIEFD_REG_CAN_RX_DMA_ADDR_H); + + /* also indicates that Rx DMA is cacheable */ + pciefd_can_writereg(priv, CANFD_CTL_UNC_BIT, PCIEFD_REG_CAN_RX_CTL_CLR); +} + +/* clear channel logical Rx DMA address from the board */ +static void pciefd_can_clear_rx_dma(struct pciefd_can *priv) +{ + /* DMA must be reset for Rx */ + pciefd_can_writereg(priv, CANFD_CTL_RST_BIT, PCIEFD_REG_CAN_RX_CTL_SET); + + /* clear the logical address of the Rx DMA area for this channel */ + pciefd_can_writereg(priv, 0, PCIEFD_REG_CAN_RX_DMA_ADDR_L); + pciefd_can_writereg(priv, 0, PCIEFD_REG_CAN_RX_DMA_ADDR_H); +} + +/* give a channel logical Tx DMA address to the board */ +static void pciefd_can_setup_tx_dma(struct pciefd_can *priv) +{ +#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT + const u32 dma_addr_h = (u32)(priv->tx_dma_laddr >> 32); +#else + const u32 dma_addr_h = 0; +#endif + + /* (DMA must be reset for Tx) */ + pciefd_can_writereg(priv, CANFD_CTL_RST_BIT, PCIEFD_REG_CAN_TX_CTL_SET); + + /* write the logical address of the Tx DMA area for this channel */ + pciefd_can_writereg(priv, (u32)priv->tx_dma_laddr, + PCIEFD_REG_CAN_TX_DMA_ADDR_L); + pciefd_can_writereg(priv, dma_addr_h, PCIEFD_REG_CAN_TX_DMA_ADDR_H); + + /* also indicates that Tx DMA is cacheable */ + pciefd_can_writereg(priv, CANFD_CTL_UNC_BIT, PCIEFD_REG_CAN_TX_CTL_CLR); +} + +/* clear channel logical Tx DMA address from the board */ +static void pciefd_can_clear_tx_dma(struct pciefd_can *priv) +{ + /* DMA must be reset for Tx */ + pciefd_can_writereg(priv, CANFD_CTL_RST_BIT, PCIEFD_REG_CAN_TX_CTL_SET); + + /* clear the logical address of the Tx DMA area for this channel */ + pciefd_can_writereg(priv, 0, PCIEFD_REG_CAN_TX_DMA_ADDR_L); + pciefd_can_writereg(priv, 0, PCIEFD_REG_CAN_TX_DMA_ADDR_H); +} + +static void pciefd_can_ack_rx_dma(struct pciefd_can *priv) +{ + /* read value of current IRQ tag and inc it for next one */ + priv->irq_tag = le32_to_cpu(*(__le32 *)priv->rx_dma_vaddr); + priv->irq_tag++; + priv->irq_tag &= 0xf; + + /* write the next IRQ tag for this CAN */ + 
pciefd_can_writereg(priv, priv->irq_tag, PCIEFD_REG_CAN_RX_CTL_ACK); +} + +/* IRQ handler */ +static irqreturn_t pciefd_irq_handler(int irq, void *arg) +{ + struct pciefd_can *priv = arg; + struct pciefd_rx_dma *rx_dma = priv->rx_dma_vaddr; + + /* INTA mode only to sync with PCIe transaction */ + if (!pci_dev_msi_enabled(priv->board->pci_dev)) + (void)pciefd_sys_readreg(priv->board, PCIEFD_REG_SYS_VER1); + + /* read IRQ status from the first 32-bits of the Rx DMA area */ + priv->irq_status = le32_to_cpu(rx_dma->irq_status); + + /* check if this (shared) IRQ is for this CAN */ + if (pciefd_irq_tag(priv->irq_status) != priv->irq_tag) + return IRQ_NONE; + + /* handle rx messages (if any) */ + peak_canfd_handle_msgs_list(&priv->ucan, + rx_dma->msg, + pciefd_irq_rx_cnt(priv->irq_status)); + + /* handle tx link interrupt (if any) */ + if (pciefd_irq_is_lnk(priv->irq_status)) { + unsigned long flags; + + spin_lock_irqsave(&priv->tx_lock, flags); + priv->tx_pages_free++; + spin_unlock_irqrestore(&priv->tx_lock, flags); + + /* wake producer up */ + netif_wake_queue(priv->ucan.ndev); + } + + /* re-enable Rx DMA transfer for this CAN */ + pciefd_can_ack_rx_dma(priv); + + return IRQ_HANDLED; +} + +static int pciefd_enable_tx_path(struct peak_canfd_priv *ucan) +{ + struct pciefd_can *priv = (struct pciefd_can *)ucan; + int i; + + /* initialize the Tx pages descriptors */ + priv->tx_pages_free = PCIEFD_TX_PAGE_COUNT - 1; + priv->tx_page_index = 0; + + priv->tx_pages[0].vbase = priv->tx_dma_vaddr; + priv->tx_pages[0].lbase = priv->tx_dma_laddr; + + for (i = 0; i < PCIEFD_TX_PAGE_COUNT; i++) { + priv->tx_pages[i].offset = 0; + priv->tx_pages[i].size = PCIEFD_TX_PAGE_SIZE - + sizeof(struct pciefd_tx_link); + if (i) { + priv->tx_pages[i].vbase = + priv->tx_pages[i - 1].vbase + + PCIEFD_TX_PAGE_SIZE; + priv->tx_pages[i].lbase = + priv->tx_pages[i - 1].lbase + + PCIEFD_TX_PAGE_SIZE; + } + } + + /* setup Tx DMA addresses into IP core */ + pciefd_can_setup_tx_dma(priv); + + /* start (TX_RST=0) Tx Path */ + pciefd_can_writereg(priv, CANFD_CTL_RST_BIT, PCIEFD_REG_CAN_TX_CTL_CLR); + + return 0; +} + +/* board specific CANFD command pre-processing */ +static int pciefd_pre_cmd(struct peak_canfd_priv *ucan) +{ + struct pciefd_can *priv = (struct pciefd_can *)ucan; + u16 cmd = pucan_cmd_get_opcode(&priv->pucan_cmd); + int err; + + /* pre-process command */ + switch (cmd) { + case PUCAN_CMD_NORMAL_MODE: + case PUCAN_CMD_LISTEN_ONLY_MODE: + + if (ucan->can.state == CAN_STATE_BUS_OFF) + break; + + /* going into operational mode: setup IRQ handler */ + err = request_irq(priv->board->pci_dev->irq, + pciefd_irq_handler, + IRQF_SHARED, + PCIEFD_DRV_NAME, + priv); + if (err) + return err; + + /* setup Rx DMA address */ + pciefd_can_setup_rx_dma(priv); + + /* setup max count of msgs per IRQ */ + pciefd_can_writereg(priv, (CANFD_CTL_IRQ_TL_DEF) << 8 | + CANFD_CTL_IRQ_CL_DEF, + PCIEFD_REG_CAN_RX_CTL_WRT); + + /* clear DMA RST for Rx (Rx start) */ + pciefd_can_writereg(priv, CANFD_CTL_RST_BIT, + PCIEFD_REG_CAN_RX_CTL_CLR); + + /* reset timestamps */ + pciefd_can_writereg(priv, !CANFD_MISC_TS_RST, + PCIEFD_REG_CAN_MISC); + + /* do an initial ACK */ + pciefd_can_ack_rx_dma(priv); + + /* enable IRQ for this CAN after having set next irq_tag */ + pciefd_can_writereg(priv, CANFD_CTL_IEN_BIT, + PCIEFD_REG_CAN_RX_CTL_SET); + + /* Tx path will be setup as soon as RX_BARRIER is received */ + break; + default: + break; + } + + return 0; +} + +/* write a command */ +static int pciefd_write_cmd(struct peak_canfd_priv *ucan) +{ + struct 
pciefd_can *priv = (struct pciefd_can *)ucan; + unsigned long flags; + + /* 64-bits command is atomic */ + spin_lock_irqsave(&priv->board->cmd_lock, flags); + + pciefd_can_writereg(priv, *(u32 *)ucan->cmd_buffer, + PCIEFD_REG_CAN_CMD_PORT_L); + pciefd_can_writereg(priv, *(u32 *)(ucan->cmd_buffer + 4), + PCIEFD_REG_CAN_CMD_PORT_H); + + spin_unlock_irqrestore(&priv->board->cmd_lock, flags); + + return 0; +} + +/* board specific CANFD command post-processing */ +static int pciefd_post_cmd(struct peak_canfd_priv *ucan) +{ + struct pciefd_can *priv = (struct pciefd_can *)ucan; + u16 cmd = pucan_cmd_get_opcode(&priv->pucan_cmd); + + switch (cmd) { + case PUCAN_CMD_RESET_MODE: + + if (ucan->can.state == CAN_STATE_STOPPED) + break; + + /* controller now in reset mode: */ + + /* stop and reset DMA addresses in Tx/Rx engines */ + pciefd_can_clear_tx_dma(priv); + pciefd_can_clear_rx_dma(priv); + + /* disable IRQ for this CAN */ + pciefd_can_writereg(priv, CANFD_CTL_IEN_BIT, + PCIEFD_REG_CAN_RX_CTL_CLR); + + free_irq(priv->board->pci_dev->irq, priv); + + ucan->can.state = CAN_STATE_STOPPED; + + break; + } + + return 0; +} + +static void *pciefd_alloc_tx_msg(struct peak_canfd_priv *ucan, u16 msg_size, + int *room_left) +{ + struct pciefd_can *priv = (struct pciefd_can *)ucan; + struct pciefd_page *page = priv->tx_pages + priv->tx_page_index; + unsigned long flags; + void *msg; + + spin_lock_irqsave(&priv->tx_lock, flags); + + if (page->offset + msg_size > page->size) { + struct pciefd_tx_link *lk; + + /* not enough space in this page: try another one */ + if (!priv->tx_pages_free) { + spin_unlock_irqrestore(&priv->tx_lock, flags); + + /* Tx overflow */ + return NULL; + } + + priv->tx_pages_free--; + + /* keep address of the very last free slot of current page */ + lk = page->vbase + page->offset; + + /* next, move on to a new free page */ + priv->tx_page_index = (priv->tx_page_index + 1) % + PCIEFD_TX_PAGE_COUNT; + page = priv->tx_pages + priv->tx_page_index; + + /* put link record to this new page at the end of prev one */ + lk->size = cpu_to_le16(sizeof(*lk)); + lk->type = cpu_to_le16(CANFD_MSG_LNK_TX); + lk->laddr_lo = cpu_to_le32(page->lbase); + +#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT + lk->laddr_hi = cpu_to_le32(page->lbase >> 32); +#else + lk->laddr_hi = 0; +#endif + /* next msgs will be put from the beginning of this new page */ + page->offset = 0; + } + + *room_left = priv->tx_pages_free * page->size; + + spin_unlock_irqrestore(&priv->tx_lock, flags); + + msg = page->vbase + page->offset; + + /* give back room left in the tx ring */ + *room_left += page->size - (page->offset + msg_size); + + return msg; +} + +static int pciefd_write_tx_msg(struct peak_canfd_priv *ucan, + struct pucan_tx_msg *msg) +{ + struct pciefd_can *priv = (struct pciefd_can *)ucan; + struct pciefd_page *page = priv->tx_pages + priv->tx_page_index; + + /* this slot is now reserved for writing the frame */ + page->offset += le16_to_cpu(msg->size); + + /* tell the board a frame has been written in Tx DMA area */ + pciefd_can_writereg(priv, 1, PCIEFD_REG_CAN_TX_REQ_ACC); + + return 0; +} + +/* probe for CAN-FD channel #pciefd_board->can_count */ +static int pciefd_can_probe(struct pciefd_board *pciefd) +{ + struct net_device *ndev; + struct pciefd_can *priv; + u32 clk; + int err; + + /* allocate the candev object with default size of echo skbs ring */ + ndev = alloc_peak_canfd_dev(sizeof(*priv), pciefd->can_count, + PCIEFD_ECHO_SKB_MAX); + if (!ndev) { + dev_err(&pciefd->pci_dev->dev, + "failed to alloc candev object\n"); + goto 
failure; + } + + priv = netdev_priv(ndev); + + /* fill-in candev private object: */ + + /* setup PCIe-FD own callbacks */ + priv->ucan.pre_cmd = pciefd_pre_cmd; + priv->ucan.write_cmd = pciefd_write_cmd; + priv->ucan.post_cmd = pciefd_post_cmd; + priv->ucan.enable_tx_path = pciefd_enable_tx_path; + priv->ucan.alloc_tx_msg = pciefd_alloc_tx_msg; + priv->ucan.write_tx_msg = pciefd_write_tx_msg; + + /* setup PCIe-FD own command buffer */ + priv->ucan.cmd_buffer = &priv->pucan_cmd; + priv->ucan.cmd_maxlen = sizeof(priv->pucan_cmd); + + priv->board = pciefd; + + /* CAN config regs block address */ + priv->reg_base = pciefd->reg_base + PCIEFD_CANX_OFF(priv->ucan.index); + + /* allocate non-cacheable DMA'able 4KB memory area for Rx */ + priv->rx_dma_vaddr = dmam_alloc_coherent(&pciefd->pci_dev->dev, + PCIEFD_RX_DMA_SIZE, + &priv->rx_dma_laddr, + GFP_KERNEL); + if (!priv->rx_dma_vaddr) { + dev_err(&pciefd->pci_dev->dev, + "Rx dmam_alloc_coherent(%u) failure\n", + PCIEFD_RX_DMA_SIZE); + goto err_free_candev; + } + + /* allocate non-cacheable DMA'able 4KB memory area for Tx */ + priv->tx_dma_vaddr = dmam_alloc_coherent(&pciefd->pci_dev->dev, + PCIEFD_TX_DMA_SIZE, + &priv->tx_dma_laddr, + GFP_KERNEL); + if (!priv->tx_dma_vaddr) { + dev_err(&pciefd->pci_dev->dev, + "Tx dmam_alloc_coherent(%u) failure\n", + PCIEFD_TX_DMA_SIZE); + goto err_free_candev; + } + + /* CAN clock in RST mode */ + pciefd_can_writereg(priv, CANFD_MISC_TS_RST, PCIEFD_REG_CAN_MISC); + + /* read current clock value */ + clk = pciefd_can_readreg(priv, PCIEFD_REG_CAN_CLK_SEL); + switch (clk) { + case CANFD_CLK_SEL_20MHZ: + priv->ucan.can.clock.freq = 20 * 1000 * 1000; + break; + case CANFD_CLK_SEL_24MHZ: + priv->ucan.can.clock.freq = 24 * 1000 * 1000; + break; + case CANFD_CLK_SEL_30MHZ: + priv->ucan.can.clock.freq = 30 * 1000 * 1000; + break; + case CANFD_CLK_SEL_40MHZ: + priv->ucan.can.clock.freq = 40 * 1000 * 1000; + break; + case CANFD_CLK_SEL_60MHZ: + priv->ucan.can.clock.freq = 60 * 1000 * 1000; + break; + default: + pciefd_can_writereg(priv, CANFD_CLK_SEL_80MHZ, + PCIEFD_REG_CAN_CLK_SEL); + + /* fall through */ + case CANFD_CLK_SEL_80MHZ: + priv->ucan.can.clock.freq = 80 * 1000 * 1000; + break; + } + + ndev->irq = pciefd->pci_dev->irq; + + SET_NETDEV_DEV(ndev, &pciefd->pci_dev->dev); + + err = register_candev(ndev); + if (err) { + dev_err(&pciefd->pci_dev->dev, + "couldn't register CAN device: %d\n", err); + goto err_free_candev; + } + + spin_lock_init(&priv->tx_lock); + + /* save the object address in the board structure */ + pciefd->can[pciefd->can_count] = priv; + + dev_info(&pciefd->pci_dev->dev, "%s at reg_base=0x%p irq=%d\n", + ndev->name, priv->reg_base, pciefd->pci_dev->irq); + + return 0; + +err_free_candev: + free_candev(ndev); + +failure: + return -ENOMEM; +} + +/* remove a CAN-FD channel by releasing all of its resources */ +static void pciefd_can_remove(struct pciefd_can *priv) +{ + /* unregister (close) the can device to go back to RST mode first */ + unregister_candev(priv->ucan.ndev); + + /* finally, free the candev object */ + free_candev(priv->ucan.ndev); +} + +/* remove all CAN-FD channels by releasing their own resources */ +static void pciefd_can_remove_all(struct pciefd_board *pciefd) +{ + while (pciefd->can_count > 0) + pciefd_can_remove(pciefd->can[--pciefd->can_count]); +} + +/* probe for the entire device */ +static int peak_pciefd_probe(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + struct pciefd_board *pciefd; + int err, can_count; + u16 sub_sys_id; + u8 hw_ver_major; + u8 
hw_ver_minor; + u8 hw_ver_sub; + u32 v2; + + err = pci_enable_device(pdev); + if (err) + return err; + err = pci_request_regions(pdev, PCIEFD_DRV_NAME); + if (err) + goto err_disable_pci; + + /* the number of channels depends on sub-system id */ + err = pci_read_config_word(pdev, PCI_SUBSYSTEM_ID, &sub_sys_id); + if (err) + goto err_release_regions; + + dev_dbg(&pdev->dev, "probing device %04x:%04x:%04x\n", + pdev->vendor, pdev->device, sub_sys_id); + + if (sub_sys_id >= 0x0012) + can_count = 4; + else if (sub_sys_id >= 0x0010) + can_count = 3; + else if (sub_sys_id >= 0x0004) + can_count = 2; + else + can_count = 1; + + /* allocate board structure object */ + pciefd = devm_kzalloc(&pdev->dev, sizeof(*pciefd) + + can_count * sizeof(*pciefd->can), + GFP_KERNEL); + if (!pciefd) { + err = -ENOMEM; + goto err_release_regions; + } + + /* initialize the board structure */ + pciefd->pci_dev = pdev; + spin_lock_init(&pciefd->cmd_lock); + + /* save the PCI BAR0 virtual address for further system regs access */ + pciefd->reg_base = pci_iomap(pdev, 0, PCIEFD_BAR0_SIZE); + if (!pciefd->reg_base) { + dev_err(&pdev->dev, "failed to map PCI resource #0\n"); + err = -ENOMEM; + goto err_release_regions; + } + + /* read the firmware version number */ + v2 = pciefd_sys_readreg(pciefd, PCIEFD_REG_SYS_VER2); + + hw_ver_major = (v2 & 0x0000f000) >> 12; + hw_ver_minor = (v2 & 0x00000f00) >> 8; + hw_ver_sub = (v2 & 0x000000f0) >> 4; + + dev_info(&pdev->dev, + "%ux CAN-FD PCAN-PCIe FPGA v%u.%u.%u:\n", can_count, + hw_ver_major, hw_ver_minor, hw_ver_sub); + + /* stop system clock */ + pciefd_sys_writereg(pciefd, PCIEFD_SYS_CTL_CLK_EN, + PCIEFD_REG_SYS_CTL_CLR); + + pci_set_master(pdev); + + /* create now the corresponding channels objects */ + while (pciefd->can_count < can_count) { + err = pciefd_can_probe(pciefd); + if (err) + goto err_free_canfd; + + pciefd->can_count++; + } + + /* set system timestamps counter in RST mode */ + pciefd_sys_writereg(pciefd, PCIEFD_SYS_CTL_TS_RST, + PCIEFD_REG_SYS_CTL_SET); + + /* wait a bit (read cycle) */ + (void)pciefd_sys_readreg(pciefd, PCIEFD_REG_SYS_VER1); + + /* free all clocks */ + pciefd_sys_writereg(pciefd, PCIEFD_SYS_CTL_TS_RST, + PCIEFD_REG_SYS_CTL_CLR); + + /* start system clock */ + pciefd_sys_writereg(pciefd, PCIEFD_SYS_CTL_CLK_EN, + PCIEFD_REG_SYS_CTL_SET); + + /* remember the board structure address in the device user data */ + pci_set_drvdata(pdev, pciefd); + + return 0; + +err_free_canfd: + pciefd_can_remove_all(pciefd); + + pci_iounmap(pdev, pciefd->reg_base); + +err_release_regions: + pci_release_regions(pdev); + +err_disable_pci: + pci_disable_device(pdev); + + return err; +} + +/* free the board structure object, as well as its resources: */ +static void peak_pciefd_remove(struct pci_dev *pdev) +{ + struct pciefd_board *pciefd = pci_get_drvdata(pdev); + + /* release CAN-FD channels resources */ + pciefd_can_remove_all(pciefd); + + pci_iounmap(pdev, pciefd->reg_base); + + pci_release_regions(pdev); + pci_disable_device(pdev); +} + +static struct pci_driver peak_pciefd_driver = { + .name = PCIEFD_DRV_NAME, + .id_table = peak_pciefd_tbl, + .probe = peak_pciefd_probe, + .remove = peak_pciefd_remove, +}; + +module_pci_driver(peak_pciefd_driver); diff --git a/include/linux/can/dev/peak_canfd.h b/include/linux/can/dev/peak_canfd.h index 25e20ef2fef8..46dceef2cfa6 100644 --- a/include/linux/can/dev/peak_canfd.h +++ b/include/linux/can/dev/peak_canfd.h @@ -23,11 +23,14 @@ #define PUCAN_CMD_LISTEN_ONLY_MODE 0x003 #define PUCAN_CMD_TIMING_SLOW 0x004 #define 
PUCAN_CMD_TIMING_FAST 0x005 +#define PUCAN_CMD_SET_STD_FILTER 0x006 +#define PUCAN_CMD_RESERVED2 0x007 #define PUCAN_CMD_FILTER_STD 0x008 #define PUCAN_CMD_TX_ABORT 0x009 #define PUCAN_CMD_WR_ERR_CNT 0x00a #define PUCAN_CMD_SET_EN_OPTION 0x00b #define PUCAN_CMD_CLR_DIS_OPTION 0x00c +#define PUCAN_CMD_RX_BARRIER 0x010 #define PUCAN_CMD_END_OF_COLLECTION 0x3ff /* uCAN received messages list */ @@ -35,6 +38,10 @@ #define PUCAN_MSG_ERROR 0x0002 #define PUCAN_MSG_STATUS 0x0003 #define PUCAN_MSG_BUSLOAD 0x0004 + +#define PUCAN_MSG_CACHE_CRITICAL 0x0102 + +/* uCAN transmitted messages */ #define PUCAN_MSG_CAN_TX 0x1000 /* uCAN command common header */ @@ -43,6 +50,12 @@ struct __packed pucan_command { u16 args[3]; }; +/* return the opcode from the opcode_channel field of a command */ +static inline u16 pucan_cmd_get_opcode(struct pucan_command *c) +{ + return le16_to_cpu(c->opcode_channel) & 0x3ff; +} + #define PUCAN_TSLOW_BRP_BITS 10 #define PUCAN_TSLOW_TSGEG1_BITS 8 #define PUCAN_TSLOW_TSGEG2_BITS 7 @@ -108,6 +121,27 @@ struct __packed pucan_filter_std { __le32 mask; /* CAN-ID bitmask in idx range */ }; +#define PUCAN_FLTSTD_ROW_IDX_MAX ((1 << PUCAN_FLTSTD_ROW_IDX_BITS) - 1) + +/* uCAN SET_STD_FILTER command fields */ +struct __packed pucan_std_filter { + __le16 opcode_channel; + + u8 unused; + u8 idx; + __le32 mask; /* CAN-ID bitmask in idx range */ +}; + +/* uCAN TX_ABORT commands fields */ +#define PUCAN_TX_ABORT_FLUSH 0x0001 + +struct __packed pucan_tx_abort { + __le16 opcode_channel; + + __le16 flags; + u32 unused; +}; + /* uCAN WR_ERR_CNT command fields */ #define PUCAN_WRERRCNT_TE 0x4000 /* Tx error cntr write Enable */ #define PUCAN_WRERRCNT_RE 0x8000 /* Rx error cntr write Enable */ @@ -184,6 +218,12 @@ struct __packed pucan_error_msg { u8 rx_err_cnt; }; +static inline int pucan_error_get_channel(const struct pucan_error_msg *msg) +{ + return msg->channel_type_d & 0x0f; +} + +#define PUCAN_RX_BARRIER 0x10 #define PUCAN_BUS_PASSIVE 0x20 #define PUCAN_BUS_WARNING 0x40 #define PUCAN_BUS_BUSOFF 0x80 @@ -197,6 +237,31 @@ struct __packed pucan_status_msg { u8 unused[3]; }; +static inline int pucan_status_get_channel(const struct pucan_status_msg *msg) +{ + return msg->channel_p_w_b & 0x0f; +} + +static inline int pucan_status_is_rx_barrier(const struct pucan_status_msg *msg) +{ + return msg->channel_p_w_b & PUCAN_RX_BARRIER; +} + +static inline int pucan_status_is_passive(const struct pucan_status_msg *msg) +{ + return msg->channel_p_w_b & PUCAN_BUS_PASSIVE; +} + +static inline int pucan_status_is_warning(const struct pucan_status_msg *msg) +{ + return msg->channel_p_w_b & PUCAN_BUS_WARNING; +} + +static inline int pucan_status_is_busoff(const struct pucan_status_msg *msg) +{ + return msg->channel_p_w_b & PUCAN_BUS_BUSOFF; +} + /* uCAN transmitted message format */ #define PUCAN_MSG_CHANNEL_DLC(c, d) (((c) & 0xf) | ((d) << 4)) -- cgit v1.2.3 From cb5635a3677679666e4e81ecbb209d32f13dedcd Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 25 Apr 2017 08:19:41 +0200 Subject: can: complete initial namespace support The statistics and its proc output was not implemented as per-net in the initial network namespace support by Mario Kicherer (8e8cda6d737d). This patch adds the missing per-net statistics for the CAN subsystem. 
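The conversion below replaces the old file-scope can_stats/can_pstats
globals with per-net pointers hung off struct netns_can, so every reader
and writer now names the namespace explicitly. A minimal sketch of the
resulting access pattern (the helper can_stats_snapshot() is hypothetical;
struct s_stats, struct s_pstats and the net->can members are taken from
the diff below):

static void can_stats_snapshot(struct net *net)
{
	/* per-net counters: one instance of these exists per namespace */
	struct s_stats *stats = net->can.can_stats;
	struct s_pstats *pstats = net->can.can_pstats;

	pr_info("rx=%lu tx=%lu matches=%lu rcv_entries=%lu\n",
		stats->rx_frames, stats->tx_frames, stats->matches,
		pstats->rcv_entries);
}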
Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/linux/can/core.h | 4 +- include/net/netns/can.h | 5 ++ net/can/af_can.c | 71 +++++++++++++----------- net/can/af_can.h | 5 -- net/can/proc.c | 141 +++++++++++++++++++++++++---------------------- 5 files changed, 121 insertions(+), 105 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/core.h b/include/linux/can/core.h index 319a0da827b8..c9a17bb1221c 100644 --- a/include/linux/can/core.h +++ b/include/linux/can/core.h @@ -5,7 +5,7 @@ * * Authors: Oliver Hartkopp * Urs Thuermann - * Copyright (c) 2002-2007 Volkswagen Group Electronic Research + * Copyright (c) 2002-2017 Volkswagen Group Electronic Research * All rights reserved. * */ @@ -17,7 +17,7 @@ #include #include -#define CAN_VERSION "20120528" +#define CAN_VERSION "20170425" /* increment this number each time you change some user-space interface */ #define CAN_ABI_VERSION "9" diff --git a/include/net/netns/can.h b/include/net/netns/can.h index e8beba772f1a..574157dbc43a 100644 --- a/include/net/netns/can.h +++ b/include/net/netns/can.h @@ -8,6 +8,8 @@ #include struct dev_rcv_lists; +struct s_stats; +struct s_pstats; struct netns_can { #if IS_ENABLED(CONFIG_PROC_FS) @@ -26,6 +28,9 @@ struct netns_can { /* receive filters subscribed for 'all' CAN devices */ struct dev_rcv_lists *can_rx_alldev_list; spinlock_t can_rcvlists_lock; + struct timer_list can_stattimer;/* timer for statistics update */ + struct s_stats *can_stats; /* packet statistics */ + struct s_pstats *can_pstats; /* receive list statistics */ }; #endif /* __NETNS_CAN_H__ */ diff --git a/net/can/af_can.c b/net/can/af_can.c index 421b60fc42c3..b6406fe33c76 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -2,7 +2,7 @@ * af_can.c - Protocol family CAN core module * (used by different CAN protocol modules) * - * Copyright (c) 2002-2007 Volkswagen Group Electronic Research + * Copyright (c) 2002-2017 Volkswagen Group Electronic Research * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -81,10 +81,6 @@ static struct kmem_cache *rcv_cache __read_mostly; static const struct can_proto *proto_tab[CAN_NPROTO] __read_mostly; static DEFINE_MUTEX(proto_tab_lock); -struct timer_list can_stattimer; /* timer for statistics update */ -struct s_stats can_stats; /* packet statistics */ -struct s_pstats can_pstats; /* receive list statistics */ - static atomic_t skbcounter = ATOMIC_INIT(0); /* @@ -221,6 +217,7 @@ int can_send(struct sk_buff *skb, int loop) { struct sk_buff *newskb = NULL; struct canfd_frame *cfd = (struct canfd_frame *)skb->data; + struct s_stats *can_stats = dev_net(skb->dev)->can.can_stats; int err = -EINVAL; if (skb->len == CAN_MTU) { @@ -309,8 +306,8 @@ int can_send(struct sk_buff *skb, int loop) netif_rx_ni(newskb); /* update statistics */ - can_stats.tx_frames++; - can_stats.tx_frames_delta++; + can_stats->tx_frames++; + can_stats->tx_frames_delta++; return 0; @@ -468,6 +465,7 @@ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id, struct receiver *r; struct hlist_head *rl; struct dev_rcv_lists *d; + struct s_pstats *can_pstats = net->can.can_pstats; int err = 0; /* insert new receiver (dev,canid,mask) -> (func,data) */ @@ -499,9 +497,9 @@ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id, hlist_add_head_rcu(&r->list, rl); d->entries++; - can_pstats.rcv_entries++; - if (can_pstats.rcv_entries_max < can_pstats.rcv_entries) - can_pstats.rcv_entries_max = can_pstats.rcv_entries; + can_pstats->rcv_entries++; + if (can_pstats->rcv_entries_max < can_pstats->rcv_entries) + can_pstats->rcv_entries_max = can_pstats->rcv_entries; } else { kmem_cache_free(rcv_cache, r); err = -ENODEV; @@ -543,6 +541,7 @@ void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id, { struct receiver *r = NULL; struct hlist_head *rl; + struct s_pstats *can_pstats = net->can.can_pstats; struct dev_rcv_lists *d; if (dev && dev->type != ARPHRD_CAN) @@ -589,8 +588,8 @@ void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id, hlist_del_rcu(&r->list); d->entries--; - if (can_pstats.rcv_entries > 0) - can_pstats.rcv_entries--; + if (can_pstats->rcv_entries > 0) + can_pstats->rcv_entries--; /* remove device structure requested by NETDEV_UNREGISTER */ if (d->remove_on_zero_entries && !d->entries) { @@ -684,11 +683,13 @@ static int can_rcv_filter(struct dev_rcv_lists *d, struct sk_buff *skb) static void can_receive(struct sk_buff *skb, struct net_device *dev) { struct dev_rcv_lists *d; + struct net *net = dev_net(dev); + struct s_stats *can_stats = net->can.can_stats; int matches; /* update statistics */ - can_stats.rx_frames++; - can_stats.rx_frames_delta++; + can_stats->rx_frames++; + can_stats->rx_frames_delta++; /* create non-zero unique skb identifier together with *skb */ while (!(can_skb_prv(skb)->skbcnt)) @@ -697,10 +698,10 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev) rcu_read_lock(); /* deliver the packet to sockets listening on all devices */ - matches = can_rcv_filter(dev_net(dev)->can.can_rx_alldev_list, skb); + matches = can_rcv_filter(net->can.can_rx_alldev_list, skb); /* find receive list for this device */ - d = find_dev_rcv_lists(dev_net(dev), dev); + d = find_dev_rcv_lists(net, dev); if (d) matches += can_rcv_filter(d, skb); @@ -710,8 +711,8 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev) consume_skb(skb); if (matches > 0) { - can_stats.matches++; - 
can_stats.matches_delta++; + can_stats->matches++; + can_stats->matches_delta++; } } @@ -876,8 +877,20 @@ static int can_pernet_init(struct net *net) net->can.can_rx_alldev_list = kzalloc(sizeof(struct dev_rcv_lists), GFP_KERNEL); - if (IS_ENABLED(CONFIG_PROC_FS)) + net->can.can_stats = kzalloc(sizeof(struct s_stats), GFP_KERNEL); + net->can.can_pstats = kzalloc(sizeof(struct s_pstats), GFP_KERNEL); + + if (IS_ENABLED(CONFIG_PROC_FS)) { + /* the statistics are updated every second (timer triggered) */ + if (stats_timer) { + setup_timer(&net->can.can_stattimer, can_stat_update, + (unsigned long)net); + mod_timer(&net->can.can_stattimer, + round_jiffies(jiffies + HZ)); + } + net->can.can_stats->jiffies_init = jiffies; can_init_proc(net); + } return 0; } @@ -886,8 +899,11 @@ static void can_pernet_exit(struct net *net) { struct net_device *dev; - if (IS_ENABLED(CONFIG_PROC_FS)) + if (IS_ENABLED(CONFIG_PROC_FS)) { can_remove_proc(net); + if (stats_timer) + del_timer_sync(&net->can.can_stattimer); + } /* remove created dev_rcv_lists from still registered CAN devices */ rcu_read_lock(); @@ -903,6 +919,8 @@ static void can_pernet_exit(struct net *net) rcu_read_unlock(); kfree(net->can.can_rx_alldev_list); + kfree(net->can.can_stats); + kfree(net->can.can_pstats); } /* @@ -950,14 +968,6 @@ static __init int can_init(void) if (!rcv_cache) return -ENOMEM; - if (IS_ENABLED(CONFIG_PROC_FS)) { - if (stats_timer) { - /* the statistics are updated every second (timer triggered) */ - setup_timer(&can_stattimer, can_stat_update, 0); - mod_timer(&can_stattimer, round_jiffies(jiffies + HZ)); - } - } - register_pernet_subsys(&can_pernet_ops); /* protocol register */ @@ -971,11 +981,6 @@ static __init int can_init(void) static __exit void can_exit(void) { - if (IS_ENABLED(CONFIG_PROC_FS)) { - if (stats_timer) - del_timer_sync(&can_stattimer); - } - /* protocol unregister */ dev_remove_pack(&canfd_packet); dev_remove_pack(&can_packet); diff --git a/net/can/af_can.h b/net/can/af_can.h index 84a35e97c5e0..d0ef45bb2a72 100644 --- a/net/can/af_can.h +++ b/net/can/af_can.h @@ -115,9 +115,4 @@ void can_init_proc(struct net *net); void can_remove_proc(struct net *net); void can_stat_update(unsigned long data); -/* structures and variables from af_can.c needed in proc.c for reading */ -extern struct timer_list can_stattimer; /* timer for statistics update */ -extern struct s_stats can_stats; /* packet statistics */ -extern struct s_pstats can_pstats; /* receive list statistics */ - #endif /* AF_CAN_H */ diff --git a/net/can/proc.c b/net/can/proc.c index 9a8d54d57b22..83045f00c63c 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -75,21 +75,23 @@ static const char rx_list_name[][8] = { * af_can statistics stuff */ -static void can_init_stats(void) +static void can_init_stats(struct net *net) { + struct s_stats *can_stats = net->can.can_stats; + struct s_pstats *can_pstats = net->can.can_pstats; /* * This memset function is called from a timer context (when * can_stattimer is active which is the default) OR in a process * context (reading the proc_fs when can_stattimer is disabled). 
*/ - memset(&can_stats, 0, sizeof(can_stats)); - can_stats.jiffies_init = jiffies; + memset(can_stats, 0, sizeof(struct s_stats)); + can_stats->jiffies_init = jiffies; - can_pstats.stats_reset++; + can_pstats->stats_reset++; if (user_reset) { user_reset = 0; - can_pstats.user_reset++; + can_pstats->user_reset++; } } @@ -115,64 +117,66 @@ static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif, void can_stat_update(unsigned long data) { + struct net *net = (struct net *)data; + struct s_stats *can_stats = net->can.can_stats; unsigned long j = jiffies; /* snapshot */ /* restart counting in timer context on user request */ if (user_reset) - can_init_stats(); + can_init_stats(net); /* restart counting on jiffies overflow */ - if (j < can_stats.jiffies_init) - can_init_stats(); + if (j < can_stats->jiffies_init) + can_init_stats(net); /* prevent overflow in calc_rate() */ - if (can_stats.rx_frames > (ULONG_MAX / HZ)) - can_init_stats(); + if (can_stats->rx_frames > (ULONG_MAX / HZ)) + can_init_stats(net); /* prevent overflow in calc_rate() */ - if (can_stats.tx_frames > (ULONG_MAX / HZ)) - can_init_stats(); + if (can_stats->tx_frames > (ULONG_MAX / HZ)) + can_init_stats(net); /* matches overflow - very improbable */ - if (can_stats.matches > (ULONG_MAX / 100)) - can_init_stats(); + if (can_stats->matches > (ULONG_MAX / 100)) + can_init_stats(net); /* calc total values */ - if (can_stats.rx_frames) - can_stats.total_rx_match_ratio = (can_stats.matches * 100) / - can_stats.rx_frames; + if (can_stats->rx_frames) + can_stats->total_rx_match_ratio = (can_stats->matches * 100) / + can_stats->rx_frames; - can_stats.total_tx_rate = calc_rate(can_stats.jiffies_init, j, - can_stats.tx_frames); - can_stats.total_rx_rate = calc_rate(can_stats.jiffies_init, j, - can_stats.rx_frames); + can_stats->total_tx_rate = calc_rate(can_stats->jiffies_init, j, + can_stats->tx_frames); + can_stats->total_rx_rate = calc_rate(can_stats->jiffies_init, j, + can_stats->rx_frames); /* calc current values */ - if (can_stats.rx_frames_delta) - can_stats.current_rx_match_ratio = - (can_stats.matches_delta * 100) / - can_stats.rx_frames_delta; + if (can_stats->rx_frames_delta) + can_stats->current_rx_match_ratio = + (can_stats->matches_delta * 100) / + can_stats->rx_frames_delta; - can_stats.current_tx_rate = calc_rate(0, HZ, can_stats.tx_frames_delta); - can_stats.current_rx_rate = calc_rate(0, HZ, can_stats.rx_frames_delta); + can_stats->current_tx_rate = calc_rate(0, HZ, can_stats->tx_frames_delta); + can_stats->current_rx_rate = calc_rate(0, HZ, can_stats->rx_frames_delta); /* check / update maximum values */ - if (can_stats.max_tx_rate < can_stats.current_tx_rate) - can_stats.max_tx_rate = can_stats.current_tx_rate; + if (can_stats->max_tx_rate < can_stats->current_tx_rate) + can_stats->max_tx_rate = can_stats->current_tx_rate; - if (can_stats.max_rx_rate < can_stats.current_rx_rate) - can_stats.max_rx_rate = can_stats.current_rx_rate; + if (can_stats->max_rx_rate < can_stats->current_rx_rate) + can_stats->max_rx_rate = can_stats->current_rx_rate; - if (can_stats.max_rx_match_ratio < can_stats.current_rx_match_ratio) - can_stats.max_rx_match_ratio = can_stats.current_rx_match_ratio; + if (can_stats->max_rx_match_ratio < can_stats->current_rx_match_ratio) + can_stats->max_rx_match_ratio = can_stats->current_rx_match_ratio; /* clear values for 'current rate' calculation */ - can_stats.tx_frames_delta = 0; - can_stats.rx_frames_delta = 0; - can_stats.matches_delta = 0; + can_stats->tx_frames_delta = 0; + 
can_stats->rx_frames_delta = 0; + can_stats->matches_delta = 0; /* restart timer (one second) */ - mod_timer(&can_stattimer, round_jiffies(jiffies + HZ)); + mod_timer(&net->can.can_stattimer, round_jiffies(jiffies + HZ)); } /* @@ -206,57 +210,61 @@ static void can_print_recv_banner(struct seq_file *m) static int can_stats_proc_show(struct seq_file *m, void *v) { + struct net *net = m->private; + struct s_stats *can_stats = net->can.can_stats; + struct s_pstats *can_pstats = net->can.can_pstats; + seq_putc(m, '\n'); - seq_printf(m, " %8ld transmitted frames (TXF)\n", can_stats.tx_frames); - seq_printf(m, " %8ld received frames (RXF)\n", can_stats.rx_frames); - seq_printf(m, " %8ld matched frames (RXMF)\n", can_stats.matches); + seq_printf(m, " %8ld transmitted frames (TXF)\n", can_stats->tx_frames); + seq_printf(m, " %8ld received frames (RXF)\n", can_stats->rx_frames); + seq_printf(m, " %8ld matched frames (RXMF)\n", can_stats->matches); seq_putc(m, '\n'); - if (can_stattimer.function == can_stat_update) { + if (net->can.can_stattimer.function == can_stat_update) { seq_printf(m, " %8ld %% total match ratio (RXMR)\n", - can_stats.total_rx_match_ratio); + can_stats->total_rx_match_ratio); seq_printf(m, " %8ld frames/s total tx rate (TXR)\n", - can_stats.total_tx_rate); + can_stats->total_tx_rate); seq_printf(m, " %8ld frames/s total rx rate (RXR)\n", - can_stats.total_rx_rate); + can_stats->total_rx_rate); seq_putc(m, '\n'); seq_printf(m, " %8ld %% current match ratio (CRXMR)\n", - can_stats.current_rx_match_ratio); + can_stats->current_rx_match_ratio); seq_printf(m, " %8ld frames/s current tx rate (CTXR)\n", - can_stats.current_tx_rate); + can_stats->current_tx_rate); seq_printf(m, " %8ld frames/s current rx rate (CRXR)\n", - can_stats.current_rx_rate); + can_stats->current_rx_rate); seq_putc(m, '\n'); seq_printf(m, " %8ld %% max match ratio (MRXMR)\n", - can_stats.max_rx_match_ratio); + can_stats->max_rx_match_ratio); seq_printf(m, " %8ld frames/s max tx rate (MTXR)\n", - can_stats.max_tx_rate); + can_stats->max_tx_rate); seq_printf(m, " %8ld frames/s max rx rate (MRXR)\n", - can_stats.max_rx_rate); + can_stats->max_rx_rate); seq_putc(m, '\n'); } seq_printf(m, " %8ld current receive list entries (CRCV)\n", - can_pstats.rcv_entries); + can_pstats->rcv_entries); seq_printf(m, " %8ld maximum receive list entries (MRCV)\n", - can_pstats.rcv_entries_max); + can_pstats->rcv_entries_max); - if (can_pstats.stats_reset) + if (can_pstats->stats_reset) seq_printf(m, "\n %8ld statistic resets (STR)\n", - can_pstats.stats_reset); + can_pstats->stats_reset); - if (can_pstats.user_reset) + if (can_pstats->user_reset) seq_printf(m, " %8ld user statistic resets (USTR)\n", - can_pstats.user_reset); + can_pstats->user_reset); seq_putc(m, '\n'); return 0; @@ -264,7 +272,7 @@ static int can_stats_proc_show(struct seq_file *m, void *v) static int can_stats_proc_open(struct inode *inode, struct file *file) { - return single_open(file, can_stats_proc_show, NULL); + return single_open_net(inode, file, can_stats_proc_show); } static const struct file_operations can_stats_proc_fops = { @@ -277,25 +285,28 @@ static const struct file_operations can_stats_proc_fops = { static int can_reset_stats_proc_show(struct seq_file *m, void *v) { + struct net *net = m->private; + struct s_pstats *can_pstats = net->can.can_pstats; + struct s_stats *can_stats = net->can.can_stats; + user_reset = 1; - if (can_stattimer.function == can_stat_update) { + if (net->can.can_stattimer.function == can_stat_update) { seq_printf(m, "Scheduled 
statistic reset #%ld.\n",
-			   can_pstats.stats_reset + 1);
-
+			   can_pstats->stats_reset + 1);
 	} else {
-		if (can_stats.jiffies_init != jiffies)
-			can_init_stats();
+		if (can_stats->jiffies_init != jiffies)
+			can_init_stats(net);
 
 		seq_printf(m, "Performed statistic reset #%ld.\n",
-			   can_pstats.stats_reset);
+			   can_pstats->stats_reset);
 	}
 	return 0;
 }
 
 static int can_reset_stats_proc_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, can_reset_stats_proc_show, NULL);
+	return single_open_net(inode, file, can_reset_stats_proc_show);
 }
 
 static const struct file_operations can_reset_stats_proc_fops = {
@@ -314,7 +325,7 @@ static int can_version_proc_show(struct seq_file *m, void *v)
 
 static int can_version_proc_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, can_version_proc_show, NULL);
+	return single_open_net(inode, file, can_version_proc_show);
 }
 
 static const struct file_operations can_version_proc_fops = {
-- 
cgit v1.2.3


From f107d7a43923a83d837b3ea3c7b7de58cd014bbd Mon Sep 17 00:00:00 2001
From: Boris Brezillon
Date: Thu, 16 Mar 2017 09:02:42 +0100
Subject: mtd: nand: Remove unused chip->write_page() hook

The last/only user of the chip->write_page() hook (the Atmel NAND
controller driver) has been reworked and is no longer specifying a
custom ->write_page() implementation.

Drop this hook before someone else starts abusing it.

Signed-off-by: Boris Brezillon
Reviewed-by: Masahiro Yamada
---
 drivers/mtd/nand/nand_base.c | 12 +++++-------
 include/linux/mtd/nand.h     |  4 ----
 2 files changed, 5 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index c4c3386e4c0a..36258e69a389 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -2633,7 +2633,7 @@ static int nand_write_page_syndrome(struct mtd_info *mtd,
 }
 
 /**
- * nand_write_page - [REPLACEABLE] write one page
+ * nand_write_page - write one page
  * @mtd: MTD device structure
 * @chip: NAND chip descriptor
 * @offset: address offset within the page
@@ -2836,9 +2836,10 @@ static int nand_do_write_ops(struct mtd_info *mtd, loff_t to,
 			/* We still need to erase leftover OOB data */
 			memset(chip->oob_poi, 0xff, mtd->oobsize);
 		}
-		ret = chip->write_page(mtd, chip, column, bytes, wbuf,
-					oob_required, page, cached,
-					(ops->mode == MTD_OPS_RAW));
+
+		ret = nand_write_page(mtd, chip, column, bytes, wbuf,
+				      oob_required, page, cached,
+				      (ops->mode == MTD_OPS_RAW));
 		if (ret)
 			break;
 
@@ -4548,9 +4549,6 @@ int nand_scan_tail(struct mtd_info *mtd)
 		}
 	}
 
-	if (!chip->write_page)
-		chip->write_page = nand_write_page;
-
 	/*
	 * Check ECC mode, default to software if 3byte/512byte hardware ECC is
	 * selected and we have 256 byte pagesize fallback to software ECC
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index c7de017c7f4c..40657939797c 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -828,7 +828,6 @@ struct nand_manufacturer_ops {
 * @errstat:		[OPTIONAL] hardware specific function to perform
 *			additional error status checks (determine if errors are
 *			correctable).
- * @write_page:		[REPLACEABLE] High-level page write function
 * @manufacturer:	[INTERN] Contains manufacturer information
 */
@@ -854,9 +853,6 @@ struct nand_chip {
 	int (*scan_bbt)(struct mtd_info *mtd);
 	int (*errstat)(struct mtd_info *mtd, struct nand_chip *this, int state,
 			int status, int page);
-	int (*write_page)(struct mtd_info *mtd, struct nand_chip *chip,
-			uint32_t offset, int data_len, const uint8_t *buf,
-			int oob_required, int page, int cached, int raw);
 	int (*onfi_set_features)(struct mtd_info *mtd, struct nand_chip *chip,
 			int feature_addr, uint8_t *subfeature_para);
 	int (*onfi_get_features)(struct mtd_info *mtd, struct nand_chip *chip,
-- 
cgit v1.2.3


From 07604686e808cd93d352172806a7828860f048f5 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada
Date: Thu, 30 Mar 2017 15:45:47 +0900
Subject: mtd: nand: relax ecc.read_page() return value for uncorrectable ECC

The comment for ecc.read_page() requires that it should return
"0 if bitflips uncorrectable".

Actually, drivers could return positive values when uncorrectable
bitflips occur. For example, nand_read_page_swecc() is the case.
If ecc.correct() returns -EBADMSG for the first ECC sector, and a
positive value for the second one, nand_read_page_swecc() returns a
positive max_bitflips and increments ecc_stats.failed for the same page.

The requirement can be relaxed by tweaking nand_do_read_ops().
Move the max_bitflips calculation below the retry.

Signed-off-by: Masahiro Yamada
Suggested-by: Boris Brezillon
Signed-off-by: Boris Brezillon
---
 drivers/mtd/nand/nand_base.c | 3 +--
 include/linux/mtd/nand.h     | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 36258e69a389..de6c8045c85b 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -1993,8 +1993,6 @@ read_retry:
 				break;
 			}
 
-			max_bitflips = max_t(unsigned int, max_bitflips, ret);
-
 			/* Transfer not aligned data */
 			if (use_bufpoi) {
 				if (!NAND_HAS_SUBPAGE_READ(chip) && !oob &&
@@ -2045,6 +2043,7 @@ read_retry:
 			}
 
 			buf += bytes;
+			max_bitflips = max_t(unsigned int, max_bitflips, ret);
 		} else {
 			memcpy(buf, chip->buffers->databuf + col, bytes);
 			buf += bytes;
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 40657939797c..9e0c93c44bef 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -516,7 +516,7 @@ static inline void nand_hw_control_init(struct nand_hw_control *nfc)
 *			out-of-band data).
 * @read_page:	function to read a page according to the ECC generator
 *			requirements; returns maximum number of bitflips corrected in
- *			any single ECC step, 0 if bitflips uncorrectable, -EIO hw error
+ *			any single ECC step, -EIO hw error
 * @read_subpage:	function to read parts of the page covered by ECC;
 *			returns same as read_page()
 * @write_subpage:	function to write parts of the page covered by ECC.
-- 
cgit v1.2.3


From 477544c62a84d3bacd9f90ba75ffc16c04d78071 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada
Date: Thu, 30 Mar 2017 17:15:05 +0900
Subject: mtd: nand: allow drivers to request minimum alignment for passed buffer

In some cases, nand_do_{read,write}_ops is passed an unaligned
ops->datbuf. Drivers using DMA will be unhappy about unaligned buffers.

The new struct member, buf_align, represents the minimum alignment the
driver requires for the buffer. If the buffer passed from the upper MTD
layer does not have enough alignment, nand_do_*_ops will use bufpoi.
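In practice the new behaviour reduces to one extra predicate plus one
assignment in a driver's probe path. A minimal sketch of both sides (the
function names here are hypothetical and the 4-byte requirement is
illustrative; virt_addr_valid(), IS_ALIGNED(), NAND_USE_BOUNCE_BUFFER and
chip->buf_align are taken from the diff below):

/* mirrors the use_bufpoi test added to nand_do_read_ops()/nand_do_write_ops();
 * it is only consulted when the chip sets NAND_USE_BOUNCE_BUFFER
 */
static bool nand_buf_needs_bounce(struct nand_chip *chip, const void *buf)
{
	return !virt_addr_valid(buf) ||
	       !IS_ALIGNED((unsigned long)buf, chip->buf_align);
}

/* a DMA-based controller driver states its requirement before nand_scan_tail();
 * left at 0, the core defaults buf_align to 1, i.e. no constraint
 */
static void example_controller_init(struct nand_chip *chip)
{
	chip->buf_align = 4;	/* illustrative: DMA engine wants 32-bit alignment */
}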
Signed-off-by: Masahiro Yamada Signed-off-by: Boris Brezillon --- drivers/mtd/nand/nand_base.c | 10 ++++++++-- include/linux/mtd/nand.h | 2 ++ 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index c796d0e4039a..ed49a1d634b0 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -1954,7 +1954,9 @@ static int nand_do_read_ops(struct mtd_info *mtd, loff_t from, if (!aligned) use_bufpoi = 1; else if (chip->options & NAND_USE_BOUNCE_BUFFER) - use_bufpoi = !virt_addr_valid(buf); + use_bufpoi = !virt_addr_valid(buf) || + !IS_ALIGNED((unsigned long)buf, + chip->buf_align); else use_bufpoi = 0; @@ -2810,7 +2812,9 @@ static int nand_do_write_ops(struct mtd_info *mtd, loff_t to, if (part_pagewr) use_bufpoi = 1; else if (chip->options & NAND_USE_BOUNCE_BUFFER) - use_bufpoi = !virt_addr_valid(buf); + use_bufpoi = !virt_addr_valid(buf) || + !IS_ALIGNED((unsigned long)buf, + chip->buf_align); else use_bufpoi = 0; @@ -3429,6 +3433,8 @@ static void nand_set_defaults(struct nand_chip *chip) nand_hw_control_init(chip->controller); } + if (!chip->buf_align) + chip->buf_align = 1; } /* Sanitize ONFI strings so we can safely print them */ diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 9e0c93c44bef..8f67b1581683 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -755,6 +755,7 @@ struct nand_manufacturer_ops { * setting the read-retry mode. Mostly needed for MLC NAND. * @ecc: [BOARDSPECIFIC] ECC control structure * @buffers: buffer structure for read/write + * @buf_align: minimum buffer alignment required by a platform * @hwcontrol: platform-specific hardware control structure * @erase: [REPLACEABLE] erase function * @scan_bbt: [REPLACEABLE] function to scan bad block table @@ -905,6 +906,7 @@ struct nand_chip { struct nand_ecc_ctrl ecc; struct nand_buffers *buffers; + unsigned long buf_align; struct nand_hw_control hwcontrol; uint8_t *bbt; -- cgit v1.2.3 From 22161f3eb65dc29434325736c4d780908fe3bf6a Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 18 Apr 2017 11:43:49 +0100 Subject: regulator: arizona-micsupp: Move pdata into a separate structure In preparation for sharing this driver with Madera, move the pdata for the micsupp regulator out of struct arizona_pdata into a dedicated pdata struct for this driver. As a result the code in arizona_micsupp_of_get_pdata() can be made independent of struct arizona. This patch also updates the definition of struct arizona_pdata and the use of this pdata in mach-crag6410-module.c Signed-off-by: Richard Fitzgerald Acked-by: Lee Jones Signed-off-by: Mark Brown --- MAINTAINERS | 1 + drivers/regulator/arizona-micsupp.c | 21 +++++++++++---------- include/linux/mfd/arizona/pdata.h | 3 ++- include/linux/regulator/arizona-micsupp.h | 21 +++++++++++++++++++++ 4 files changed, 35 insertions(+), 11 deletions(-) create mode 100644 include/linux/regulator/arizona-micsupp.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index c35e0cea7831..6ed8ef18e7b6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13623,6 +13623,7 @@ F: include/linux/mfd/arizona/ F: include/linux/mfd/wm831x/ F: include/linux/mfd/wm8350/ F: include/linux/mfd/wm8400* +F: include/linux/regulator/arizona* F: include/linux/wm97xx.h F: include/sound/wm????.h F: sound/soc/codecs/arizona.? 
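For board files the visible change of the micsupp rework is one extra
level of structure: the regulator init data moves from a bare pointer in
struct arizona_pdata to the .init_data member of the new struct
arizona_micsupp_pdata. A minimal sketch of machine pdata after this patch
(the crag_* names and the constraints are illustrative; the .micvdd member
and struct arizona_micsupp_pdata are taken from the diff below):

static struct regulator_init_data crag_micvdd_init_data = {
	.constraints = {
		.valid_ops_mask = REGULATOR_CHANGE_STATUS,	/* illustrative */
	},
};

static struct arizona_pdata crag_arizona_pdata = {
	/* before this patch: .micvdd = &crag_micvdd_init_data */
	.micvdd = {
		.init_data = &crag_micvdd_init_data,
	},
};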
diff --git a/drivers/regulator/arizona-micsupp.c b/drivers/regulator/arizona-micsupp.c index 5e38861e71d8..5f8b5a713311 100644 --- a/drivers/regulator/arizona-micsupp.c +++ b/drivers/regulator/arizona-micsupp.c @@ -30,6 +30,8 @@ #include #include +#include + struct arizona_micsupp { struct regulator_dev *regulator; struct arizona *arizona; @@ -199,28 +201,26 @@ static const struct regulator_init_data arizona_micsupp_ext_default = { .num_consumer_supplies = 1, }; -static int arizona_micsupp_of_get_pdata(struct device *dev, - struct arizona *arizona, +static int arizona_micsupp_of_get_pdata(struct arizona_micsupp_pdata *pdata, struct regulator_config *config, const struct regulator_desc *desc) { - struct arizona_pdata *pdata = &arizona->pdata; struct arizona_micsupp *micsupp = config->driver_data; struct device_node *np; struct regulator_init_data *init_data; - np = of_get_child_by_name(arizona->dev->of_node, "micvdd"); + np = of_get_child_by_name(config->dev->of_node, "micvdd"); if (np) { config->of_node = np; - init_data = of_get_regulator_init_data(dev, np, desc); + init_data = of_get_regulator_init_data(config->dev, np, desc); if (init_data) { init_data->consumer_supplies = &micsupp->supply; init_data->num_consumer_supplies = 1; - pdata->micvdd = init_data; + pdata->init_data = init_data; } } @@ -232,6 +232,7 @@ static int arizona_micsupp_probe(struct platform_device *pdev) struct arizona *arizona = dev_get_drvdata(pdev->dev.parent); const struct regulator_desc *desc; struct regulator_config config = { }; + struct arizona_micsupp_pdata *pdata = &arizona->pdata.micvdd; struct arizona_micsupp *micsupp; int ret; @@ -269,15 +270,15 @@ static int arizona_micsupp_probe(struct platform_device *pdev) if (IS_ENABLED(CONFIG_OF)) { if (!dev_get_platdata(arizona->dev)) { - ret = arizona_micsupp_of_get_pdata(&pdev->dev, arizona, - &config, desc); + ret = arizona_micsupp_of_get_pdata(pdata, &config, + desc); if (ret < 0) return ret; } } - if (arizona->pdata.micvdd) - config.init_data = arizona->pdata.micvdd; + if (pdata->init_data) + config.init_data = pdata->init_data; else config.init_data = &micsupp->init_data; diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h index 64faeeff698c..43e875f9850c 100644 --- a/include/linux/mfd/arizona/pdata.h +++ b/include/linux/mfd/arizona/pdata.h @@ -12,6 +12,7 @@ #define _ARIZONA_PDATA_H #include +#include #define ARIZONA_GPN_DIR_MASK 0x8000 /* GPN_DIR */ #define ARIZONA_GPN_DIR_SHIFT 15 /* GPN_DIR */ @@ -79,7 +80,7 @@ struct arizona_pdata { int ldoena; /** GPIO controlling LODENA, if any */ /** Regulator configuration for MICVDD */ - struct regulator_init_data *micvdd; + struct arizona_micsupp_pdata micvdd; /** Regulator configuration for LDO1 */ struct regulator_init_data *ldo1; diff --git a/include/linux/regulator/arizona-micsupp.h b/include/linux/regulator/arizona-micsupp.h new file mode 100644 index 000000000000..616842619c00 --- /dev/null +++ b/include/linux/regulator/arizona-micsupp.h @@ -0,0 +1,21 @@ +/* + * Platform data for Arizona micsupp regulator + * + * Copyright 2017 Cirrus Logic + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef ARIZONA_MICSUPP_H +#define ARIZONA_MICSUPP_H + +struct regulator_init_data; + +struct arizona_micsupp_pdata { + /** Regulator configuration for micsupp */ + const struct regulator_init_data *init_data; +}; + +#endif -- cgit v1.2.3 From aaa84e6a0399df374634c42590e644a698fcc3ff Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 18 Apr 2017 11:43:52 +0100 Subject: regulator: arizona-ldo1: Move pdata into a separate structure In preparation for sharing this driver with Madera, move the pdata for the LDO1 regulator out of struct arizona_pdata into a dedicated pdata struct for this driver. As a result the code in arizona_ldo1_of_get_pdata() can be made independent of struct arizona. This patch also updates the definition of struct arizona_pdata and the use of this pdata in mach-crag6410-module.c Signed-off-by: Richard Fitzgerald Acked-by: Krzysztof Kozlowski Acked-by: Lee Jones Signed-off-by: Mark Brown --- arch/arm/mach-s3c64xx/mach-crag6410-module.c | 8 ++++-- drivers/regulator/arizona-ldo1.c | 39 +++++++++++++++------------- include/linux/mfd/arizona/pdata.h | 4 +-- include/linux/regulator/arizona-ldo1.h | 24 +++++++++++++++++ 4 files changed, 53 insertions(+), 22 deletions(-) create mode 100644 include/linux/regulator/arizona-ldo1.h (limited to 'include/linux') diff --git a/arch/arm/mach-s3c64xx/mach-crag6410-module.c b/arch/arm/mach-s3c64xx/mach-crag6410-module.c index ccc3ab8d58e7..ea5f2169c850 100644 --- a/arch/arm/mach-s3c64xx/mach-crag6410-module.c +++ b/arch/arm/mach-s3c64xx/mach-crag6410-module.c @@ -209,7 +209,9 @@ static const struct i2c_board_info wm1277_devs[] = { }; static struct arizona_pdata wm5102_reva_pdata = { - .ldoena = S3C64XX_GPN(7), + .ldo1 = { + .ldoena = S3C64XX_GPN(7), + }, .gpio_base = CODEC_GPIO_BASE, .irq_flags = IRQF_TRIGGER_HIGH, .micd_pol_gpio = CODEC_GPIO_BASE + 4, @@ -239,7 +241,9 @@ static struct spi_board_info wm5102_reva_spi_devs[] = { }; static struct arizona_pdata wm5102_pdata = { - .ldoena = S3C64XX_GPN(7), + .ldo1 = { + .ldoena = S3C64XX_GPN(7), + }, .gpio_base = CODEC_GPIO_BASE, .irq_flags = IRQF_TRIGGER_HIGH, .micd_pol_gpio = CODEC_GPIO_BASE + 2, diff --git a/drivers/regulator/arizona-ldo1.c b/drivers/regulator/arizona-ldo1.c index b726fa17f7b2..f5bc75ab85fa 100644 --- a/drivers/regulator/arizona-ldo1.c +++ b/drivers/regulator/arizona-ldo1.c @@ -25,6 +25,8 @@ #include #include +#include + #include #include #include @@ -186,20 +188,19 @@ static const struct regulator_init_data arizona_ldo1_wm5110 = { .num_consumer_supplies = 1, }; -static int arizona_ldo1_of_get_pdata(struct device *dev, - struct arizona *arizona, +static int arizona_ldo1_of_get_pdata(struct arizona_ldo1_pdata *pdata, struct regulator_config *config, - const struct regulator_desc *desc) + const struct regulator_desc *desc, + bool *external_dcvdd) { - struct arizona_pdata *pdata = &arizona->pdata; struct arizona_ldo1 *ldo1 = config->driver_data; - struct device_node *np = arizona->dev->of_node; + struct device_node *np = config->dev->of_node; struct device_node *init_node, *dcvdd_node; struct regulator_init_data *init_data; pdata->ldoena = of_get_named_gpio(np, "wlf,ldoena", 0); if (pdata->ldoena < 0) { - dev_warn(arizona->dev, + dev_warn(config->dev, "LDOENA GPIO property missing/malformed: %d\n", pdata->ldoena); pdata->ldoena = 0; @@ -213,19 +214,19 @@ static int arizona_ldo1_of_get_pdata(struct device *dev, if (init_node) { config->of_node = init_node; - init_data = of_get_regulator_init_data(dev, init_node, desc); - + init_data = 
of_get_regulator_init_data(config->dev, init_node, + desc); if (init_data) { init_data->consumer_supplies = &ldo1->supply; init_data->num_consumer_supplies = 1; if (dcvdd_node && dcvdd_node != init_node) - arizona->external_dcvdd = true; + *external_dcvdd = true; - pdata->ldo1 = init_data; + pdata->init_data = init_data; } } else if (dcvdd_node) { - arizona->external_dcvdd = true; + *external_dcvdd = true; } of_node_put(dcvdd_node); @@ -239,10 +240,9 @@ static int arizona_ldo1_probe(struct platform_device *pdev) const struct regulator_desc *desc; struct regulator_config config = { }; struct arizona_ldo1 *ldo1; + bool external_dcvdd = false; int ret; - arizona->external_dcvdd = false; - ldo1 = devm_kzalloc(&pdev->dev, sizeof(*ldo1), GFP_KERNEL); if (!ldo1) return -ENOMEM; @@ -283,17 +283,18 @@ static int arizona_ldo1_probe(struct platform_device *pdev) if (IS_ENABLED(CONFIG_OF)) { if (!dev_get_platdata(arizona->dev)) { - ret = arizona_ldo1_of_get_pdata(&pdev->dev, arizona, - &config, desc); + ret = arizona_ldo1_of_get_pdata(&arizona->pdata.ldo1, + &config, desc, + &external_dcvdd); if (ret < 0) return ret; } } - config.ena_gpio = arizona->pdata.ldoena; + config.ena_gpio = arizona->pdata.ldo1.ldoena; - if (arizona->pdata.ldo1) - config.init_data = arizona->pdata.ldo1; + if (arizona->pdata.ldo1.init_data) + config.init_data = arizona->pdata.ldo1.init_data; else config.init_data = &ldo1->init_data; @@ -303,6 +304,8 @@ static int arizona_ldo1_probe(struct platform_device *pdev) */ if (config.init_data->num_consumer_supplies == 0) arizona->external_dcvdd = true; + else + arizona->external_dcvdd = external_dcvdd; ldo1->regulator = devm_regulator_register(&pdev->dev, desc, &config); diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h index 43e875f9850c..bfeecf179895 100644 --- a/include/linux/mfd/arizona/pdata.h +++ b/include/linux/mfd/arizona/pdata.h @@ -12,6 +12,7 @@ #define _ARIZONA_PDATA_H #include +#include #include #define ARIZONA_GPN_DIR_MASK 0x8000 /* GPN_DIR */ @@ -77,13 +78,12 @@ struct arizona_micd_range { struct arizona_pdata { int reset; /** GPIO controlling /RESET, if any */ - int ldoena; /** GPIO controlling LODENA, if any */ /** Regulator configuration for MICVDD */ struct arizona_micsupp_pdata micvdd; /** Regulator configuration for LDO1 */ - struct regulator_init_data *ldo1; + struct arizona_ldo1_pdata ldo1; /** If a direct 32kHz clock is provided on an MCLK specify it here */ int clk32k_src; diff --git a/include/linux/regulator/arizona-ldo1.h b/include/linux/regulator/arizona-ldo1.h new file mode 100644 index 000000000000..c685f1277c63 --- /dev/null +++ b/include/linux/regulator/arizona-ldo1.h @@ -0,0 +1,24 @@ +/* + * Platform data for Arizona LDO1 regulator + * + * Copyright 2017 Cirrus Logic + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef ARIZONA_LDO1_H +#define ARIZONA_LDO1_H + +struct regulator_init_data; + +struct arizona_ldo1_pdata { + /** GPIO controlling LDOENA, if any */ + int ldoena; + + /** Regulator configuration for LDO1 */ + const struct regulator_init_data *init_data; +}; + +#endif -- cgit v1.2.3 From 19489c7f0d9040ed2ffc23747e14af95dba479d2 Mon Sep 17 00:00:00 2001 From: "Chopra, Manish" Date: Mon, 24 Apr 2017 10:00:45 -0700 Subject: qed/qede: Enable tunnel offloads based on hw configuration This patch enables tunnel feature offloads based on hw configuration at initialization time instead of enabling them always. Signed-off-by: Manish Chopra Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_main.c | 15 +++++++++++ drivers/net/ethernet/qlogic/qede/qede_filter.c | 6 +++++ drivers/net/ethernet/qlogic/qede/qede_main.c | 36 ++++++++++++++++++-------- include/linux/qed/qed_if.h | 5 ++++ 4 files changed, 51 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index a622d75e2547..e6502810ccd7 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -230,10 +230,25 @@ err0: int qed_fill_dev_info(struct qed_dev *cdev, struct qed_dev_info *dev_info) { + struct qed_tunnel_info *tun = &cdev->tunnel; struct qed_ptt *ptt; memset(dev_info, 0, sizeof(struct qed_dev_info)); + if (tun->vxlan.tun_cls == QED_TUNN_CLSS_MAC_VLAN && + tun->vxlan.b_mode_enabled) + dev_info->vxlan_enable = true; + + if (tun->l2_gre.b_mode_enabled && tun->ip_gre.b_mode_enabled && + tun->l2_gre.tun_cls == QED_TUNN_CLSS_MAC_VLAN && + tun->ip_gre.tun_cls == QED_TUNN_CLSS_MAC_VLAN) + dev_info->gre_enable = true; + + if (tun->l2_geneve.b_mode_enabled && tun->ip_geneve.b_mode_enabled && + tun->l2_geneve.tun_cls == QED_TUNN_CLSS_MAC_VLAN && + tun->ip_geneve.tun_cls == QED_TUNN_CLSS_MAC_VLAN) + dev_info->geneve_enable = true; + dev_info->num_hwfns = cdev->num_hwfns; dev_info->pci_mem_start = cdev->pci_params.mem_start; dev_info->pci_mem_end = cdev->pci_params.mem_end; diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index 34473fbac798..23e0c1696c86 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -887,6 +887,9 @@ void qede_udp_tunnel_add(struct net_device *dev, struct udp_tunnel_info *ti) switch (ti->type) { case UDP_TUNNEL_TYPE_VXLAN: + if (!edev->dev_info.common.vxlan_enable) + return; + if (edev->vxlan_dst_port) return; @@ -898,6 +901,9 @@ void qede_udp_tunnel_add(struct net_device *dev, struct udp_tunnel_info *ti) set_bit(QEDE_SP_VXLAN_PORT_CONFIG, &edev->sp_flags); break; case UDP_TUNNEL_TYPE_GENEVE: + if (!edev->dev_info.common.geneve_enable) + return; + if (edev->geneve_dst_port) return; diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 02b305c19f38..42f043b1524f 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -609,6 +609,7 @@ static void qede_init_ndev(struct qede_dev *edev) { struct net_device *ndev = edev->ndev; struct pci_dev *pdev = edev->pdev; + bool udp_tunnel_enable = false; netdev_features_t hw_features; pci_set_drvdata(pdev, ndev); @@ -631,20 +632,33 @@ static void qede_init_ndev(struct qede_dev *edev) NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_TSO | NETIF_F_TSO6; - /* Encap 
features*/
-	hw_features |= NETIF_F_GSO_GRE | NETIF_F_GSO_UDP_TUNNEL |
-		       NETIF_F_TSO_ECN | NETIF_F_GSO_UDP_TUNNEL_CSUM |
-		       NETIF_F_GSO_GRE_CSUM;
 	if (!IS_VF(edev) && edev->dev_info.common.num_hwfns == 1)
 		hw_features |= NETIF_F_NTUPLE;
 
-	ndev->hw_enc_features = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
-				NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO_ECN |
-				NETIF_F_TSO6 | NETIF_F_GSO_GRE |
-				NETIF_F_GSO_UDP_TUNNEL | NETIF_F_RXCSUM |
-				NETIF_F_GSO_UDP_TUNNEL_CSUM |
-				NETIF_F_GSO_GRE_CSUM;
+	if (edev->dev_info.common.vxlan_enable ||
+	    edev->dev_info.common.geneve_enable)
+		udp_tunnel_enable = true;
+
+	if (udp_tunnel_enable || edev->dev_info.common.gre_enable) {
+		hw_features |= NETIF_F_TSO_ECN;
+		ndev->hw_enc_features = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
+					NETIF_F_SG | NETIF_F_TSO |
+					NETIF_F_TSO_ECN | NETIF_F_TSO6 |
+					NETIF_F_RXCSUM;
+	}
+
+	if (udp_tunnel_enable) {
+		hw_features |= (NETIF_F_GSO_UDP_TUNNEL |
+				NETIF_F_GSO_UDP_TUNNEL_CSUM);
+		ndev->hw_enc_features |= (NETIF_F_GSO_UDP_TUNNEL |
+					  NETIF_F_GSO_UDP_TUNNEL_CSUM);
+	}
+
+	if (edev->dev_info.common.gre_enable) {
+		hw_features |= (NETIF_F_GSO_GRE | NETIF_F_GSO_GRE_CSUM);
+		ndev->hw_enc_features |= (NETIF_F_GSO_GRE |
+					  NETIF_F_GSO_GRE_CSUM);
+	}
 
 	ndev->vlan_features = hw_features | NETIF_F_RXHASH | NETIF_F_RXCSUM |
 			      NETIF_F_HIGHDMA;
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 9f966be89510..5544d7b2f2bb 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -338,6 +338,11 @@ struct qed_dev_info {
 	bool wol_support;
 
 	enum qed_dev_type dev_type;
+
+	/* Output parameters for qede */
+	bool vxlan_enable;
+	bool gre_enable;
+	bool geneve_enable;
 };
 
 enum qed_sb_type {
-- 
cgit v1.2.3


From 97379f15c21e7ae27eb1ecf84adcace42c960c87 Mon Sep 17 00:00:00 2001
From: "Chopra, Manish"
Date: Mon, 24 Apr 2017 10:00:48 -0700
Subject: qed/qede: Add UDP ports in bulletin board

This patch adds support for UDP ports in the bulletin board, to notify
the VFs of UDP port changes.

Signed-off-by: Manish Chopra
Signed-off-by: Yuval Mintz
Signed-off-by: David S. Miller
---
 drivers/net/ethernet/qlogic/qed/qed_l2.c       | 19 +++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sriov.c    | 23 +++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_sriov.h    |  9 +++++++++
 drivers/net/ethernet/qlogic/qed/qed_vf.c       | 16 ++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_vf.h       |  4 +++-
 drivers/net/ethernet/qlogic/qede/qede.h        |  1 +
 drivers/net/ethernet/qlogic/qede/qede_filter.c | 11 +++++++++++
 drivers/net/ethernet/qlogic/qede/qede_main.c   |  1 +
 include/linux/qed/qed_eth_if.h                 |  1 +
 9 files changed, 84 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 03216babb06f..b8bd1193fb9c 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -2304,11 +2304,30 @@ static int qed_tunn_configure(struct qed_dev *cdev,
 
 	for_each_hwfn(cdev, i) {
 		struct qed_hwfn *hwfn = &cdev->hwfns[i];
+		struct qed_tunnel_info *tun;
+
+		tun = &hwfn->cdev->tunnel;
 
 		rc = qed_sp_pf_update_tunn_cfg(hwfn, &tunn_info,
 					       QED_SPQ_MODE_EBLOCK, NULL);
 		if (rc)
 			return rc;
+
+		if (IS_PF_SRIOV(hwfn)) {
+			u16 vxlan_port, geneve_port;
+			int j;
+
+			vxlan_port = tun->vxlan_port.port;
+			geneve_port = tun->geneve_port.port;
+
+			qed_for_each_vf(hwfn, j) {
+				qed_iov_bulletin_set_udp_ports(hwfn, j,
+							       vxlan_port,
+							       geneve_port);
+			}
+
+			qed_schedule_iov(hwfn, QED_IOV_WQ_BULLETIN_UPDATE_FLAG);
+		}
 	}
 
 	return 0;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
index 92a3ee1715d9..85672f6cf629 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c
@@ -3511,6 +3511,29 @@ static void qed_iov_bulletin_set_forced_vlan(struct qed_hwfn *p_hwfn,
 	qed_iov_configure_vport_forced(p_hwfn, vf_info, feature);
 }
 
+void qed_iov_bulletin_set_udp_ports(struct qed_hwfn *p_hwfn,
+				    int vfid, u16 vxlan_port, u16 geneve_port)
+{
+	struct qed_vf_info *vf_info;
+
+	vf_info = qed_iov_get_vf_info(p_hwfn, (u16)vfid, true);
+	if (!vf_info) {
+		DP_NOTICE(p_hwfn->cdev,
+			  "Can not set udp ports, invalid vfid [%d]\n", vfid);
+		return;
+	}
+
+	if (vf_info->b_malicious) {
+		DP_VERBOSE(p_hwfn, QED_MSG_IOV,
+			   "Can not set udp ports to malicious VF [%d]\n",
+			   vfid);
+		return;
+	}
+
+	vf_info->bulletin.p_virt->vxlan_udp_port = vxlan_port;
+	vf_info->bulletin.p_virt->geneve_udp_port = geneve_port;
+}
+
 static bool qed_iov_vf_has_vport_instance(struct qed_hwfn *p_hwfn, int vfid)
 {
 	struct qed_vf_info *p_vf_info;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.h b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
index 8e96b1d19308..81a497ce6585 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_sriov.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.h
@@ -270,6 +270,9 @@ enum qed_iov_wq_flag {
 */
 u16 qed_iov_get_next_active_vf(struct qed_hwfn *p_hwfn, u16 rel_vf_id);
 
+void qed_iov_bulletin_set_udp_ports(struct qed_hwfn *p_hwfn,
+				    int vfid, u16 vxlan_port, u16 geneve_port);
+
 /**
 * @brief Read sriov related information and allocated resources
 *  reads from configuration space, shmem, etc.
@@ -378,6 +381,12 @@ static inline u16 qed_iov_get_next_active_vf(struct qed_hwfn *p_hwfn, return MAX_NUM_VFS; } +static inline void +qed_iov_bulletin_set_udp_ports(struct qed_hwfn *p_hwfn, int vfid, + u16 vxlan_port, u16 geneve_port) +{ +} + static inline int qed_iov_hw_info(struct qed_hwfn *p_hwfn) { return 0; diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.c b/drivers/net/ethernet/qlogic/qed/qed_vf.c index 798786562b1b..01cbbe655af4 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_vf.c +++ b/drivers/net/ethernet/qlogic/qed/qed_vf.c @@ -1251,6 +1251,18 @@ static bool qed_vf_bulletin_get_forced_mac(struct qed_hwfn *hwfn, return true; } +static void +qed_vf_bulletin_get_udp_ports(struct qed_hwfn *p_hwfn, + u16 *p_vxlan_port, u16 *p_geneve_port) +{ + struct qed_bulletin_content *p_bulletin; + + p_bulletin = &p_hwfn->vf_iov_info->bulletin_shadow; + + *p_vxlan_port = p_bulletin->vxlan_udp_port; + *p_geneve_port = p_bulletin->geneve_udp_port; +} + void qed_vf_get_fw_version(struct qed_hwfn *p_hwfn, u16 *fw_major, u16 *fw_minor, u16 *fw_rev, u16 *fw_eng) @@ -1270,12 +1282,16 @@ static void qed_handle_bulletin_change(struct qed_hwfn *hwfn) struct qed_eth_cb_ops *ops = hwfn->cdev->protocol_ops.eth; u8 mac[ETH_ALEN], is_mac_exist, is_mac_forced; void *cookie = hwfn->cdev->ops_cookie; + u16 vxlan_port, geneve_port; + qed_vf_bulletin_get_udp_ports(hwfn, &vxlan_port, &geneve_port); is_mac_exist = qed_vf_bulletin_get_forced_mac(hwfn, mac, &is_mac_forced); if (is_mac_exist && cookie) ops->force_mac(cookie, mac, !!is_mac_forced); + ops->ports_update(cookie, vxlan_port, geneve_port); + /* Always update link configuration according to bulletin */ qed_link_update(hwfn); } diff --git a/drivers/net/ethernet/qlogic/qed/qed_vf.h b/drivers/net/ethernet/qlogic/qed/qed_vf.h index 105c0edd2a01..6cca145331cd 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_vf.h +++ b/drivers/net/ethernet/qlogic/qed/qed_vf.h @@ -513,7 +513,9 @@ struct qed_bulletin_content { u8 partner_rx_flow_ctrl_en; u8 partner_adv_pause; u8 sfp_tx_fault; - u8 padding4[6]; + u16 vxlan_udp_port; + u16 geneve_udp_port; + u8 padding4[2]; u32 speed; u32 partner_adv_speed; diff --git a/drivers/net/ethernet/qlogic/qede/qede.h b/drivers/net/ethernet/qlogic/qede/qede.h index 26699a752768..766a79d2ed75 100644 --- a/drivers/net/ethernet/qlogic/qede/qede.h +++ b/drivers/net/ethernet/qlogic/qede/qede.h @@ -480,6 +480,7 @@ irqreturn_t qede_msix_fp_int(int irq, void *fp_cookie); /* Filtering function definitions */ void qede_force_mac(void *dev, u8 *mac, bool forced); +void qede_udp_ports_update(void *dev, u16 vxlan_port, u16 geneve_port); int qede_set_mac_addr(struct net_device *ndev, void *p); int qede_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid); diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index 4fa2c88e9693..eb5652073ca8 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -480,6 +480,17 @@ ret_unlock: } #endif +void qede_udp_ports_update(void *dev, u16 vxlan_port, u16 geneve_port) +{ + struct qede_dev *edev = dev; + + if (edev->vxlan_dst_port != vxlan_port) + edev->vxlan_dst_port = 0; + + if (edev->geneve_dst_port != geneve_port) + edev->geneve_dst_port = 0; +} + void qede_force_mac(void *dev, u8 *mac, bool forced) { struct qede_dev *edev = dev; diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index f57c823730c0..292e2dc3f8ae 100644 --- 
a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -231,6 +231,7 @@ static struct qed_eth_cb_ops qede_ll_ops = {
 		.link_update = qede_link_update,
 	},
 	.force_mac = qede_force_mac,
+	.ports_update = qede_udp_ports_update,
 };
 
 static int qede_netdev_event(struct notifier_block *this, unsigned long event,
diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h
index 1eba803cb7f1..15fa7c6e4c6f 100644
--- a/include/linux/qed/qed_eth_if.h
+++ b/include/linux/qed/qed_eth_if.h
@@ -158,6 +158,7 @@ struct qed_tunn_params {
 struct qed_eth_cb_ops {
 	struct qed_common_cb_ops common;
 	void (*force_mac) (void *dev, u8 *mac, bool forced);
+	void (*ports_update)(void *dev, u16 vxlan_port, u16 geneve_port);
 };
 
 #define QED_MAX_PHC_DRIFT_PPB	291666666
-- 
cgit v1.2.3


From b5cdae3291f7be7a34e75affe4c0ec1f7f328b64 Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Tue, 18 Apr 2017 15:36:58 -0400
Subject: net: Generic XDP

This provides a generic SKB based non-optimized XDP path which is used
if either the driver lacks a specific XDP implementation, or the user
requests it via a new IFLA_XDP_FLAGS value named XDP_FLAGS_SKB_MODE.

It is arguable that perhaps I should have required something like this
as part of the initial XDP feature merge.

I believe this is critical for two reasons:

1) Accessibility. More people can play with XDP with fewer
   dependencies. Yes I know we have XDP support in virtio_net, but
   that just creates another dependency for learning how to use this
   facility.

   I wrote this to make life easier for the XDP newbies.

2) As a model for what the expected semantics are. If there is a pure
   generic core implementation, it serves as a semantic example for
   driver folks adding XDP support.

One thing I have not tried to address here is the issue of
XDP_PACKET_HEADROOM, thanks to Daniel for spotting that. It seems
incredibly expensive to do a skb_cow(skb, XDP_PACKET_HEADROOM) or
whatever even if the XDP program doesn't try to push headers at all.
I think we really need the verifier to somehow propagate whether
certain XDP helpers are used or not.

v5:
 - Handle both negative and positive offset after running prog
 - Fix mac length in XDP_TX case (Alexei)
 - Use rcu_dereference_protected() in free_netdev (kbuild test robot)

v4:
 - Fix MAC header adjustment before calling prog (David Ahern)
 - Disable LRO when generic XDP is installed (Michael Chan)
 - Bypass qdisc et al. on XDP_TX and record the event (Alexei)
 - Do not perform generic XDP on reinjected packets (DaveM)

v3:
 - Make sure XDP program sees packet at MAC header, push back MAC
   header if we do XDP_TX. (Alexei)
 - Elide GRO when generic XDP is in use. (Alexei)
 - Add XDP_FLAG_SKB_MODE flag which the user can use to request generic
   XDP even if the driver has an XDP implementation. (Alexei)
 - Report whether SKB mode is in use in rtnl_xdp_fill() via XDP_FLAGS
   attribute. (Daniel)

v2:
 - Add some "fall through" comments in switch statements based
   upon feedback from Andrew Lunn
 - Use RCU for generic xdp_prog, thanks to Johannes Berg.

Tested-by: Andy Gospodarek
Tested-by: Jesper Dangaard Brouer
Tested-by: David Ahern
Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 8 +++ include/uapi/linux/if_link.h | 4 +- net/core/dev.c | 155 +++++++++++++++++++++++++++++++++++++++++-- net/core/gro_cells.c | 2 +- net/core/rtnetlink.c | 40 ++++++----- 5 files changed, 187 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5d5267febd56..46d220c2bf92 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1905,9 +1905,17 @@ struct net_device { struct lock_class_key *qdisc_tx_busylock; struct lock_class_key *qdisc_running_key; bool proto_down; + struct bpf_prog __rcu *xdp_prog; }; #define to_net_dev(d) container_of(d, struct net_device, dev) +static inline bool netif_elide_gro(const struct net_device *dev) +{ + if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog) + return true; + return false; +} + #define NETDEV_ALIGN 32 static inline diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8b405afb2376..633aa0276d32 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -887,7 +887,9 @@ enum { /* XDP section */ #define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) -#define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST) +#define XDP_FLAGS_SKB_MODE (2U << 0) +#define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ + XDP_FLAGS_SKB_MODE) enum { IFLA_XDP_UNSPEC, diff --git a/net/core/dev.c b/net/core/dev.c index db6e31564d06..1b3317c026c6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -95,6 +95,7 @@ #include #include #include +#include #include #include #include @@ -4251,6 +4252,125 @@ static int __netif_receive_skb(struct sk_buff *skb) return ret; } +static struct static_key generic_xdp_needed __read_mostly; + +static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) +{ + struct bpf_prog *new = xdp->prog; + int ret = 0; + + switch (xdp->command) { + case XDP_SETUP_PROG: { + struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); + + rcu_assign_pointer(dev->xdp_prog, new); + if (old) + bpf_prog_put(old); + + if (old && !new) { + static_key_slow_dec(&generic_xdp_needed); + } else if (new && !old) { + static_key_slow_inc(&generic_xdp_needed); + dev_disable_lro(dev); + } + break; + } + + case XDP_QUERY_PROG: + xdp->prog_attached = !!rcu_access_pointer(dev->xdp_prog); + break; + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static u32 netif_receive_generic_xdp(struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + struct xdp_buff xdp; + u32 act = XDP_DROP; + void *orig_data; + int hlen, off; + u32 mac_len; + + /* Reinjected packets coming from act_mirred or similar should + * not get XDP generic processing. + */ + if (skb_cloned(skb)) + return XDP_PASS; + + if (skb_linearize(skb)) + goto do_drop; + + /* The XDP program wants to see the packet starting at the MAC + * header. 
+ */ + mac_len = skb->data - skb_mac_header(skb); + hlen = skb_headlen(skb) + mac_len; + xdp.data = skb->data - mac_len; + xdp.data_end = xdp.data + hlen; + xdp.data_hard_start = skb->data - skb_headroom(skb); + orig_data = xdp.data; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + + off = xdp.data - orig_data; + if (off > 0) + __skb_pull(skb, off); + else if (off < 0) + __skb_push(skb, -off); + + switch (act) { + case XDP_TX: + __skb_push(skb, mac_len); + /* fall through */ + case XDP_PASS: + break; + + default: + bpf_warn_invalid_xdp_action(act); + /* fall through */ + case XDP_ABORTED: + trace_xdp_exception(skb->dev, xdp_prog, act); + /* fall through */ + case XDP_DROP: + do_drop: + kfree_skb(skb); + break; + } + + return act; +} + +/* When doing generic XDP we have to bypass the qdisc layer and the + * network taps in order to match in-driver-XDP behavior. + */ +static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) +{ + struct net_device *dev = skb->dev; + struct netdev_queue *txq; + bool free_skb = true; + int cpu, rc; + + txq = netdev_pick_tx(dev, skb, NULL); + cpu = smp_processor_id(); + HARD_TX_LOCK(dev, txq, cpu); + if (!netif_xmit_stopped(txq)) { + rc = netdev_start_xmit(skb, dev, txq, 0); + if (dev_xmit_complete(rc)) + free_skb = false; + } + HARD_TX_UNLOCK(dev, txq); + if (free_skb) { + trace_xdp_exception(dev, xdp_prog, XDP_TX); + kfree_skb(skb); + } +} + static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; @@ -4262,6 +4382,21 @@ static int netif_receive_skb_internal(struct sk_buff *skb) rcu_read_lock(); + if (static_key_false(&generic_xdp_needed)) { + struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog); + + if (xdp_prog) { + u32 act = netif_receive_generic_xdp(skb, xdp_prog); + + if (act != XDP_PASS) { + rcu_read_unlock(); + if (act == XDP_TX) + generic_xdp_tx(skb, xdp_prog); + return NET_RX_DROP; + } + } + } + #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; @@ -4494,7 +4629,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff enum gro_result ret; int grow; - if (!(skb->dev->features & NETIF_F_GRO)) + if (netif_elide_gro(skb->dev)) goto normal; if (skb->csum_bad) @@ -6723,6 +6858,7 @@ EXPORT_SYMBOL(dev_change_proto_down); */ int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) { + int (*xdp_op)(struct net_device *dev, struct netdev_xdp *xdp); const struct net_device_ops *ops = dev->netdev_ops; struct bpf_prog *prog = NULL; struct netdev_xdp xdp; @@ -6730,14 +6866,16 @@ int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) ASSERT_RTNL(); - if (!ops->ndo_xdp) - return -EOPNOTSUPP; + xdp_op = ops->ndo_xdp; + if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE)) + xdp_op = generic_xdp_install; + if (fd >= 0) { if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) { memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_QUERY_PROG; - err = ops->ndo_xdp(dev, &xdp); + err = xdp_op(dev, &xdp); if (err < 0) return err; if (xdp.prog_attached) @@ -6753,7 +6891,7 @@ int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) xdp.command = XDP_SETUP_PROG; xdp.prog = prog; - err = ops->ndo_xdp(dev, &xdp); + err = xdp_op(dev, &xdp); if (err < 0 && prog) bpf_prog_put(prog); @@ -7793,6 +7931,7 @@ EXPORT_SYMBOL(alloc_netdev_mqs); void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; + struct bpf_prog *prog; might_sleep(); netif_free_tx_queues(dev); @@ -7811,6 +7950,12 @@ void free_netdev(struct net_device *dev) free_percpu(dev->pcpu_refcnt); 
dev->pcpu_refcnt = NULL; + prog = rcu_dereference_protected(dev->xdp_prog, 1); + if (prog) { + bpf_prog_put(prog); + static_key_slow_dec(&generic_xdp_needed); + } + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { netdev_freemem(dev); diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index c98bbfbd26b8..814e58a3ce8b 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -13,7 +13,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) struct net_device *dev = skb->dev; struct gro_cell *cell; - if (!gcells->cells || skb_cloned(skb) || !(dev->features & NETIF_F_GRO)) + if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) return netif_rx(skb); cell = this_cpu_ptr(gcells->cells); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 088f9c8b4196..9031a6c8bfa7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -896,15 +896,13 @@ static size_t rtnl_port_size(const struct net_device *dev, return port_self_size; } -static size_t rtnl_xdp_size(const struct net_device *dev) +static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ - nla_total_size(1); /* XDP_ATTACHED */ + nla_total_size(1) + /* XDP_ATTACHED */ + nla_total_size(4); /* XDP_FLAGS */ - if (!dev->netdev_ops->ndo_xdp) - return 0; - else - return xdp_size; + return xdp_size; } static noinline size_t if_nlmsg_size(const struct net_device *dev, @@ -943,7 +941,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ - + rtnl_xdp_size(dev) /* IFLA_XDP */ + + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1251,23 +1249,35 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) { - struct netdev_xdp xdp_op = {}; struct nlattr *xdp; + u32 xdp_flags = 0; + u8 val = 0; int err; - if (!dev->netdev_ops->ndo_xdp) - return 0; xdp = nla_nest_start(skb, IFLA_XDP); if (!xdp) return -EMSGSIZE; - xdp_op.command = XDP_QUERY_PROG; - err = dev->netdev_ops->ndo_xdp(dev, &xdp_op); - if (err) - goto err_cancel; - err = nla_put_u8(skb, IFLA_XDP_ATTACHED, xdp_op.prog_attached); + if (rcu_access_pointer(dev->xdp_prog)) { + xdp_flags = XDP_FLAGS_SKB_MODE; + val = 1; + } else if (dev->netdev_ops->ndo_xdp) { + struct netdev_xdp xdp_op = {}; + + xdp_op.command = XDP_QUERY_PROG; + err = dev->netdev_ops->ndo_xdp(dev, &xdp_op); + if (err) + goto err_cancel; + val = xdp_op.prog_attached; + } + err = nla_put_u8(skb, IFLA_XDP_ATTACHED, val); if (err) goto err_cancel; + if (xdp_flags) { + err = nla_put_u32(skb, IFLA_XDP_FLAGS, xdp_flags); + if (err) + goto err_cancel; + } nla_nest_end(skb, xdp); return 0; -- cgit v1.2.3 From 817bf40265459578abc36c6bd53e27775b5c7ec4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 12 Apr 2017 13:37:44 -0700 Subject: dm: teach dm-targets to use a dax_device + dax_operations Arrange for dm to lookup the dax services available from member devices. Update the dax-capable targets, linear and stripe, to route dax operations to the underlying device. Changes the target-internal ->direct_access() method to more closely align with the dax_operations ->direct_access() calling convention. 
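For illustration, a single-device target's ->direct_access() under the new convention looks roughly like the sketch below. It is patterned on the dm-linear conversion in this patch, but the example_c target type and its dev member are hypothetical names, not code from the patch. The offset now arrives as a page offset and the return value is a count of pages, not bytes:

	struct example_c {			/* hypothetical per-target state */
		struct dm_dev *dev;		/* single underlying device */
	};

	static long example_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
					      long nr_pages, void **kaddr, pfn_t *pfn)
	{
		struct example_c *ec = ti->private;
		sector_t sector = pgoff * PAGE_SECTORS;	/* pages -> device sectors */
		long ret;

		/* Re-express the sector as a pgoff relative to the member device. */
		ret = bdev_dax_pgoff(ec->dev->bdev, sector, nr_pages * PAGE_SIZE, &pgoff);
		if (ret)
			return ret;
		/* Route the request to the member device's dax_device. */
		return dax_direct_access(ec->dev->dax_dev, pgoff, nr_pages, kaddr, pfn);
	}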
Cc: Toshi Kani Reviewed-by: Mike Snitzer Signed-off-by: Dan Williams --- drivers/md/dm-linear.c | 27 +++++++++++++-------------- drivers/md/dm-snap.c | 6 +++--- drivers/md/dm-stripe.c | 29 ++++++++++++++--------------- drivers/md/dm-target.c | 6 +++--- drivers/md/dm.c | 16 ++++++---------- include/linux/device-mapper.h | 7 ++++--- 6 files changed, 43 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 4788b0b989a9..c5a52f4dae81 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -141,22 +142,20 @@ static int linear_iterate_devices(struct dm_target *ti, return fn(ti, lc->dev, lc->start, ti->len, data); } -static long linear_direct_access(struct dm_target *ti, sector_t sector, - void **kaddr, pfn_t *pfn, long size) +static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) { + long ret; struct linear_c *lc = ti->private; struct block_device *bdev = lc->dev->bdev; - struct blk_dax_ctl dax = { - .sector = linear_map_sector(ti, sector), - .size = size, - }; - long ret; - - ret = bdev_direct_access(bdev, &dax); - *kaddr = dax.addr; - *pfn = dax.pfn; - - return ret; + struct dax_device *dax_dev = lc->dev->dax_dev; + sector_t dev_sector, sector = pgoff * PAGE_SECTORS; + + dev_sector = linear_map_sector(ti, sector); + ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages * PAGE_SIZE, &pgoff); + if (ret) + return ret; + return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); } static struct target_type linear_target = { @@ -169,7 +168,7 @@ static struct target_type linear_target = { .status = linear_status, .prepare_ioctl = linear_prepare_ioctl, .iterate_devices = linear_iterate_devices, - .direct_access = linear_direct_access, + .direct_access = linear_dax_direct_access, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index c65feeada864..e152d9817c81 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -2302,8 +2302,8 @@ static int origin_map(struct dm_target *ti, struct bio *bio) return do_origin(o->dev, bio); } -static long origin_direct_access(struct dm_target *ti, sector_t sector, - void **kaddr, pfn_t *pfn, long size) +static long origin_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) { DMWARN("device does not support dax."); return -EIO; @@ -2368,7 +2368,7 @@ static struct target_type origin_target = { .postsuspend = origin_postsuspend, .status = origin_status, .iterate_devices = origin_iterate_devices, - .direct_access = origin_direct_access, + .direct_access = origin_dax_direct_access, }; static struct target_type snapshot_target = { diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 28193a57bf47..cb4b1e9e16ab 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -308,27 +309,25 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } -static long stripe_direct_access(struct dm_target *ti, sector_t sector, - void **kaddr, pfn_t *pfn, long size) +static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) { + sector_t dev_sector, sector = pgoff * PAGE_SECTORS; struct stripe_c *sc = ti->private; - uint32_t stripe; + struct dax_device *dax_dev; struct block_device *bdev; - struct 
blk_dax_ctl dax = { - .size = size, - }; + uint32_t stripe; long ret; - stripe_map_sector(sc, sector, &stripe, &dax.sector); - - dax.sector += sc->stripe[stripe].physical_start; + stripe_map_sector(sc, sector, &stripe, &dev_sector); + dev_sector += sc->stripe[stripe].physical_start; + dax_dev = sc->stripe[stripe].dev->dax_dev; bdev = sc->stripe[stripe].dev->bdev; - ret = bdev_direct_access(bdev, &dax); - *kaddr = dax.addr; - *pfn = dax.pfn; - - return ret; + ret = bdev_dax_pgoff(bdev, dev_sector, nr_pages * PAGE_SIZE, &pgoff); + if (ret) + return ret; + return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); } /* @@ -448,7 +447,7 @@ static struct target_type stripe_target = { .status = stripe_status, .iterate_devices = stripe_iterate_devices, .io_hints = stripe_io_hints, - .direct_access = stripe_direct_access, + .direct_access = stripe_dax_direct_access, }; int __init dm_stripe_init(void) diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 43d3445b121d..6a7968f93f3c 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -142,8 +142,8 @@ static void io_err_release_clone_rq(struct request *clone) { } -static long io_err_direct_access(struct dm_target *ti, sector_t sector, - void **kaddr, pfn_t *pfn, long size) +static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) { return -EIO; } @@ -157,7 +157,7 @@ static struct target_type error_target = { .map = io_err_map, .clone_and_map_rq = io_err_clone_and_map_rq, .release_clone_rq = io_err_release_clone_rq, - .direct_access = io_err_direct_access, + .direct_access = io_err_dax_direct_access, }; int __init dm_target_init(void) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index bd56dfe43a99..ef4c6f8cad47 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -630,6 +630,7 @@ static int open_table_device(struct table_device *td, dev_t dev, } td->dm_dev.bdev = bdev; + td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); return 0; } @@ -643,7 +644,9 @@ static void close_table_device(struct table_device *td, struct mapped_device *md bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); + put_dax(td->dm_dev.dax_dev); td->dm_dev.bdev = NULL; + td->dm_dev.dax_dev = NULL; } static struct table_device *find_table_device(struct list_head *l, dev_t dev, @@ -945,16 +948,9 @@ static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, if (len < 1) goto out; nr_pages = min(len, nr_pages); - if (ti->type->direct_access) { - ret = ti->type->direct_access(ti, sector, kaddr, pfn, - nr_pages * PAGE_SIZE); - /* - * FIXME: convert ti->type->direct_access to return - * nr_pages directly. 
- */ - if (ret >= 0) - ret /= PAGE_SIZE; - } + if (ti->type->direct_access) + ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn); + out: dm_put_live_table(md, srcu_idx); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index bcba4d89089c..df830d167892 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -128,14 +128,15 @@ typedef int (*dm_busy_fn) (struct dm_target *ti); * < 0 : error * >= 0 : the number of bytes accessible at the address */ -typedef long (*dm_direct_access_fn) (struct dm_target *ti, sector_t sector, - void **kaddr, pfn_t *pfn, long size); +typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn); #define PAGE_SECTORS (PAGE_SIZE / 512) void dm_error(const char *message); struct dm_dev { struct block_device *bdev; + struct dax_device *dax_dev; fmode_t mode; char name[16]; }; @@ -177,7 +178,7 @@ struct target_type { dm_busy_fn busy; dm_iterate_devices_fn iterate_devices; dm_io_hints_fn io_hints; - dm_direct_access_fn direct_access; + dm_dax_direct_access_fn direct_access; /* For internal device-mapper use. */ struct list_head list; -- cgit v1.2.3 From fa5d932c323e8e0d9b24b3517997d15b36d1607d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 27 Jan 2017 12:04:59 -0800 Subject: ext2, ext4, xfs: retrieve dax_device for iomap operations In preparation for converting fs/dax.c to use dax_direct_access() instead of bdev_direct_access(), add the plumbing to retrieve the dax_device associated with a given block_device. Signed-off-by: Dan Williams --- fs/ext2/inode.c | 9 ++++++++- fs/ext4/inode.c | 9 ++++++++- fs/xfs/xfs_iomap.c | 10 ++++++++++ include/linux/iomap.h | 1 + 4 files changed, 27 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 128cce540645..4c9d2d44e879 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -799,6 +799,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock, static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { + struct block_device *bdev; unsigned int blkbits = inode->i_blkbits; unsigned long first_block = offset >> blkbits; unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits; @@ -812,8 +813,13 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, return ret; iomap->flags = 0; - iomap->bdev = inode->i_sb->s_bdev; + bdev = inode->i_sb->s_bdev; + iomap->bdev = bdev; iomap->offset = (u64)first_block << blkbits; + if (blk_queue_dax(bdev->bd_queue)) + iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + else + iomap->dax_dev = NULL; if (ret == 0) { iomap->type = IOMAP_HOLE; @@ -835,6 +841,7 @@ static int ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { + put_dax(iomap->dax_dev); if (iomap->type == IOMAP_MAPPED && written < length && (flags & IOMAP_WRITE)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4247d8d25687..2cb2634daa99 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3305,6 +3305,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { + struct block_device *bdev; unsigned int blkbits = inode->i_blkbits; unsigned long first_block = offset >> blkbits; unsigned long last_block = (offset + length - 1) >> blkbits; @@ -3373,7 +3374,12 
@@ retry: } iomap->flags = 0; - iomap->bdev = inode->i_sb->s_bdev; + bdev = inode->i_sb->s_bdev; + iomap->bdev = bdev; + if (blk_queue_dax(bdev->bd_queue)) + iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + else + iomap->dax_dev = NULL; iomap->offset = first_block << blkbits; if (ret == 0) { @@ -3406,6 +3412,7 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, int blkbits = inode->i_blkbits; bool truncate = false; + put_dax(iomap->dax_dev); if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) return 0; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 288ee5b840d7..4b47403f8089 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -976,6 +976,7 @@ xfs_file_iomap_begin( int nimaps = 1, error = 0; bool shared = false, trimmed = false; unsigned lockmode; + struct block_device *bdev; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1063,6 +1064,14 @@ xfs_file_iomap_begin( } xfs_bmbt_to_iomap(ip, iomap, &imap); + + /* optionally associate a dax device with the iomap bdev */ + bdev = iomap->bdev; + if (blk_queue_dax(bdev->bd_queue)) + iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + else + iomap->dax_dev = NULL; + if (shared) iomap->flags |= IOMAP_F_SHARED; return 0; @@ -1140,6 +1149,7 @@ xfs_file_iomap_end( unsigned flags, struct iomap *iomap) { + put_dax(iomap->dax_dev); if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, length, written, iomap); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 7291810067eb..f753e788da31 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -41,6 +41,7 @@ struct iomap { u16 type; /* type of mapping */ u16 flags; /* flags for mapping */ struct block_device *bdev; /* block device for I/O */ + struct dax_device *dax_dev; /* dax_dev for dax operations */ }; /* -- cgit v1.2.3 From a41fe02b6bba853a29c864d00fd161bbe6cfc715 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 27 Jan 2017 14:13:15 -0800 Subject: Revert "block: use DAX for partition table reads" commit d1a5f2b4d8a1 ("block: use DAX for partition table reads") was part of a stalled effort to allow dax mappings of block devices. Since then the device-dax mechanism has filled the role of dax-mapping static device ranges. Now that we are moving ->direct_access() from a block_device operation to a dax_inode operation we would need block devices to map and carry their own dax_inode reference. Unless / until we decide to revive dax mapping of raw block devices through the dax_inode scheme, there is no need to carry read_dax_sector(). Its removal in turn allows for the removal of bdev_direct_access() and should have been included in commit 223757016837 ("block_dev: remove DAX leftovers"). 
Cc: Jeff Moyer Signed-off-by: Dan Williams --- block/partition-generic.c | 17 ++--------------- fs/dax.c | 20 -------------------- include/linux/dax.h | 6 ------ 3 files changed, 2 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/block/partition-generic.c b/block/partition-generic.c index 7afb9907821f..5dfac337b0f2 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include "partitions/check.h" @@ -631,24 +630,12 @@ int invalidate_partitions(struct gendisk *disk, struct block_device *bdev) return 0; } -static struct page *read_pagecache_sector(struct block_device *bdev, sector_t n) -{ - struct address_space *mapping = bdev->bd_inode->i_mapping; - - return read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_SHIFT-9)), - NULL); -} - unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p) { + struct address_space *mapping = bdev->bd_inode->i_mapping; struct page *page; - /* don't populate page cache for dax capable devices */ - if (IS_DAX(bdev->bd_inode)) - page = read_dax_sector(bdev, n); - else - page = read_pagecache_sector(bdev, n); - + page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_SHIFT-9)), NULL); if (!IS_ERR(page)) { if (PageError(page)) goto fail; diff --git a/fs/dax.c b/fs/dax.c index de622d4282a6..b78a6947c4f5 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -101,26 +101,6 @@ static int dax_is_empty_entry(void *entry) return (unsigned long)entry & RADIX_DAX_EMPTY; } -struct page *read_dax_sector(struct block_device *bdev, sector_t n) -{ - struct page *page = alloc_pages(GFP_KERNEL, 0); - struct blk_dax_ctl dax = { - .size = PAGE_SIZE, - .sector = n & ~((((int) PAGE_SIZE) / 512) - 1), - }; - long rc; - - if (!page) - return ERR_PTR(-ENOMEM); - - rc = dax_map_atomic(bdev, &dax); - if (rc < 0) - return ERR_PTR(rc); - memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE); - dax_unmap_atomic(bdev, &dax); - return page; -} - /* * DAX radix tree locking */ diff --git a/include/linux/dax.h b/include/linux/dax.h index 7e62e280c11f..0d0d890f9186 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -70,15 +70,9 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, pgoff_t index, void *entry, bool wake_all); #ifdef CONFIG_FS_DAX -struct page *read_dax_sector(struct block_device *bdev, sector_t n); int __dax_zero_page_range(struct block_device *bdev, sector_t sector, unsigned int offset, unsigned int length); #else -static inline struct page *read_dax_sector(struct block_device *bdev, - sector_t n) -{ - return ERR_PTR(-ENXIO); -} static inline int __dax_zero_page_range(struct block_device *bdev, sector_t sector, unsigned int offset, unsigned int length) { -- cgit v1.2.3 From cccbce67158290537cc671cbd4c1564876485a65 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 27 Jan 2017 13:31:42 -0800 Subject: filesystem-dax: convert to dax_direct_access() Now that a dax_device is plumbed through all dax-capable drivers we can switch from block_device_operations to dax_operations for invoking ->direct_access. This also lets us kill off some usages of struct blk_dax_ctl on the way to its eventual removal. 
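The conversion is mechanical and follows one pattern throughout: translate the block-device sector to a page offset on the dax_device with bdev_dax_pgoff(), hold the dax read lock across the access, and treat the return value of dax_direct_access() as a number of pages. A condensed sketch of that pattern (error paths trimmed; this mirrors copy_user_dax() and dax_iomap_actor() below rather than adding anything new):

	pgoff_t pgoff;
	void *kaddr;
	pfn_t pfn;
	long rc;
	int id;

	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
	if (rc)
		return rc;

	id = dax_read_lock();	/* pins the dax_device while kaddr is in use */
	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
	if (rc < 0) {
		dax_read_unlock(id);
		return rc;
	}
	/* ... access at most PFN_PHYS(rc) bytes starting at kaddr ... */
	dax_read_unlock(id);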
Suggested-by: Christoph Hellwig Signed-off-by: Dan Williams --- fs/dax.c | 277 +++++++++++++++++++++++++++++----------------------- fs/iomap.c | 3 +- include/linux/dax.h | 6 +- 3 files changed, 162 insertions(+), 124 deletions(-) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index b78a6947c4f5..ce9dc9c3e829 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -55,32 +55,6 @@ static int __init init_dax_wait_table(void) } fs_initcall(init_dax_wait_table); -static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) -{ - struct request_queue *q = bdev->bd_queue; - long rc = -EIO; - - dax->addr = ERR_PTR(-EIO); - if (blk_queue_enter(q, true) != 0) - return rc; - - rc = bdev_direct_access(bdev, dax); - if (rc < 0) { - dax->addr = ERR_PTR(rc); - blk_queue_exit(q); - return rc; - } - return rc; -} - -static void dax_unmap_atomic(struct block_device *bdev, - const struct blk_dax_ctl *dax) -{ - if (IS_ERR(dax->addr)) - return; - blk_queue_exit(bdev->bd_queue); -} - static int dax_is_pmd_entry(void *entry) { return (unsigned long)entry & RADIX_DAX_PMD; @@ -553,21 +527,30 @@ static int dax_load_hole(struct address_space *mapping, void **entry, return ret; } -static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size, - struct page *to, unsigned long vaddr) +static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, + sector_t sector, size_t size, struct page *to, + unsigned long vaddr) { - struct blk_dax_ctl dax = { - .sector = sector, - .size = size, - }; - void *vto; - - if (dax_map_atomic(bdev, &dax) < 0) - return PTR_ERR(dax.addr); + void *vto, *kaddr; + pgoff_t pgoff; + pfn_t pfn; + long rc; + int id; + + rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); + if (rc) + return rc; + + id = dax_read_lock(); + rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); + if (rc < 0) { + dax_read_unlock(id); + return rc; + } vto = kmap_atomic(to); - copy_user_page(vto, (void __force *)dax.addr, vaddr, to); + copy_user_page(vto, (void __force *)kaddr, vaddr, to); kunmap_atomic(vto); - dax_unmap_atomic(bdev, &dax); + dax_read_unlock(id); return 0; } @@ -735,12 +718,16 @@ unlock_pte: } static int dax_writeback_one(struct block_device *bdev, - struct address_space *mapping, pgoff_t index, void *entry) + struct dax_device *dax_dev, struct address_space *mapping, + pgoff_t index, void *entry) { struct radix_tree_root *page_tree = &mapping->page_tree; - struct blk_dax_ctl dax; - void *entry2, **slot; - int ret = 0; + void *entry2, **slot, *kaddr; + long ret = 0, id; + sector_t sector; + pgoff_t pgoff; + size_t size; + pfn_t pfn; /* * A page got tagged dirty in DAX mapping? Something is seriously @@ -789,26 +776,29 @@ static int dax_writeback_one(struct block_device *bdev, * 'entry'. This allows us to flush for PMD_SIZE and not have to * worry about partial PMD writebacks. */ - dax.sector = dax_radix_sector(entry); - dax.size = PAGE_SIZE << dax_radix_order(entry); + sector = dax_radix_sector(entry); + size = PAGE_SIZE << dax_radix_order(entry); + + id = dax_read_lock(); + ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); + if (ret) + goto dax_unlock; /* - * We cannot hold tree_lock while calling dax_map_atomic() because it - * eventually calls cond_resched(). + * dax_direct_access() may sleep, so cannot hold tree_lock over + * its invocation. 
*/ - ret = dax_map_atomic(bdev, &dax); - if (ret < 0) { - put_locked_mapping_entry(mapping, index, entry); - return ret; - } + ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn); + if (ret < 0) + goto dax_unlock; - if (WARN_ON_ONCE(ret < dax.size)) { + if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) { ret = -EIO; - goto unmap; + goto dax_unlock; } - dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn)); - wb_cache_pmem(dax.addr, dax.size); + dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn)); + wb_cache_pmem(kaddr, size); /* * After we have flushed the cache, we can clear the dirty tag. There * cannot be new dirty data in the pfn after the flush has completed as @@ -818,8 +808,8 @@ static int dax_writeback_one(struct block_device *bdev, spin_lock_irq(&mapping->tree_lock); radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); - unmap: - dax_unmap_atomic(bdev, &dax); + dax_unlock: + dax_read_unlock(id); put_locked_mapping_entry(mapping, index, entry); return ret; @@ -840,6 +830,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, struct inode *inode = mapping->host; pgoff_t start_index, end_index; pgoff_t indices[PAGEVEC_SIZE]; + struct dax_device *dax_dev; struct pagevec pvec; bool done = false; int i, ret = 0; @@ -850,6 +841,10 @@ int dax_writeback_mapping_range(struct address_space *mapping, if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) return 0; + dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + if (!dax_dev) + return -EIO; + start_index = wbc->range_start >> PAGE_SHIFT; end_index = wbc->range_end >> PAGE_SHIFT; @@ -870,38 +865,49 @@ int dax_writeback_mapping_range(struct address_space *mapping, break; } - ret = dax_writeback_one(bdev, mapping, indices[i], - pvec.pages[i]); - if (ret < 0) + ret = dax_writeback_one(bdev, dax_dev, mapping, + indices[i], pvec.pages[i]); + if (ret < 0) { + put_dax(dax_dev); return ret; + } } } + put_dax(dax_dev); return 0; } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct address_space *mapping, - struct block_device *bdev, sector_t sector, size_t size, - void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) + struct block_device *bdev, struct dax_device *dax_dev, + sector_t sector, size_t size, void **entryp, + struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = vmf->address; - struct blk_dax_ctl dax = { - .sector = sector, - .size = size, - }; - void *ret; void *entry = *entryp; + void *ret, *kaddr; + pgoff_t pgoff; + int id, rc; + pfn_t pfn; - if (dax_map_atomic(bdev, &dax) < 0) - return PTR_ERR(dax.addr); - dax_unmap_atomic(bdev, &dax); + rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); + if (rc) + return rc; - ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0); + id = dax_read_lock(); + rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); + if (rc < 0) { + dax_read_unlock(id); + return rc; + } + dax_read_unlock(id); + + ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); if (IS_ERR(ret)) return PTR_ERR(ret); *entryp = ret; - return vm_insert_mixed(vma, vaddr, dax.pfn); + return vm_insert_mixed(vma, vaddr, pfn); } /** @@ -950,24 +956,34 @@ static bool dax_range_is_aligned(struct block_device *bdev, return true; } -int __dax_zero_page_range(struct block_device *bdev, sector_t sector, - unsigned int offset, unsigned int length) +int __dax_zero_page_range(struct block_device *bdev, + struct dax_device *dax_dev, sector_t sector, 
+ unsigned int offset, unsigned int size) { - struct blk_dax_ctl dax = { - .sector = sector, - .size = PAGE_SIZE, - }; - - if (dax_range_is_aligned(bdev, offset, length)) { - sector_t start_sector = dax.sector + (offset >> 9); + if (dax_range_is_aligned(bdev, offset, size)) { + sector_t start_sector = sector + (offset >> 9); return blkdev_issue_zeroout(bdev, start_sector, - length >> 9, GFP_NOFS, true); + size >> 9, GFP_NOFS, true); } else { - if (dax_map_atomic(bdev, &dax) < 0) - return PTR_ERR(dax.addr); - clear_pmem(dax.addr + offset, length); - dax_unmap_atomic(bdev, &dax); + pgoff_t pgoff; + long rc, id; + void *kaddr; + pfn_t pfn; + + rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); + if (rc) + return rc; + + id = dax_read_lock(); + rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, + &pfn); + if (rc < 0) { + dax_read_unlock(id); + return rc; + } + clear_pmem(kaddr + offset, size); + dax_read_unlock(id); } return 0; } @@ -982,9 +998,12 @@ static loff_t dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap) { + struct block_device *bdev = iomap->bdev; + struct dax_device *dax_dev = iomap->dax_dev; struct iov_iter *iter = data; loff_t end = pos + length, done = 0; ssize_t ret = 0; + int id; if (iov_iter_rw(iter) == READ) { end = min(end, i_size_read(inode)); @@ -1009,34 +1028,42 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, (end - 1) >> PAGE_SHIFT); } + id = dax_read_lock(); while (pos < end) { unsigned offset = pos & (PAGE_SIZE - 1); - struct blk_dax_ctl dax = { 0 }; + const size_t size = ALIGN(length + offset, PAGE_SIZE); + const sector_t sector = dax_iomap_sector(iomap, pos); ssize_t map_len; + pgoff_t pgoff; + void *kaddr; + pfn_t pfn; if (fatal_signal_pending(current)) { ret = -EINTR; break; } - dax.sector = dax_iomap_sector(iomap, pos); - dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; - map_len = dax_map_atomic(iomap->bdev, &dax); + ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); + if (ret) + break; + + map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), + &kaddr, &pfn); if (map_len < 0) { ret = map_len; break; } - dax.addr += offset; + map_len = PFN_PHYS(map_len); + kaddr += offset; map_len -= offset; if (map_len > end - pos) map_len = end - pos; if (iov_iter_rw(iter) == WRITE) - map_len = copy_from_iter_pmem(dax.addr, map_len, iter); + map_len = copy_from_iter_pmem(kaddr, map_len, iter); else - map_len = copy_to_iter(dax.addr, map_len, iter); - dax_unmap_atomic(iomap->bdev, &dax); + map_len = copy_to_iter(kaddr, map_len, iter); if (map_len <= 0) { ret = map_len ? map_len : -EFAULT; break; @@ -1046,6 +1073,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, length -= map_len; done += map_len; } + dax_read_unlock(id); return done ? 
done : ret; } @@ -1152,8 +1180,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, clear_user_highpage(vmf->cow_page, vaddr); break; case IOMAP_MAPPED: - error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE, - vmf->cow_page, vaddr); + error = copy_user_dax(iomap.bdev, iomap.dax_dev, + sector, PAGE_SIZE, vmf->cow_page, vaddr); break; default: WARN_ON_ONCE(1); @@ -1178,8 +1206,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } - error = dax_insert_mapping(mapping, iomap.bdev, sector, - PAGE_SIZE, &entry, vmf->vma, vmf); + error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, + sector, PAGE_SIZE, &entry, vmf->vma, vmf); /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; @@ -1229,41 +1257,48 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, loff_t pos, void **entryp) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; + const sector_t sector = dax_iomap_sector(iomap, pos); + struct dax_device *dax_dev = iomap->dax_dev; struct block_device *bdev = iomap->bdev; struct inode *inode = mapping->host; - struct blk_dax_ctl dax = { - .sector = dax_iomap_sector(iomap, pos), - .size = PMD_SIZE, - }; - long length = dax_map_atomic(bdev, &dax); - void *ret = NULL; - - if (length < 0) /* dax_map_atomic() failed */ + const size_t size = PMD_SIZE; + void *ret = NULL, *kaddr; + long length = 0; + pgoff_t pgoff; + pfn_t pfn; + int id; + + if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) goto fallback; - if (length < PMD_SIZE) - goto unmap_fallback; - if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) - goto unmap_fallback; - if (!pfn_t_devmap(dax.pfn)) - goto unmap_fallback; - - dax_unmap_atomic(bdev, &dax); - ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector, + id = dax_read_lock(); + length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); + if (length < 0) + goto unlock_fallback; + length = PFN_PHYS(length); + + if (length < size) + goto unlock_fallback; + if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR) + goto unlock_fallback; + if (!pfn_t_devmap(pfn)) + goto unlock_fallback; + dax_read_unlock(id); + + ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector, RADIX_DAX_PMD); if (IS_ERR(ret)) goto fallback; *entryp = ret; - trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret); + trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, - dax.pfn, vmf->flags & FAULT_FLAG_WRITE); + pfn, vmf->flags & FAULT_FLAG_WRITE); - unmap_fallback: - dax_unmap_atomic(bdev, &dax); +unlock_fallback: + dax_read_unlock(id); fallback: - trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, - dax.pfn, ret); + trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret); return VM_FAULT_FALLBACK; } diff --git a/fs/iomap.c b/fs/iomap.c index 141c3cd55a8b..4ec0d6ac8bc1 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -360,7 +360,8 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, sector_t sector = iomap->blkno + (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9); - return __dax_zero_page_range(iomap->bdev, sector, offset, bytes); + return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector, + offset, bytes); } static loff_t diff --git a/include/linux/dax.h b/include/linux/dax.h index 0d0d890f9186..d3158e74a59e 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -70,11 +70,13 @@ void 
dax_wake_mapping_entry_waiter(struct address_space *mapping, pgoff_t index, void *entry, bool wake_all); #ifdef CONFIG_FS_DAX -int __dax_zero_page_range(struct block_device *bdev, sector_t sector, +int __dax_zero_page_range(struct block_device *bdev, + struct dax_device *dax_dev, sector_t sector, unsigned int offset, unsigned int length); #else static inline int __dax_zero_page_range(struct block_device *bdev, - sector_t sector, unsigned int offset, unsigned int length) + struct dax_device *dax_dev, sector_t sector, + unsigned int offset, unsigned int length) { return -ENXIO; } -- cgit v1.2.3 From d4b29fd78ea6fc2be219be3af1a992149b4ff0f6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 27 Jan 2017 17:22:03 -0800 Subject: block: remove block_device_operations ->direct_access() Now that all the producers and consumers of dax interfaces have been converted to using dax_operations on a dax_device, remove the block device direct_access enabling. Signed-off-by: Dan Williams --- arch/powerpc/sysdev/axonram.c | 23 ++++------------------ drivers/block/brd.c | 15 --------------- drivers/md/dm.c | 13 ------------- drivers/nvdimm/pmem.c | 10 ---------- drivers/s390/block/dcssblk.c | 16 --------------- fs/block_dev.c | 45 ------------------------------------------- include/linux/blkdev.h | 17 ---------------- 7 files changed, 4 insertions(+), 135 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index 171ba86a3494..a7fe5fee744f 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -139,6 +139,10 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio) return BLK_QC_T_NONE; } +static const struct block_device_operations axon_ram_devops = { + .owner = THIS_MODULE, +}; + static long __axon_ram_direct_access(struct axon_ram_bank *bank, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) @@ -150,25 +154,6 @@ __axon_ram_direct_access(struct axon_ram_bank *bank, pgoff_t pgoff, long nr_page return (bank->size - offset) / PAGE_SIZE; } -/** - * axon_ram_direct_access - direct_access() method for block device - * @device, @sector, @data: see block_device_operations method - */ -static long -axon_ram_blk_direct_access(struct block_device *device, sector_t sector, - void **kaddr, pfn_t *pfn, long size) -{ - struct axon_ram_bank *bank = device->bd_disk->private_data; - - return __axon_ram_direct_access(bank, (sector * 512) / PAGE_SIZE, - size / PAGE_SIZE, kaddr, pfn) * PAGE_SIZE; -} - -static const struct block_device_operations axon_ram_devops = { - .owner = THIS_MODULE, - .direct_access = axon_ram_blk_direct_access -}; - static long axon_ram_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 60f3193c9ce2..bfa4ed2c75ef 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -395,18 +395,6 @@ static long __brd_direct_access(struct brd_device *brd, pgoff_t pgoff, return 1; } -static long brd_blk_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, pfn_t *pfn, long size) -{ - struct brd_device *brd = bdev->bd_disk->private_data; - long nr_pages = __brd_direct_access(brd, PHYS_PFN(sector * 512), - PHYS_PFN(size), kaddr, pfn); - - if (nr_pages < 0) - return nr_pages; - return nr_pages * PAGE_SIZE; -} - static long brd_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) { @@ -418,14 +406,11 @@ static long 
brd_dax_direct_access(struct dax_device *dax_dev, static const struct dax_operations brd_dax_ops = { .direct_access = brd_dax_direct_access, }; -#else -#define brd_blk_direct_access NULL #endif static const struct block_device_operations brd_fops = { .owner = THIS_MODULE, .rw_page = brd_rw_page, - .direct_access = brd_blk_direct_access, }; /* diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ef4c6f8cad47..79d5f5fd823e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -957,18 +957,6 @@ static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, return ret; } -static long dm_blk_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, pfn_t *pfn, long size) -{ - struct mapped_device *md = bdev->bd_disk->private_data; - struct dax_device *dax_dev = md->dax_dev; - long nr_pages = size / PAGE_SIZE; - - nr_pages = dm_dax_direct_access(dax_dev, sector / PAGE_SECTORS, - nr_pages, kaddr, pfn); - return nr_pages < 0 ? nr_pages : nr_pages * PAGE_SIZE; -} - /* * A target may call dm_accept_partial_bio only from the map routine. It is * allowed for all bio types except REQ_PREFLUSH. @@ -2823,7 +2811,6 @@ static const struct block_device_operations dm_blk_dops = { .open = dm_blk_open, .release = dm_blk_close, .ioctl = dm_blk_ioctl, - .direct_access = dm_blk_direct_access, .getgeo = dm_blk_getgeo, .pr_ops = &dm_pr_ops, .owner = THIS_MODULE diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index fbbcf8154eec..85b85633d674 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -220,19 +220,9 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, return PHYS_PFN(pmem->size - pmem->pfn_pad - offset); } -static long pmem_blk_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, pfn_t *pfn, long size) -{ - struct pmem_device *pmem = bdev->bd_queue->queuedata; - - return __pmem_direct_access(pmem, PHYS_PFN(sector * 512), - PHYS_PFN(size), kaddr, pfn); -} - static const struct block_device_operations pmem_fops = { .owner = THIS_MODULE, .rw_page = pmem_rw_page, - .direct_access = pmem_blk_direct_access, .revalidate_disk = nvdimm_revalidate_disk, }; diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index dc84cfd4e438..36e5280af3e4 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -31,8 +31,6 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode); static void dcssblk_release(struct gendisk *disk, fmode_t mode); static blk_qc_t dcssblk_make_request(struct request_queue *q, struct bio *bio); -static long dcssblk_blk_direct_access(struct block_device *bdev, sector_t secnum, - void **kaddr, pfn_t *pfn, long size); static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn); @@ -43,7 +41,6 @@ static const struct block_device_operations dcssblk_devops = { .owner = THIS_MODULE, .open = dcssblk_open, .release = dcssblk_release, - .direct_access = dcssblk_blk_direct_access, }; static const struct dax_operations dcssblk_dax_ops = { @@ -915,19 +912,6 @@ __dcssblk_direct_access(struct dcssblk_dev_info *dev_info, pgoff_t pgoff, return (dev_sz - offset) / PAGE_SIZE; } -static long -dcssblk_blk_direct_access(struct block_device *bdev, sector_t secnum, - void **kaddr, pfn_t *pfn, long size) -{ - struct dcssblk_dev_info *dev_info; - - dev_info = bdev->bd_disk->private_data; - if (!dev_info) - return -ENODEV; - return __dcssblk_direct_access(dev_info, PHYS_PFN(secnum * 512), - PHYS_PFN(size), kaddr, pfn) * 
PAGE_SIZE; -} - static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) diff --git a/fs/block_dev.c b/fs/block_dev.c index ecbdc8f9f718..10e21465d5a9 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -718,51 +718,6 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(bdev_write_page); -/** - * bdev_direct_access() - Get the address for directly-accessibly memory - * @bdev: The device containing the memory - * @dax: control and output parameters for ->direct_access - * - * If a block device is made up of directly addressable memory, this function - * will tell the caller the PFN and the address of the memory. The address - * may be directly dereferenced within the kernel without the need to call - * ioremap(), kmap() or similar. The PFN is suitable for inserting into - * page tables. - * - * Return: negative errno if an error occurs, otherwise the number of bytes - * accessible at this address. - */ -long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) -{ - sector_t sector = dax->sector; - long avail, size = dax->size; - const struct block_device_operations *ops = bdev->bd_disk->fops; - - /* - * The device driver is allowed to sleep, in order to make the - * memory directly accessible. - */ - might_sleep(); - - if (size < 0) - return size; - if (!blk_queue_dax(bdev_get_queue(bdev)) || !ops->direct_access) - return -EOPNOTSUPP; - if ((sector + DIV_ROUND_UP(size, 512)) > - part_nr_sects_read(bdev->bd_part)) - return -ERANGE; - sector += get_start_sect(bdev); - if (sector % (PAGE_SIZE / 512)) - return -EINVAL; - avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size); - if (!avail) - return -ERANGE; - if (avail > 0 && avail & ~PAGE_MASK) - return -ENXIO; - return min(avail, size); -} -EXPORT_SYMBOL_GPL(bdev_direct_access); - int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, pgoff_t *pgoff) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 612c497d1461..848f87eb1905 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1916,28 +1916,12 @@ static inline bool integrity_req_gap_front_merge(struct request *req, #endif /* CONFIG_BLK_DEV_INTEGRITY */ -/** - * struct blk_dax_ctl - control and output parameters for ->direct_access - * @sector: (input) offset relative to a block_device - * @addr: (output) kernel virtual address for @sector populated by driver - * @pfn: (output) page frame number for @addr populated by driver - * @size: (input) number of bytes requested - */ -struct blk_dax_ctl { - sector_t sector; - void *addr; - long size; - pfn_t pfn; -}; - struct block_device_operations { int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); int (*rw_page)(struct block_device *, sector_t, struct page *, bool); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); - long (*direct_access)(struct block_device *, sector_t, void **, pfn_t *, - long); unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); /* ->media_changed() is DEPRECATED, use ->check_events() instead */ @@ -1956,7 +1940,6 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); -extern long 
bdev_direct_access(struct block_device *, struct blk_dax_ctl *); extern int bdev_dax_supported(struct super_block *, int); int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); #else /* CONFIG_BLOCK */ -- cgit v1.2.3 From 6abccd1bfee49e491095772fd5aa9e96d915ae52 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 13 Jan 2017 14:14:23 -0800 Subject: x86, dax, pmem: remove indirection around memcpy_from_pmem() memcpy_from_pmem() maps directly to memcpy_mcsafe(). The wrapper provides no real benefit aside from affording a more generic function name than the x86-specific 'mcsafe'. However this would not be the first time that x86 terminology leaked into the global namespace. For lack of a better name, just use memcpy_mcsafe() directly. This conversion also catches a place where we should have been using plain memcpy, acpi_nfit_blk_single_io(). Cc: Jan Kara Cc: Jeff Moyer Cc: Ingo Molnar Cc: Christoph Hellwig Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Matthew Wilcox Cc: Ross Zwisler Acked-by: Tony Luck Signed-off-by: Dan Williams --- arch/x86/include/asm/pmem.h | 5 ----- arch/x86/include/asm/string_64.h | 1 + drivers/acpi/nfit/core.c | 3 +-- drivers/nvdimm/claim.c | 2 +- drivers/nvdimm/pmem.c | 2 +- include/linux/pmem.h | 23 ----------------------- include/linux/string.h | 8 ++++++++ 7 files changed, 12 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index 529bb4a6487a..d5a22bac9988 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -44,11 +44,6 @@ static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n) BUG(); } -static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n) -{ - return memcpy_mcsafe(dst, src, n); -} - /** * arch_wb_cache_pmem - write back a cache range with CLWB * @vaddr: virtual start address diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index a164862d77e3..733bae07fb29 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -79,6 +79,7 @@ int strcmp(const char *cs, const char *ct); #define memset(s, c, n) __memset(s, c, n) #endif +#define __HAVE_ARCH_MEMCPY_MCSAFE 1 __must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt); DECLARE_STATIC_KEY_FALSE(mcsafe_key); diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index c8ea9d698cd0..d0c07b2344e4 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -1783,8 +1783,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk, mmio_flush_range((void __force *) mmio->addr.aperture + offset, c); - memcpy_from_pmem(iobuf + copied, - mmio->addr.aperture + offset, c); + memcpy(iobuf + copied, mmio->addr.aperture + offset, c); } copied += c; diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c index ca6d572c48fc..3a35e8028b9c 100644 --- a/drivers/nvdimm/claim.c +++ b/drivers/nvdimm/claim.c @@ -239,7 +239,7 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns, if (rw == READ) { if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) return -EIO; - return memcpy_from_pmem(buf, nsio->addr + offset, size); + return memcpy_mcsafe(buf, nsio->addr + offset, size); } if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) { diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 85b85633d674..3b3dab73d741 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -89,7 +89,7 @@ static int read_pmem(struct page *page,
unsigned int off, int rc; void *mem = kmap_atomic(page); - rc = memcpy_from_pmem(mem + off, pmem_addr, len); + rc = memcpy_mcsafe(mem + off, pmem_addr, len); kunmap_atomic(mem); if (rc) return -EIO; diff --git a/include/linux/pmem.h b/include/linux/pmem.h index e856c2cb0fe8..71ecf3d46aac 100644 --- a/include/linux/pmem.h +++ b/include/linux/pmem.h @@ -31,12 +31,6 @@ static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n) BUG(); } -static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n) -{ - BUG(); - return -EFAULT; -} - static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes, struct iov_iter *i) { @@ -65,23 +59,6 @@ static inline bool arch_has_pmem_api(void) return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API); } -/* - * memcpy_from_pmem - read from persistent memory with error handling - * @dst: destination buffer - * @src: source buffer - * @size: transfer length - * - * Returns 0 on success negative error code on failure. - */ -static inline int memcpy_from_pmem(void *dst, void const *src, size_t size) -{ - if (arch_has_pmem_api()) - return arch_memcpy_from_pmem(dst, src, size); - else - memcpy(dst, src, size); - return 0; -} - /** * memcpy_to_pmem - copy data to persistent memory * @dst: destination buffer for the copy diff --git a/include/linux/string.h b/include/linux/string.h index 26b6f6a66f83..9d6f189157e2 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -114,6 +114,14 @@ extern int memcmp(const void *,const void *,__kernel_size_t); #ifndef __HAVE_ARCH_MEMCHR extern void * memchr(const void *,int,__kernel_size_t); #endif +#ifndef __HAVE_ARCH_MEMCPY_MCSAFE +static inline __must_check int memcpy_mcsafe(void *dst, const void *src, + size_t cnt) +{ + memcpy(dst, src, cnt); + return 0; +} +#endif void *memchr_inv(const void *s, int c, size_t n); char *strreplace(char *s, char old, char new); -- cgit v1.2.3 From 7acedaf5c4355f812cfef883ac28bf15f7d9205e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Apr 2017 11:36:52 -0700 Subject: net: move xdp_prog field in RX cache lines (struct net_device, xdp_prog) field should be moved in RX cache lines, reducing latencies when a single packet is received on idle host, since netif_elide_gro() needs it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 46d220c2bf92..8c5c8cdc7b97 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1790,6 +1790,7 @@ struct net_device { unsigned int real_num_rx_queues; #endif + struct bpf_prog __rcu *xdp_prog; unsigned long gro_flush_timeout; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; @@ -1905,7 +1906,6 @@ struct net_device { struct lock_class_key *qdisc_tx_busylock; struct lock_class_key *qdisc_running_key; bool proto_down; - struct bpf_prog __rcu *xdp_prog; }; #define to_net_dev(d) container_of(d, struct net_device, dev) -- cgit v1.2.3 From 51f567777799c9d85a778302b9eb61cf15214a98 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 6 Apr 2017 22:36:31 -0400 Subject: nfsd: check for oversized NFSv2/v3 arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A client can append random data to the end of an NFSv2 or NFSv3 RPC call without our complaining; we'll just stop parsing at the end of the expected data and ignore the rest. 
Encoded arguments and replies are stored together in an array of pages, and if a call is too large it could leave inadequate space for the reply. This is normally OK because NFS RPC's typically have either short arguments and long replies (like READ) or long arguments and short replies (like WRITE). But a client that sends an incorrectly long argument can violate those assumptions. This was observed to cause crashes. So, insist that the argument not be any longer than we expect. Also, several operations increment rq_next_page in the decode routine before checking the argument size, which can leave rq_next_page pointing well past the end of the page array, causing trouble later in svc_free_pages. As a follow-up, we may also want to rewrite the encoding routines to check more carefully that they aren't running off the end of the page array. Reported-by: Tuomas Haanpää Reported-by: Ari Kauppi Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs3xdr.c | 23 +++++++++++++++++------ fs/nfsd/nfsxdr.c | 13 ++++++++++--- include/linux/sunrpc/svc.h | 3 +-- 3 files changed, 28 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 452334694a5d..12feac6ee2fd 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -334,8 +334,11 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, if (!p) return 0; p = xdr_decode_hyper(p, &args->offset); - args->count = ntohl(*p++); + + if (!xdr_argsize_check(rqstp, p)) + return 0; + len = min(args->count, max_blocksize); /* set up the kvec */ @@ -349,7 +352,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, v++; } args->vlen = v; - return xdr_argsize_check(rqstp, p); + return 1; } int @@ -541,9 +544,11 @@ nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, p = decode_fh(p, &args->fh); if (!p) return 0; + if (!xdr_argsize_check(rqstp, p)) + return 0; args->buffer = page_address(*(rqstp->rq_next_page++)); - return xdr_argsize_check(rqstp, p); + return 1; } int @@ -569,10 +574,14 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, args->verf = p; p += 2; args->dircount = ~0; args->count = ntohl(*p++); + + if (!xdr_argsize_check(rqstp, p)) + return 0; + args->count = min_t(u32, args->count, PAGE_SIZE); args->buffer = page_address(*(rqstp->rq_next_page++)); - return xdr_argsize_check(rqstp, p); + return 1; } int @@ -590,6 +599,9 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p, args->dircount = ntohl(*p++); args->count = ntohl(*p++); + if (!xdr_argsize_check(rqstp, p)) + return 0; + len = args->count = min(args->count, max_blocksize); while (len > 0) { struct page *p = *(rqstp->rq_next_page++); @@ -597,8 +609,7 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p, args->buffer = page_address(p); len -= PAGE_SIZE; } - - return xdr_argsize_check(rqstp, p); + return 1; } int diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index de07ff625777..6a4947a3f4fa 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -257,6 +257,9 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, len = args->count = ntohl(*p++); p++; /* totalcount - unused */ + if (!xdr_argsize_check(rqstp, p)) + return 0; + len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2); /* set up somewhere to store response.
@@ -272,7 +275,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, v++; } args->vlen = v; - return xdr_argsize_check(rqstp, p); + return 1; } int @@ -362,9 +365,11 @@ nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readli p = decode_fh(p, &args->fh); if (!p) return 0; + if (!xdr_argsize_check(rqstp, p)) + return 0; args->buffer = page_address(*(rqstp->rq_next_page++)); - return xdr_argsize_check(rqstp, p); + return 1; } int @@ -402,9 +407,11 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, args->cookie = ntohl(*p++); args->count = ntohl(*p++); args->count = min_t(u32, args->count, PAGE_SIZE); + if (!xdr_argsize_check(rqstp, p)) + return 0; args->buffer = page_address(*(rqstp->rq_next_page++)); - return xdr_argsize_check(rqstp, p); + return 1; } /* diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index e770abeed32d..6ef19cf658b4 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -336,8 +336,7 @@ xdr_argsize_check(struct svc_rqst *rqstp, __be32 *p) { char *cp = (char *)p; struct kvec *vec = &rqstp->rq_arg.head[0]; - return cp >= (char*)vec->iov_base - && cp <= (char*)vec->iov_base + vec->iov_len; + return cp == (char *)vec->iov_base + vec->iov_len; } static inline int -- cgit v1.2.3 From 17f5f7f506aaca985b95df7ef7fc2ff49c36a8e9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:05:36 -0400 Subject: svcrdma: Move send_wr to svc_rdma_op_ctxt Clean up: Move the ib_send_wr off the stack, and move common code to post a Send Work Request into a helper. This is a refactoring change only. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 4 ++ net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 11 +---- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 64 ++++++++++++++++++------------ 3 files changed, 44 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index b105f73e3ca2..287db5c179d8 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -85,6 +85,7 @@ struct svc_rdma_op_ctxt { enum dma_data_direction direction; int count; unsigned int mapped_sges; + struct ib_send_wr send_wr; struct ib_sge sge[RPCSVC_MAXPAGES]; struct page *pages[RPCSVC_MAXPAGES]; }; @@ -227,6 +228,9 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, /* svc_rdma_sendto.c */ extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, struct svc_rdma_req_map *, bool); +extern int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + int num_sge, u32 inv_rkey); extern int svc_rdma_sendto(struct svc_rqst *); extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, int); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index ff1df40f0d26..f12f39c189c3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -104,7 +104,6 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, struct xdr_buf *sndbuf = &rqst->rq_snd_buf; struct svc_rdma_op_ctxt *ctxt; struct svc_rdma_req_map *vec; - struct ib_send_wr send_wr; int ret; vec = svc_rdma_get_req_map(rdma); @@ -132,15 +131,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, } svc_rdma_count_mappings(rdma, ctxt); - memset(&send_wr, 0, sizeof(send_wr)); - ctxt->cqe.done = svc_rdma_wc_send; - send_wr.wr_cqe = &ctxt->cqe; - send_wr.sg_list 
= ctxt->sge; - send_wr.num_sge = 1; - send_wr.opcode = IB_WR_SEND; - send_wr.send_flags = IB_SEND_SIGNALED; - - ret = svc_rdma_send(rdma, &send_wr); + ret = svc_rdma_post_send_wr(rdma, ctxt, 1, 0); if (ret) { ret = -EIO; goto out_unmap; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 515221b16d09..f90b40d0932f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -435,6 +435,43 @@ out_err: return -EIO; } +/** + * svc_rdma_post_send_wr - Set up and post one Send Work Request + * @rdma: controlling transport + * @ctxt: op_ctxt for transmitting the Send WR + * @num_sge: number of SGEs to send + * @inv_rkey: R_key argument to Send With Invalidate, or zero + * + * Returns: + * %0 if the Send* was posted successfully, + * %-ENOTCONN if the connection was lost or dropped, + * %-EINVAL if there was a problem with the Send we built, + * %-ENOMEM if ib_post_send failed. + */ +int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, int num_sge, + u32 inv_rkey) +{ + struct ib_send_wr *send_wr = &ctxt->send_wr; + + dprintk("svcrdma: posting Send WR with %u sge(s)\n", num_sge); + + send_wr->next = NULL; + ctxt->cqe.done = svc_rdma_wc_send; + send_wr->wr_cqe = &ctxt->cqe; + send_wr->sg_list = ctxt->sge; + send_wr->num_sge = num_sge; + send_wr->send_flags = IB_SEND_SIGNALED; + if (inv_rkey) { + send_wr->opcode = IB_WR_SEND_WITH_INV; + send_wr->ex.invalidate_rkey = inv_rkey; + } else { + send_wr->opcode = IB_WR_SEND; + } + + return svc_rdma_send(rdma, send_wr); +} + /* This function prepares the portion of the RPCRDMA message to be * sent in the RDMA_SEND. This function is called after data sent via * RDMA has already been transmitted. There are three cases: @@ -460,7 +497,6 @@ static int send_reply(struct svcxprt_rdma *rdma, u32 inv_rkey) { struct svc_rdma_op_ctxt *ctxt; - struct ib_send_wr send_wr; u32 xdr_off; int sge_no; int sge_bytes; @@ -524,19 +560,8 @@ static int send_reply(struct svcxprt_rdma *rdma, pr_err("svcrdma: Too many sges (%d)\n", sge_no); goto err; } - memset(&send_wr, 0, sizeof send_wr); - ctxt->cqe.done = svc_rdma_wc_send; - send_wr.wr_cqe = &ctxt->cqe; - send_wr.sg_list = ctxt->sge; - send_wr.num_sge = sge_no; - if (inv_rkey) { - send_wr.opcode = IB_WR_SEND_WITH_INV; - send_wr.ex.invalidate_rkey = inv_rkey; - } else - send_wr.opcode = IB_WR_SEND; - send_wr.send_flags = IB_SEND_SIGNALED; - ret = svc_rdma_send(rdma, &send_wr); + ret = svc_rdma_post_send_wr(rdma, ctxt, sge_no, inv_rkey); if (ret) goto err; @@ -652,7 +677,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, int status) { - struct ib_send_wr err_wr; struct page *p; struct svc_rdma_op_ctxt *ctxt; enum rpcrdma_errcode err; @@ -692,17 +716,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, } svc_rdma_count_mappings(xprt, ctxt); - /* Prepare SEND WR */ - memset(&err_wr, 0, sizeof(err_wr)); - ctxt->cqe.done = svc_rdma_wc_send; - err_wr.wr_cqe = &ctxt->cqe; - err_wr.sg_list = ctxt->sge; - err_wr.num_sge = 1; - err_wr.opcode = IB_WR_SEND; - err_wr.send_flags = IB_SEND_SIGNALED; - - /* Post It */ - ret = svc_rdma_send(xprt, &err_wr); + ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0); if (ret) { dprintk("svcrdma: Error %d posting send for protocol error\n", ret); -- cgit v1.2.3 From 6e6092ca305ad785c605d7e313727aad96c228a5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:05:44 -0400 
Subject: svcrdma: Add svc_rdma_map_reply_hdr() Introduce a helper to DMA-map a reply's transport header before sending it. This will in part replace the map vector cache. Signed-off-by: Chuck Lever Reviewed-by: Christoph Hellwig Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 3 ++ net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 36 ++++++------------ net/sunrpc/xprtrdma/svc_rdma_sendto.c | 61 +++++++++++++++++++++++------- 3 files changed, 62 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 287db5c179d8..002a46d1faa1 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -228,6 +228,9 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, /* svc_rdma_sendto.c */ extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, struct svc_rdma_req_map *, bool); +extern int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + __be32 *rdma_resp, unsigned int len); extern int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, struct svc_rdma_op_ctxt *ctxt, int num_sge, u32 inv_rkey); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index f12f39c189c3..0305b33d482f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -101,50 +101,36 @@ out_notfound: static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) { - struct xdr_buf *sndbuf = &rqst->rq_snd_buf; struct svc_rdma_op_ctxt *ctxt; - struct svc_rdma_req_map *vec; int ret; - vec = svc_rdma_get_req_map(rdma); - ret = svc_rdma_map_xdr(rdma, sndbuf, vec, false); - if (ret) + ctxt = svc_rdma_get_context(rdma); + + /* rpcrdma_bc_send_request builds the transport header and + * the backchannel RPC message in the same buffer. Thus only + * one SGE is needed to send both. 
+ */ + ret = svc_rdma_map_reply_hdr(rdma, ctxt, rqst->rq_buffer, + rqst->rq_snd_buf.len); + if (ret < 0) goto out_err; ret = svc_rdma_repost_recv(rdma, GFP_NOIO); if (ret) goto out_err; - ctxt = svc_rdma_get_context(rdma); - ctxt->pages[0] = virt_to_page(rqst->rq_buffer); - ctxt->count = 1; - - ctxt->direction = DMA_TO_DEVICE; - ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; - ctxt->sge[0].length = sndbuf->len; - ctxt->sge[0].addr = - ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0, - sndbuf->len, DMA_TO_DEVICE); - if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) { - ret = -EIO; - goto out_unmap; - } - svc_rdma_count_mappings(rdma, ctxt); - ret = svc_rdma_post_send_wr(rdma, ctxt, 1, 0); - if (ret) { - ret = -EIO; + if (ret) goto out_unmap; - } out_err: - svc_rdma_put_req_map(rdma, vec); dprintk("svcrdma: %s returns %d\n", __func__, ret); return ret; out_unmap: svc_rdma_unmap_dma(ctxt); svc_rdma_put_context(ctxt, 1); + ret = -EIO; goto out_err; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index f90b40d0932f..a7dc71daa776 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -217,6 +217,49 @@ static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp, return 0; } +static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + unsigned int sge_no, + struct page *page, + unsigned int offset, + unsigned int len) +{ + struct ib_device *dev = rdma->sc_cm_id->device; + dma_addr_t dma_addr; + + dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(dev, dma_addr)) + return -EIO; + + ctxt->sge[sge_no].addr = dma_addr; + ctxt->sge[sge_no].length = len; + ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; + svc_rdma_count_mappings(rdma, ctxt); + return 0; +} + +/** + * svc_rdma_map_reply_hdr - DMA map the transport header buffer + * @rdma: controlling transport + * @ctxt: op_ctxt for the Send WR + * @rdma_resp: buffer containing transport header + * @len: length of transport header + * + * Returns: + * %0 if the header is DMA mapped, + * %-EIO if DMA mapping failed. 
+ */ +int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + __be32 *rdma_resp, + unsigned int len) +{ + ctxt->direction = DMA_TO_DEVICE; + ctxt->pages[0] = virt_to_page(rdma_resp); + ctxt->count = 1; + return svc_rdma_dma_map_page(rdma, ctxt, 0, ctxt->pages[0], 0, len); +} + /* Assumptions: * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE */ @@ -699,22 +742,14 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, err = ERR_VERS; length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); + /* Map transport header; no RPC message payload */ ctxt = svc_rdma_get_context(xprt); - ctxt->direction = DMA_TO_DEVICE; - ctxt->count = 1; - ctxt->pages[0] = p; - - /* Prepare SGE for local address */ - ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey; - ctxt->sge[0].length = length; - ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device, - p, 0, length, DMA_TO_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) { - dprintk("svcrdma: Error mapping buffer for protocol error\n"); - svc_rdma_put_context(ctxt, 1); + ret = svc_rdma_map_reply_hdr(xprt, ctxt, &rmsgp->rm_xid, length); + if (ret) { + dprintk("svcrdma: Error %d mapping send for protocol error\n", + ret); return; } - svc_rdma_count_mappings(xprt, ctxt); ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0); if (ret) { -- cgit v1.2.3 From b623589dbacbc786c2fffc85113a1dc1a331e2ca Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:05:52 -0400 Subject: svcrdma: Eliminate RPCRDMA_SQ_DEPTH_MULT The Send Queue depth is temporarily reduced to 1 SQE per credit. The new rdma_rw API does an internal computation, during QP creation, to increase the depth of the Send Queue to handle RDMA Read and Write operations. This change has to come before the NFSD code paths are updated to use the rdma_rw API. Without this patch, rdma_rw_init_qp() increases the size of the SQ too much, resulting in memory allocation failures during QP creation. Signed-off-by: Chuck Lever Reviewed-by: Christoph Hellwig Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 - net/sunrpc/xprtrdma/svc_rdma.c | 2 -- net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 +- 3 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 002a46d1faa1..11d5aa123f17 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -182,7 +182,6 @@ struct svcxprt_rdma { /* The default ORD value is based on two outstanding full-size writes with a * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ. 
*/ #define RPCRDMA_ORD (64/4) -#define RPCRDMA_SQ_DEPTH_MULT 8 #define RPCRDMA_MAX_REQUESTS 32 #define RPCRDMA_MAX_REQ_SIZE 4096 diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index c846ca9f1eba..912444174647 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -247,8 +247,6 @@ int svc_rdma_init(void) dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); dprintk("\tmax_requests : %u\n", svcrdma_max_requests); - dprintk("\tsq_depth : %u\n", - svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT); dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index fc8f14c7bfec..e1097cc6d1eb 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1014,7 +1014,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) svcrdma_max_bc_requests); newxprt->sc_rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests; - newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth; + newxprt->sc_sq_depth = newxprt->sc_rq_depth; atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); if (!svc_rdma_prealloc_ctxts(newxprt)) -- cgit v1.2.3 From f13193f50b64e2e0c87706b838d6b9895626a892 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:06:16 -0400 Subject: svcrdma: Introduce local rdma_rw API helpers The plan is to replace the local bespoke code that constructs and posts RDMA Read and Write Work Requests with calls to the rdma_rw API. This shares code with other RDMA-enabled ULPs, and the rdma_rw core manages the gory details of buffer registration and posting Work Requests. Some design notes: o The structure of RPC-over-RDMA transport headers is flexible, allowing multiple segments per Reply with arbitrary alignment, each with a unique R_key. Write and Send WRs continue to be built and posted in separate code paths. However, one whole chunk (with one or more RDMA segments apiece) gets exactly one ib_post_send and one work completion. o svc_xprt reference counting is modified, since a chain of rdma_rw_ctx structs generates one completion, no matter how many Write WRs are posted. o The current code builds the transport header as it is constructing Write WRs. I've replaced that with marshaling of transport header data items in a separate step. This is because the exact structure of client-provided segments may not align with the components of the server's reply xdr_buf, or the pages in the page list. Thus parts of each client-provided segment may be written at different points in the send path. Signed-off-by: Chuck Lever Signed-off-by: J.
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 11 + net/sunrpc/Kconfig | 1 + net/sunrpc/xprtrdma/Makefile | 2 +- net/sunrpc/xprtrdma/svc_rdma_rw.c | 512 +++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/svc_rdma_transport.c | 4 + 5 files changed, 529 insertions(+), 1 deletion(-) create mode 100644 net/sunrpc/xprtrdma/svc_rdma_rw.c (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 11d5aa123f17..ca08671fb7e2 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -145,12 +145,15 @@ struct svcxprt_rdma { u32 sc_max_requests; /* Max requests */ u32 sc_max_bc_requests;/* Backward credits */ int sc_max_req_size; /* Size of each RQ WR buf */ + u8 sc_port_num; struct ib_pd *sc_pd; spinlock_t sc_ctxt_lock; struct list_head sc_ctxts; int sc_ctxt_used; + spinlock_t sc_rw_ctxt_lock; + struct list_head sc_rw_ctxts; spinlock_t sc_map_lock; struct list_head sc_maps; @@ -224,6 +227,14 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, struct svc_rdma_op_ctxt *, int *, u32 *, u32, u32, u64, bool); +/* svc_rdma_rw.c */ +extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma); +extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, + __be32 *wr_ch, struct xdr_buf *xdr); +extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, + __be32 *rp_ch, bool writelist, + struct xdr_buf *xdr); + /* svc_rdma_sendto.c */ extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, struct svc_rdma_req_map *, bool); diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 04ce2c0b660e..ac09ca803296 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -52,6 +52,7 @@ config SUNRPC_XPRT_RDMA tristate "RPC-over-RDMA transport" depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS default SUNRPC && INFINIBAND + select SG_POOL help This option allows the NFS client and server to use RDMA transports (InfiniBand, iWARP, or RoCE). diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index ef19fa42c50f..c1ae8142ab73 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -4,5 +4,5 @@ rpcrdma-y := transport.o rpc_rdma.o verbs.o \ fmr_ops.o frwr_ops.o \ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ - module.o + svc_rdma_rw.o module.o rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c new file mode 100644 index 000000000000..0cf620277693 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2016 Oracle. All rights reserved. + * + * Use the core R/W API to move RPC-over-RDMA Read and Write chunks. + */ + +#include +#include +#include + +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* Each R/W context contains state for one chain of RDMA Read or + * Write Work Requests. + * + * Each WR chain handles a single contiguous server-side buffer, + * because scatterlist entries after the first have to start on + * page alignment. xdr_buf iovecs cannot guarantee alignment. + * + * Each WR chain handles only one R_key. Each RPC-over-RDMA segment + * from a client may contain a unique R_key, so each WR chain moves + * up to one segment at a time. + * + * The scatterlist makes this data structure over 4KB in size. 
To + * make it less likely to fail, and to handle the allocation for + * smaller I/O requests without disabling bottom-halves, these + * contexts are created on demand, but cached and reused until the + * controlling svcxprt_rdma is destroyed. + */ +struct svc_rdma_rw_ctxt { + struct list_head rw_list; + struct rdma_rw_ctx rw_ctx; + int rw_nents; + struct sg_table rw_sg_table; + struct scatterlist rw_first_sgl[0]; +}; + +static inline struct svc_rdma_rw_ctxt * +svc_rdma_next_ctxt(struct list_head *list) +{ + return list_first_entry_or_null(list, struct svc_rdma_rw_ctxt, + rw_list); +} + +static struct svc_rdma_rw_ctxt * +svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) +{ + struct svc_rdma_rw_ctxt *ctxt; + + spin_lock(&rdma->sc_rw_ctxt_lock); + + ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts); + if (ctxt) { + list_del(&ctxt->rw_list); + spin_unlock(&rdma->sc_rw_ctxt_lock); + } else { + spin_unlock(&rdma->sc_rw_ctxt_lock); + ctxt = kmalloc(sizeof(*ctxt) + + SG_CHUNK_SIZE * sizeof(struct scatterlist), + GFP_KERNEL); + if (!ctxt) + goto out; + INIT_LIST_HEAD(&ctxt->rw_list); + } + + ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl; + if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges, + ctxt->rw_sg_table.sgl)) { + kfree(ctxt); + ctxt = NULL; + } +out: + return ctxt; +} + +static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt) +{ + sg_free_table_chained(&ctxt->rw_sg_table, true); + + spin_lock(&rdma->sc_rw_ctxt_lock); + list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts); + spin_unlock(&rdma->sc_rw_ctxt_lock); +} + +/** + * svc_rdma_destroy_rw_ctxts - Free accumulated R/W contexts + * @rdma: transport about to be destroyed + * + */ +void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_rw_ctxt *ctxt; + + while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) { + list_del(&ctxt->rw_list); + kfree(ctxt); + } +} + +/* A chunk context tracks all I/O for moving one Read or Write + * chunk. This is a set of rdma_rw's that handle data movement + * for all segments of one chunk. + * + * These are small, acquired with a single allocator call, and + * no more than one is needed per chunk. They are allocated on + * demand, and not cached. + */ +struct svc_rdma_chunk_ctxt { + struct ib_cqe cc_cqe; + struct svcxprt_rdma *cc_rdma; + struct list_head cc_rwctxts; + int cc_sqecount; + enum dma_data_direction cc_dir; +}; + +static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc, + enum dma_data_direction dir) +{ + cc->cc_rdma = rdma; + svc_xprt_get(&rdma->sc_xprt); + + INIT_LIST_HEAD(&cc->cc_rwctxts); + cc->cc_sqecount = 0; + cc->cc_dir = dir; +} + +static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc) +{ + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_rw_ctxt *ctxt; + + while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { + list_del(&ctxt->rw_list); + + rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, ctxt->rw_sg_table.sgl, + ctxt->rw_nents, cc->cc_dir); + svc_rdma_put_rw_ctxt(rdma, ctxt); + } + svc_xprt_put(&rdma->sc_xprt); +} + +/* State for sending a Write or Reply chunk.
+ * - Tracks progress of writing one chunk over all its segments + * - Stores arguments for the SGL constructor functions + */ +struct svc_rdma_write_info { + /* write state of this chunk */ + unsigned int wi_seg_off; + unsigned int wi_seg_no; + unsigned int wi_nsegs; + __be32 *wi_segs; + + /* SGL constructor arguments */ + struct xdr_buf *wi_xdr; + unsigned char *wi_base; + unsigned int wi_next_off; + + struct svc_rdma_chunk_ctxt wi_cc; +}; + +static struct svc_rdma_write_info * +svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk) +{ + struct svc_rdma_write_info *info; + + info = kmalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return info; + + info->wi_seg_off = 0; + info->wi_seg_no = 0; + info->wi_nsegs = be32_to_cpup(++chunk); + info->wi_segs = ++chunk; + svc_rdma_cc_init(rdma, &info->wi_cc, DMA_TO_DEVICE); + return info; +} + +static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) +{ + svc_rdma_cc_release(&info->wi_cc); + kfree(info); +} + +/** + * svc_rdma_write_done - Write chunk completion + * @cq: controlling Completion Queue + * @wc: Work Completion + * + * Pages under I/O are freed by a subsequent Send completion. + */ +static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_chunk_ctxt *cc = + container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_write_info *info = + container_of(cc, struct svc_rdma_write_info, wi_cc); + + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wake_up(&rdma->sc_send_wait); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: write ctx: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + } + + svc_rdma_write_info_free(info); +} + +/* This function sleeps when the transport's Send Queue is congested. + * + * Assumptions: + * - If ib_post_send() succeeds, only one completion is expected, + * even if one or more WRs are flushed. This is true when posting + * an rdma_rw_ctx or when posting a single signaled WR. + */ +static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) +{ + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_xprt *xprt = &rdma->sc_xprt; + struct ib_send_wr *first_wr, *bad_wr; + struct list_head *tmp; + struct ib_cqe *cqe; + int ret; + + first_wr = NULL; + cqe = &cc->cc_cqe; + list_for_each(tmp, &cc->cc_rwctxts) { + struct svc_rdma_rw_ctxt *ctxt; + + ctxt = list_entry(tmp, struct svc_rdma_rw_ctxt, rw_list); + first_wr = rdma_rw_ctx_wrs(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, cqe, first_wr); + cqe = NULL; + } + + do { + if (atomic_sub_return(cc->cc_sqecount, + &rdma->sc_sq_avail) > 0) { + ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); + if (ret) + break; + return 0; + } + + atomic_inc(&rdma_stat_sq_starve); + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wait_event(rdma->sc_send_wait, + atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount); + } while (1); + + pr_err("svcrdma: ib_post_send failed (%d)\n", ret); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + + /* If even one was posted, there will be a completion. 
*/ + if (bad_wr != first_wr) + return 0; + + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wake_up(&rdma->sc_send_wait); + return -ENOTCONN; +} + +/* Build and DMA-map an SGL that covers one kvec in an xdr_buf + */ +static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info, + unsigned int len, + struct svc_rdma_rw_ctxt *ctxt) +{ + struct scatterlist *sg = ctxt->rw_sg_table.sgl; + + sg_set_buf(&sg[0], info->wi_base, len); + info->wi_base += len; + + ctxt->rw_nents = 1; +} + +/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist. + */ +static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, + unsigned int remaining, + struct svc_rdma_rw_ctxt *ctxt) +{ + unsigned int sge_no, sge_bytes, page_off, page_no; + struct xdr_buf *xdr = info->wi_xdr; + struct scatterlist *sg; + struct page **page; + + page_off = (info->wi_next_off + xdr->page_base) & ~PAGE_MASK; + page_no = (info->wi_next_off + xdr->page_base) >> PAGE_SHIFT; + page = xdr->pages + page_no; + info->wi_next_off += remaining; + sg = ctxt->rw_sg_table.sgl; + sge_no = 0; + do { + sge_bytes = min_t(unsigned int, remaining, + PAGE_SIZE - page_off); + sg_set_page(sg, *page, sge_bytes, page_off); + + remaining -= sge_bytes; + sg = sg_next(sg); + page_off = 0; + sge_no++; + page++; + } while (remaining); + + ctxt->rw_nents = sge_no; +} + +/* Construct RDMA Write WRs to send a portion of an xdr_buf containing + * an RPC Reply. + */ +static int +svc_rdma_build_writes(struct svc_rdma_write_info *info, + void (*constructor)(struct svc_rdma_write_info *info, + unsigned int len, + struct svc_rdma_rw_ctxt *ctxt), + unsigned int remaining) +{ + struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_rw_ctxt *ctxt; + __be32 *seg; + int ret; + + cc->cc_cqe.done = svc_rdma_write_done; + seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz; + do { + unsigned int write_len; + u32 seg_length, seg_handle; + u64 seg_offset; + + if (info->wi_seg_no >= info->wi_nsegs) + goto out_overflow; + + seg_handle = be32_to_cpup(seg); + seg_length = be32_to_cpup(seg + 1); + xdr_decode_hyper(seg + 2, &seg_offset); + seg_offset += info->wi_seg_off; + + write_len = min(remaining, seg_length - info->wi_seg_off); + ctxt = svc_rdma_get_rw_ctxt(rdma, + (write_len >> PAGE_SHIFT) + 2); + if (!ctxt) + goto out_noctx; + + constructor(info, write_len, ctxt); + ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, ctxt->rw_sg_table.sgl, + ctxt->rw_nents, 0, seg_offset, + seg_handle, DMA_TO_DEVICE); + if (ret < 0) + goto out_initerr; + + list_add(&ctxt->rw_list, &cc->cc_rwctxts); + cc->cc_sqecount += ret; + if (write_len == seg_length - info->wi_seg_off) { + seg += 4; + info->wi_seg_no++; + info->wi_seg_off = 0; + } else { + info->wi_seg_off += write_len; + } + remaining -= write_len; + } while (remaining); + + return 0; + +out_overflow: + dprintk("svcrdma: inadequate space in Write chunk (%u)\n", + info->wi_nsegs); + return -E2BIG; + +out_noctx: + dprintk("svcrdma: no R/W ctxs available\n"); + return -ENOMEM; + +out_initerr: + svc_rdma_put_rw_ctxt(rdma, ctxt); + pr_err("svcrdma: failed to map pagelist (%d)\n", ret); + return -EIO; +} + +/* Send one of an xdr_buf's kvecs by itself. To send a Reply + * chunk, the whole RPC Reply is written back to the client. + * This function writes either the head or tail of the xdr_buf + * containing the Reply. 
+ */ +static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info, + struct kvec *vec) +{ + info->wi_base = vec->iov_base; + return svc_rdma_build_writes(info, svc_rdma_vec_to_sg, + vec->iov_len); +} + +/* Send an xdr_buf's page list by itself. A Write chunk is + * just the page list. A Reply chunk is the head, page list, + * and tail. This function is shared between the two types + * of chunk. + */ +static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, + struct xdr_buf *xdr) +{ + info->wi_xdr = xdr; + info->wi_next_off = 0; + return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg, + xdr->page_len); +} + +/** + * svc_rdma_send_write_chunk - Write all segments in a Write chunk + * @rdma: controlling RDMA transport + * @wr_ch: Write chunk provided by client + * @xdr: xdr_buf containing the data payload + * + * Returns a non-negative number of bytes the chunk consumed, or + * %-E2BIG if the payload was larger than the Write chunk, + * %-ENOMEM if rdma_rw context pool was exhausted, + * %-ENOTCONN if posting failed (connection is lost), + * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + */ +int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, + struct xdr_buf *xdr) +{ + struct svc_rdma_write_info *info; + int ret; + + if (!xdr->page_len) + return 0; + + info = svc_rdma_write_info_alloc(rdma, wr_ch); + if (!info) + return -ENOMEM; + + ret = svc_rdma_send_xdr_pagelist(info, xdr); + if (ret < 0) + goto out_err; + + ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); + if (ret < 0) + goto out_err; + return xdr->page_len; + +out_err: + svc_rdma_write_info_free(info); + return ret; +} + +/** + * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk + * @rdma: controlling RDMA transport + * @rp_ch: Reply chunk provided by client + * @writelist: true if client provided a Write list + * @xdr: xdr_buf containing an RPC Reply + * + * Returns a non-negative number of bytes the chunk consumed, or + * %-E2BIG if the payload was larger than the Reply chunk, + * %-ENOMEM if rdma_rw context pool was exhausted, + * %-ENOTCONN if posting failed (connection is lost), + * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + */ +int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch, + bool writelist, struct xdr_buf *xdr) +{ + struct svc_rdma_write_info *info; + int consumed, ret; + + info = svc_rdma_write_info_alloc(rdma, rp_ch); + if (!info) + return -ENOMEM; + + ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]); + if (ret < 0) + goto out_err; + consumed = xdr->head[0].iov_len; + + /* Send the page list in the Reply chunk only if the + * client did not provide Write chunks.
+ */ + if (!writelist && xdr->page_len) { + ret = svc_rdma_send_xdr_pagelist(info, xdr); + if (ret < 0) + goto out_err; + consumed += xdr->page_len; + } + + if (xdr->tail[0].iov_len) { + ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]); + if (ret < 0) + goto out_err; + consumed += xdr->tail[0].iov_len; + } + + ret = svc_rdma_post_chunk_ctxt(&info->wi_cc); + if (ret < 0) + goto out_err; + return consumed; + +out_err: + svc_rdma_write_info_free(info); + return ret; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index e1097cc6d1eb..b25c50992a95 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -561,6 +561,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); INIT_LIST_HEAD(&cma_xprt->sc_ctxts); + INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); INIT_LIST_HEAD(&cma_xprt->sc_maps); init_waitqueue_head(&cma_xprt->sc_send_wait); @@ -568,6 +569,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, spin_lock_init(&cma_xprt->sc_rq_dto_lock); spin_lock_init(&cma_xprt->sc_frmr_q_lock); spin_lock_init(&cma_xprt->sc_ctxt_lock); + spin_lock_init(&cma_xprt->sc_rw_ctxt_lock); spin_lock_init(&cma_xprt->sc_map_lock); /* @@ -999,6 +1001,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt, newxprt->sc_cm_id); dev = newxprt->sc_cm_id->device; + newxprt->sc_port_num = newxprt->sc_cm_id->port_num; /* Qualify the transport resource defaults with the * capabilities of this particular device */ @@ -1248,6 +1251,7 @@ static void __svc_rdma_free(struct work_struct *work) } rdma_dealloc_frmr_q(rdma); + svc_rdma_destroy_rw_ctxts(rdma); svc_rdma_destroy_ctxts(rdma); svc_rdma_destroy_maps(rdma); -- cgit v1.2.3 From 9a6a180b7867ceceeeab88a6f011bac23174b939 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:06:25 -0400 Subject: svcrdma: Use rdma_rw API in RPC reply path The current svcrdma sendto code path posts one RDMA Write WR at a time. Each of these Writes typically carries a small number of pages (for instance, up to 30 pages for mlx4 devices). That means a 1MB NFS READ reply requires 9 ib_post_send() calls for the Write WRs, and one for the Send WR carrying the actual RPC Reply message. Instead, use the new rdma_rw API. The details of Write WR chain construction and memory registration are taken care of in the RDMA core. svcrdma can focus on the details of the RPC-over-RDMA protocol. This gives three main benefits: 1. All Write WRs for one RDMA segment are posted in a single chain. As few as one ib_post_send() for each Write chunk. 2. The Write path can now use FRWR to register the Write buffers. If the device's maximum page list depth is large, this means a single Write WR is needed for each RPC's Write chunk data. 3. The new code introduces support for RPCs that carry both a Write list and a Reply chunk. This combination can be used for an NFSv4 READ where the data payload is large, and thus is removed from the Payload Stream, but the Payload Stream is still larger than the inline threshold. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 - net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 6 +- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 696 ++++++++++++++--------------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 + 4 files changed, 350 insertions(+), 355 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index ca08671fb7e2..599ee03ee3fb 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -212,7 +212,6 @@ extern int svc_rdma_xdr_decode_req(struct xdr_buf *); extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, struct rpcrdma_msg *, enum rpcrdma_errcode, __be32 *); -extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int); extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int); extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int, __be32, __be64, u32); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 0305b33d482f..bf185b79c98f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -90,9 +90,9 @@ out_notfound: * Caller holds the connection's mutex and has already marshaled * the RPC/RDMA request. * - * This is similar to svc_rdma_reply, but takes an rpc_rqst - * instead, does not support chunks, and avoids blocking memory - * allocation. + * This is similar to svc_rdma_send_reply_msg, but takes a struct + * rpc_rqst instead, does not support chunks, and avoids blocking + * memory allocation. * * XXX: There is still an opportunity to block in svc_rdma_send() * if there are no SQ entries to post the Send. This may occur if diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 2eb3df698e11..ce62b78e5bc9 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2016 Oracle. All rights reserved. * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * @@ -40,6 +41,63 @@ * Author: Tom Tucker */ +/* Operation + * + * The main entry point is svc_rdma_sendto. This is called by the + * RPC server when an RPC Reply is ready to be transmitted to a client. + * + * The passed-in svc_rqst contains a struct xdr_buf which holds an + * XDR-encoded RPC Reply message. sendto must construct the RPC-over-RDMA + * transport header, post all Write WRs needed for this Reply, then post + * a Send WR conveying the transport header and the RPC message itself to + * the client. + * + * svc_rdma_sendto must fully transmit the Reply before returning, as + * the svc_rqst will be recycled as soon as sendto returns. Remaining + * resources referred to by the svc_rqst are also recycled at that time. + * Therefore any resources that must remain longer must be detached + * from the svc_rqst and released later. + * + * Page Management + * + * The I/O that performs Reply transmission is asynchronous, and may + * complete well after sendto returns. Thus pages under I/O must be + * removed from the svc_rqst before sendto returns. + * + * The logic here depends on Send Queue and completion ordering. Since + * the Send WR is always posted last, it will always complete last. Thus + * when it completes, it is guaranteed that all previous Write WRs have + * also completed. + * + * Write WRs are constructed and posted. 
Each Write segment gets its own + * svc_rdma_rw_ctxt, allowing the Write completion handler to find and + * DMA-unmap the pages under I/O for that Write segment. The Write + * completion handler does not release any pages. + * + * When the Send WR is constructed, it also gets its own svc_rdma_op_ctxt. + * Ownership of all of the Reply's pages is transferred into that + * ctxt, the Send WR is posted, and sendto returns. + * + * The svc_rdma_op_ctxt is presented when the Send WR completes. The + * Send completion handler finally releases the Reply's pages. + * + * This mechanism also assumes that completions on the transport's Send + * Completion Queue do not run in parallel. Otherwise a Write completion + * and Send completion running at the same time could release pages that + * are still DMA-mapped. + * + * Error Handling + * + * - If the Send WR is posted successfully, it will either complete + * successfully, or get flushed. Either way, the Send completion + * handler releases the Reply's pages. + * - If the Send WR cannot be posted, the forward path releases + * the Reply's pages. + * + * This handles the case, without the use of page reference counting, + * where two different Write segments send portions of the same page. + */ + #include #include #include @@ -55,6 +113,133 @@ static u32 xdr_padsize(u32 len) { return (len & 3) ? (4 - (len & 3)) : 0; } +/* Returns length of transport header, in bytes. + */ +static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp) +{ + unsigned int nsegs; + __be32 *p; + + p = rdma_resp; + + /* RPC-over-RDMA V1 replies never have a Read list. */ + p += rpcrdma_fixed_maxsz + 1; + + /* Skip Write list. */ + while (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; + } + + /* Skip Reply chunk. */ + if (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; + } + + return (unsigned long)p - (unsigned long)rdma_resp; +} + +/* One Write chunk is copied from Call transport header to Reply + * transport header. Each segment's length field is updated to + * reflect number of bytes consumed in the segment. + * + * Returns number of segments in this chunk. + */ +static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src, + unsigned int remaining) +{ + unsigned int i, nsegs; + u32 seg_len; + + /* Write list discriminator */ + *dst++ = *src++; + + /* number of segments in this chunk */ + nsegs = be32_to_cpup(src); + *dst++ = *src++; + + for (i = nsegs; i; i--) { + /* segment's RDMA handle */ + *dst++ = *src++; + + /* bytes returned in this segment */ + seg_len = be32_to_cpu(*src); + if (remaining >= seg_len) { + /* entire segment was consumed */ + *dst = *src; + remaining -= seg_len; + } else { + /* segment only partly filled */ + *dst = cpu_to_be32(remaining); + remaining = 0; + } + dst++; src++; + + /* segment's RDMA offset */ + *dst++ = *src++; + *dst++ = *src++; + } + + return nsegs; +} + +/* The client provided a Write list in the Call message. Fill in + * the segments in the first Write chunk in the Reply's transport + * header with the number of bytes consumed in each segment. + * Remaining chunks are returned unused. + * + * Assumptions: + * - Client has provided only one Write chunk + */ +static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch, + unsigned int consumed) +{ + unsigned int nsegs; + __be32 *p, *q; + + /* RPC-over-RDMA V1 replies never have a Read list.
*/ + p = rdma_resp + rpcrdma_fixed_maxsz + 1; + + q = wr_ch; + while (*q != xdr_zero) { + nsegs = xdr_encode_write_chunk(p, q, consumed); + q += 2 + nsegs * rpcrdma_segment_maxsz; + p += 2 + nsegs * rpcrdma_segment_maxsz; + consumed = 0; + } + + /* Terminate Write list */ + *p++ = xdr_zero; + + /* Reply chunk discriminator; may be replaced later */ + *p = xdr_zero; +} + +/* The client provided a Reply chunk in the Call message. Fill in + * the segments in the Reply chunk in the Reply message with the + * number of bytes consumed in each segment. + * + * Assumptions: + * - Reply can always fit in the provided Reply chunk + */ +static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch, + unsigned int consumed) +{ + __be32 *p; + + /* Find the Reply chunk in the Reply's xprt header. + * RPC-over-RDMA V1 replies never have a Read list. + */ + p = rdma_resp + rpcrdma_fixed_maxsz + 1; + + /* Skip past Write list */ + while (*p++ != xdr_zero) + p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; + + xdr_encode_write_chunk(p, rp_ch, consumed); +} + int svc_rdma_map_xdr(struct svcxprt_rdma *xprt, struct xdr_buf *xdr, struct svc_rdma_req_map *vec, @@ -123,45 +308,14 @@ int svc_rdma_map_xdr(struct svcxprt_rdma *xprt, return 0; } -static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - u32 xdr_off, size_t len, int dir) -{ - struct page *page; - dma_addr_t dma_addr; - if (xdr_off < xdr->head[0].iov_len) { - /* This offset is in the head */ - xdr_off += (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK; - page = virt_to_page(xdr->head[0].iov_base); - } else { - xdr_off -= xdr->head[0].iov_len; - if (xdr_off < xdr->page_len) { - /* This offset is in the page list */ - xdr_off += xdr->page_base; - page = xdr->pages[xdr_off >> PAGE_SHIFT]; - xdr_off &= ~PAGE_MASK; - } else { - /* This offset is in the tail */ - xdr_off -= xdr->page_len; - xdr_off += (unsigned long) - xdr->tail[0].iov_base & ~PAGE_MASK; - page = virt_to_page(xdr->tail[0].iov_base); - } - } - dma_addr = ib_dma_map_page(xprt->sc_cm_id->device, page, xdr_off, - min_t(size_t, PAGE_SIZE, len), dir); - return dma_addr; -} - /* Parse the RPC Call's transport header. */ -static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp, - struct rpcrdma_write_array **write, - struct rpcrdma_write_array **reply) +static void svc_rdma_get_write_arrays(__be32 *rdma_argp, + __be32 **write, __be32 **reply) { __be32 *p; - p = (__be32 *)&rmsgp->rm_body.rm_chunks[0]; + p = rdma_argp + rpcrdma_fixed_maxsz; /* Read list */ while (*p++ != xdr_zero) @@ -169,7 +323,7 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp, /* Write list */ if (*p != xdr_zero) { - *write = (struct rpcrdma_write_array *)p; + *write = p; while (*p++ != xdr_zero) p += 1 + be32_to_cpu(*p) * 4; } else { @@ -179,7 +333,7 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp, /* Reply chunk */ if (*p != xdr_zero) - *reply = (struct rpcrdma_write_array *)p; + *reply = p; else *reply = NULL; } @@ -210,6 +364,32 @@ static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp, return be32_to_cpup(p); } +/* ib_dma_map_page() is used here because svc_rdma_dma_unmap() + * is used during completion to DMA-unmap this memory, and + * it uses ib_dma_unmap_page() exclusively. 
+ */ +static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + unsigned int sge_no, + unsigned char *base, + unsigned int len) +{ + unsigned long offset = (unsigned long)base & ~PAGE_MASK; + struct ib_device *dev = rdma->sc_cm_id->device; + dma_addr_t dma_addr; + + dma_addr = ib_dma_map_page(dev, virt_to_page(base), + offset, len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(dev, dma_addr)) + return -EIO; + + ctxt->sge[sge_no].addr = dma_addr; + ctxt->sge[sge_no].length = len; + ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; + svc_rdma_count_mappings(rdma, ctxt); + return 0; +} + static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, struct svc_rdma_op_ctxt *ctxt, unsigned int sge_no, @@ -253,222 +433,73 @@ int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, return svc_rdma_dma_map_page(rdma, ctxt, 0, ctxt->pages[0], 0, len); } -/* Assumptions: - * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE +/* Load the xdr_buf into the ctxt's sge array, and DMA map each + * element as it is added. + * + * Returns the number of sge elements loaded on success, or + * a negative errno on failure. */ -static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, - u32 rmr, u64 to, - u32 xdr_off, int write_len, - struct svc_rdma_req_map *vec) +static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + struct xdr_buf *xdr, __be32 *wr_lst) { - struct ib_rdma_wr write_wr; - struct ib_sge *sge; - int xdr_sge_no; - int sge_no; - int sge_bytes; - int sge_off; - int bc; - struct svc_rdma_op_ctxt *ctxt; + unsigned int len, sge_no, remaining, page_off; + struct page **ppages; + unsigned char *base; + u32 xdr_pad; + int ret; - if (vec->count > RPCSVC_MAXPAGES) { - pr_err("svcrdma: Too many pages (%lu)\n", vec->count); - return -EIO; - } + sge_no = 1; - dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, " - "write_len=%d, vec->sge=%p, vec->count=%lu\n", - rmr, (unsigned long long)to, xdr_off, - write_len, vec->sge, vec->count); + ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++, + xdr->head[0].iov_base, + xdr->head[0].iov_len); + if (ret < 0) + return ret; - ctxt = svc_rdma_get_context(xprt); - ctxt->direction = DMA_TO_DEVICE; - sge = ctxt->sge; - - /* Find the SGE associated with xdr_off */ - for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count; - xdr_sge_no++) { - if (vec->sge[xdr_sge_no].iov_len > bc) - break; - bc -= vec->sge[xdr_sge_no].iov_len; - } + /* If a Write chunk is present, the xdr_buf's page list + * is not included inline. However the Upper Layer may + * have added XDR padding in the tail buffer, and that + * should not be included inline. 
+ */ + if (wr_lst) { + base = xdr->tail[0].iov_base; + len = xdr->tail[0].iov_len; + xdr_pad = xdr_padsize(xdr->page_len); - sge_off = bc; - bc = write_len; - sge_no = 0; - - /* Copy the remaining SGE */ - while (bc != 0) { - sge_bytes = min_t(size_t, - bc, vec->sge[xdr_sge_no].iov_len-sge_off); - sge[sge_no].length = sge_bytes; - sge[sge_no].addr = - dma_map_xdr(xprt, &rqstp->rq_res, xdr_off, - sge_bytes, DMA_TO_DEVICE); - xdr_off += sge_bytes; - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - sge[sge_no].addr)) - goto err; - svc_rdma_count_mappings(xprt, ctxt); - sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; - ctxt->count++; - sge_off = 0; - sge_no++; - xdr_sge_no++; - if (xdr_sge_no > vec->count) { - pr_err("svcrdma: Too many sges (%d)\n", xdr_sge_no); - goto err; + if (len && xdr_pad) { + base += xdr_pad; + len -= xdr_pad; } - bc -= sge_bytes; - if (sge_no == xprt->sc_max_sge) - break; - } - - /* Prepare WRITE WR */ - memset(&write_wr, 0, sizeof write_wr); - ctxt->cqe.done = svc_rdma_wc_write; - write_wr.wr.wr_cqe = &ctxt->cqe; - write_wr.wr.sg_list = &sge[0]; - write_wr.wr.num_sge = sge_no; - write_wr.wr.opcode = IB_WR_RDMA_WRITE; - write_wr.wr.send_flags = IB_SEND_SIGNALED; - write_wr.rkey = rmr; - write_wr.remote_addr = to; - - /* Post It */ - atomic_inc(&rdma_stat_write); - if (svc_rdma_send(xprt, &write_wr.wr)) - goto err; - return write_len - bc; - err: - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 0); - return -EIO; -} -noinline -static int send_write_chunks(struct svcxprt_rdma *xprt, - struct rpcrdma_write_array *wr_ary, - struct rpcrdma_msg *rdma_resp, - struct svc_rqst *rqstp, - struct svc_rdma_req_map *vec) -{ - u32 xfer_len = rqstp->rq_res.page_len; - int write_len; - u32 xdr_off; - int chunk_off; - int chunk_no; - int nchunks; - struct rpcrdma_write_array *res_ary; - int ret; - - res_ary = (struct rpcrdma_write_array *) - &rdma_resp->rm_body.rm_chunks[1]; - - /* Write chunks start at the pagelist */ - nchunks = be32_to_cpu(wr_ary->wc_nchunks); - for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; - xfer_len && chunk_no < nchunks; - chunk_no++) { - struct rpcrdma_segment *arg_ch; - u64 rs_offset; - - arg_ch = &wr_ary->wc_array[chunk_no].wc_target; - write_len = min(xfer_len, be32_to_cpu(arg_ch->rs_length)); - - /* Prepare the response chunk given the length actually - * written */ - xdr_decode_hyper((__be32 *)&arg_ch->rs_offset, &rs_offset); - svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, - arg_ch->rs_handle, - arg_ch->rs_offset, - write_len); - chunk_off = 0; - while (write_len) { - ret = send_write(xprt, rqstp, - be32_to_cpu(arg_ch->rs_handle), - rs_offset + chunk_off, - xdr_off, - write_len, - vec); - if (ret <= 0) - goto out_err; - chunk_off += ret; - xdr_off += ret; - xfer_len -= ret; - write_len -= ret; - } + goto tail; } - /* Update the req with the number of chunks actually used */ - svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no); - return rqstp->rq_res.page_len; + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + page_off = xdr->page_base & ~PAGE_MASK; + remaining = xdr->page_len; + while (remaining) { + len = min_t(u32, PAGE_SIZE - page_off, remaining); -out_err: - pr_err("svcrdma: failed to send write chunks, rc=%d\n", ret); - return -EIO; -} - -noinline -static int send_reply_chunks(struct svcxprt_rdma *xprt, - struct rpcrdma_write_array *rp_ary, - struct rpcrdma_msg *rdma_resp, - struct svc_rqst *rqstp, - struct svc_rdma_req_map *vec) -{ - u32 xfer_len = rqstp->rq_res.len; - int write_len; - u32 xdr_off; - int chunk_no; - 
int chunk_off; - int nchunks; - struct rpcrdma_segment *ch; - struct rpcrdma_write_array *res_ary; - int ret; + ret = svc_rdma_dma_map_page(rdma, ctxt, sge_no++, + *ppages++, page_off, len); + if (ret < 0) + return ret; - /* XXX: need to fix when reply lists occur with read-list and or - * write-list */ - res_ary = (struct rpcrdma_write_array *) - &rdma_resp->rm_body.rm_chunks[2]; - - /* xdr offset starts at RPC message */ - nchunks = be32_to_cpu(rp_ary->wc_nchunks); - for (xdr_off = 0, chunk_no = 0; - xfer_len && chunk_no < nchunks; - chunk_no++) { - u64 rs_offset; - ch = &rp_ary->wc_array[chunk_no].wc_target; - write_len = min(xfer_len, be32_to_cpu(ch->rs_length)); - - /* Prepare the reply chunk given the length actually - * written */ - xdr_decode_hyper((__be32 *)&ch->rs_offset, &rs_offset); - svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, - ch->rs_handle, ch->rs_offset, - write_len); - chunk_off = 0; - while (write_len) { - ret = send_write(xprt, rqstp, - be32_to_cpu(ch->rs_handle), - rs_offset + chunk_off, - xdr_off, - write_len, - vec); - if (ret <= 0) - goto out_err; - chunk_off += ret; - xdr_off += ret; - xfer_len -= ret; - write_len -= ret; - } + remaining -= len; + page_off = 0; } - /* Update the req with the number of chunks actually used */ - svc_rdma_xdr_encode_reply_array(res_ary, chunk_no); - return rqstp->rq_res.len; + base = xdr->tail[0].iov_base; + len = xdr->tail[0].iov_len; +tail: + if (len) { + ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++, base, len); + if (ret < 0) + return ret; + } -out_err: - pr_err("svcrdma: failed to send reply chunks, rc=%d\n", ret); - return -EIO; + return sge_no - 1; } /* The svc_rqst and all resources it owns are released as soon as @@ -525,90 +556,66 @@ int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, return svc_rdma_send(rdma, send_wr); } -/* This function prepares the portion of the RPCRDMA message to be - * sent in the RDMA_SEND. This function is called after data sent via - * RDMA has already been transmitted. There are three cases: - * - The RPCRDMA header, RPC header, and payload are all sent in a - * single RDMA_SEND. This is the "inline" case. - * - The RPCRDMA header and some portion of the RPC header and data - * are sent via this RDMA_SEND and another portion of the data is - * sent via RDMA. - * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC - * header and data are all transmitted via RDMA. - * In all three cases, this function prepares the RPCRDMA header in - * sge[0], the 'type' parameter indicates the type to place in the - * RPCRDMA header, and the 'byte_count' field indicates how much of - * the XDR to include in this RDMA_SEND. NB: The offset of the payload - * to send is zero in the XDR. +/* Prepare the portion of the RPC Reply that will be transmitted + * via RDMA Send. The RPC-over-RDMA transport header is prepared + * in sge[0], and the RPC xdr_buf is prepared in following sges. + * + * Depending on whether a Write list or Reply chunk is present, + * the server may send all, a portion of, or none of the xdr_buf. + * In the latter case, only the transport header (sge[0]) is + * transmitted. + * + * RDMA Send is the last step of transmitting an RPC reply. Pages + * involved in the earlier RDMA Writes are here transferred out + * of the rqstp and into the ctxt's page array. These pages are + * DMA unmapped by each Write completion, but the subsequent Send + * completion finally releases these pages. + * + * Assumptions: + * - The Reply's transport header will never be larger than a page. 
*/ -static int send_reply(struct svcxprt_rdma *rdma, - struct svc_rqst *rqstp, - struct page *page, - struct rpcrdma_msg *rdma_resp, - struct svc_rdma_req_map *vec, - int byte_count, - u32 inv_rkey) +static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, + __be32 *rdma_argp, __be32 *rdma_resp, + struct svc_rqst *rqstp, + __be32 *wr_lst, __be32 *rp_ch) { struct svc_rdma_op_ctxt *ctxt; - u32 xdr_off; - int sge_no; - int sge_bytes; - int ret = -EIO; + u32 inv_rkey; + int ret; + + dprintk("svcrdma: sending %s reply: head=%zu, pagelen=%u, tail=%zu\n", + (rp_ch ? "RDMA_NOMSG" : "RDMA_MSG"), + rqstp->rq_res.head[0].iov_len, + rqstp->rq_res.page_len, + rqstp->rq_res.tail[0].iov_len); - /* Prepare the context */ ctxt = svc_rdma_get_context(rdma); - ctxt->direction = DMA_TO_DEVICE; - ctxt->pages[0] = page; - ctxt->count = 1; - /* Prepare the SGE for the RPCRDMA Header */ - ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; - ctxt->sge[0].length = - svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp); - ctxt->sge[0].addr = - ib_dma_map_page(rdma->sc_cm_id->device, page, 0, - ctxt->sge[0].length, DMA_TO_DEVICE); - if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) + ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, + svc_rdma_reply_hdr_len(rdma_resp)); + if (ret < 0) goto err; - svc_rdma_count_mappings(rdma, ctxt); - - ctxt->direction = DMA_TO_DEVICE; - /* Map the payload indicated by 'byte_count' */ - xdr_off = 0; - for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { - sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); - byte_count -= sge_bytes; - ctxt->sge[sge_no].addr = - dma_map_xdr(rdma, &rqstp->rq_res, xdr_off, - sge_bytes, DMA_TO_DEVICE); - xdr_off += sge_bytes; - if (ib_dma_mapping_error(rdma->sc_cm_id->device, - ctxt->sge[sge_no].addr)) + if (!rp_ch) { + ret = svc_rdma_map_reply_msg(rdma, ctxt, + &rqstp->rq_res, wr_lst); + if (ret < 0) goto err; - svc_rdma_count_mappings(rdma, ctxt); - ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; - ctxt->sge[sge_no].length = sge_bytes; - } - if (byte_count != 0) { - pr_err("svcrdma: Could not map %d bytes\n", byte_count); - goto err; } svc_rdma_save_io_pages(rqstp, ctxt); - if (sge_no > rdma->sc_max_sge) { - pr_err("svcrdma: Too many sges (%d)\n", sge_no); - goto err; - } - - ret = svc_rdma_post_send_wr(rdma, ctxt, sge_no, inv_rkey); + inv_rkey = 0; + if (rdma->sc_snd_w_inv) + inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch); + ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, inv_rkey); if (ret) goto err; return 0; - err: +err: + pr_err("svcrdma: failed to post Send WR (%d)\n", ret); svc_rdma_unmap_dma(ctxt); svc_rdma_put_context(ctxt, 1); return ret; @@ -618,41 +625,36 @@ void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) { } +/** + * svc_rdma_sendto - Transmit an RPC reply + * @rqstp: processed RPC request, reply XDR already in ::rq_res + * + * Any resources still associated with @rqstp are released upon return. + * If no reply message was possible, the connection is closed. + * + * Returns: + * %0 if an RPC reply has been successfully posted, + * %-ENOMEM if a resource shortage occurred (connection is lost), + * %-ENOTCONN if posting failed (connection is lost). 
+ */ int svc_rdma_sendto(struct svc_rqst *rqstp) { struct svc_xprt *xprt = rqstp->rq_xprt; struct svcxprt_rdma *rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); - struct rpcrdma_msg *rdma_argp; - struct rpcrdma_msg *rdma_resp; - struct rpcrdma_write_array *wr_ary, *rp_ary; - int ret; - int inline_bytes; + __be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch; + struct xdr_buf *xdr = &rqstp->rq_res; struct page *res_page; - struct svc_rdma_req_map *vec; - u32 inv_rkey; - __be32 *p; - - dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); + int ret; - /* Get the RDMA request header. The receive logic always - * places this at the start of page 0. + /* Find the call's chunk lists to decide how to send the reply. + * Receive places the Call's xprt header at the start of page 0. */ rdma_argp = page_address(rqstp->rq_pages[0]); - svc_rdma_get_write_arrays(rdma_argp, &wr_ary, &rp_ary); + svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch); - inv_rkey = 0; - if (rdma->sc_snd_w_inv) - inv_rkey = svc_rdma_get_inv_rkey(&rdma_argp->rm_xid, - (__be32 *)wr_ary, - (__be32 *)rp_ary); - - /* Build an req vec for the XDR */ - vec = svc_rdma_get_req_map(rdma); - ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL); - if (ret) - goto err0; - inline_bytes = rqstp->rq_res.len; + dprintk("svcrdma: preparing response for XID 0x%08x\n", + be32_to_cpup(rdma_argp)); /* Create the RDMA response header. xprt->xpt_mutex, * acquired in svc_send(), serializes RPC replies. The @@ -666,54 +668,46 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) goto err0; rdma_resp = page_address(res_page); - p = &rdma_resp->rm_xid; - *p++ = rdma_argp->rm_xid; - *p++ = rdma_argp->rm_vers; + p = rdma_resp; + *p++ = *rdma_argp; + *p++ = *(rdma_argp + 1); *p++ = rdma->sc_fc_credits; - *p++ = rp_ary ? rdma_nomsg : rdma_msg; + *p++ = rp_ch ? rdma_nomsg : rdma_msg; /* Start with empty chunks */ *p++ = xdr_zero; *p++ = xdr_zero; *p = xdr_zero; - /* Send any write-chunk data and build resp write-list */ - if (wr_ary) { - ret = send_write_chunks(rdma, wr_ary, rdma_resp, rqstp, vec); + if (wr_lst) { + /* XXX: Presume the client sent only one Write chunk */ + ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr); if (ret < 0) goto err1; - inline_bytes -= ret + xdr_padsize(ret); + svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret); } - - /* Send any reply-list data and update resp reply-list */ - if (rp_ary) { - ret = send_reply_chunks(rdma, rp_ary, rdma_resp, rqstp, vec); + if (rp_ch) { + ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr); if (ret < 0) goto err1; - inline_bytes -= ret; + svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret); } - /* Post a fresh Receive buffer _before_ sending the reply */ ret = svc_rdma_post_recv(rdma, GFP_KERNEL); if (ret) goto err1; - - ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec, - inline_bytes, inv_rkey); + ret = svc_rdma_send_reply_msg(rdma, rdma_argp, rdma_resp, rqstp, + wr_lst, rp_ch); if (ret < 0) goto err0; - - svc_rdma_put_req_map(rdma, vec); - dprintk("svcrdma: send_reply returns %d\n", ret); - return ret; + return 0; err1: put_page(res_page); err0: - svc_rdma_put_req_map(rdma, vec); pr_err("svcrdma: Could not send reply, err=%d. 
Closing transport.\n", ret); - set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + set_bit(XPT_CLOSE, &xprt->xpt_flags); return -ENOTCONN; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index b25c50992a95..237c377c1e06 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1053,6 +1053,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) memset(&qp_attr, 0, sizeof qp_attr); qp_attr.event_handler = qp_event_handler; qp_attr.qp_context = &newxprt->sc_xprt; + qp_attr.port_num = newxprt->sc_cm_id->port_num; + qp_attr.cap.max_rdma_ctxs = newxprt->sc_max_requests; qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth; qp_attr.cap.max_send_sge = newxprt->sc_max_sge; -- cgit v1.2.3 From 6b19cc5ca2f78ebc88f5d39ba6a94197bb392fcc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:06:33 -0400 Subject: svcrdma: Clean up RDMA_ERROR path Now that svc_rdma_sendto has been renovated, svc_rdma_send_error can be refactored to reduce code duplication and remove C structure-based XDR encoding. It is also relocated to the source file that contains its only caller. This is a refactoring change only. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/rpc_rdma.h | 3 ++ include/linux/sunrpc/svc_rdma.h | 5 ---- net/sunrpc/xprtrdma/svc_rdma_marshal.c | 19 ------------ net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 52 ++++++++++++++++++++++++++++++++- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 43 --------------------------- 5 files changed, 54 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index 245fc59b7324..b7e85b341a54 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ b/include/linux/sunrpc/rpc_rdma.h @@ -143,6 +143,9 @@ enum rpcrdma_proc { #define rdma_done cpu_to_be32(RDMA_DONE) #define rdma_error cpu_to_be32(RDMA_ERROR) +#define err_vers cpu_to_be32(ERR_VERS) +#define err_chunk cpu_to_be32(ERR_CHUNK) + /* * Private extension to RPC-over-RDMA Version One. * Message passed during RDMA-CM connection set-up.
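For readers skimming the series: the two constants just added are byte-swapped at compile time, which is what lets the relocated error path below build the whole transport header with plain pointer stores. As a minimal sketch (not part of the patch; the helper name and calling convention are invented for illustration, assuming the rpc_rdma.h definitions above), an ERR_VERS reply body can be assembled like this:

/* Sketch only: encode an RDMA_ERROR/ERR_VERS header with pointer
 * arithmetic. Every constant is already in network byte order, so
 * each store is a plain assignment with no per-field swapping.
 */
static unsigned int encode_err_vers_sketch(__be32 *p, __be32 xid,
					   __be32 vers, __be32 credits)
{
	__be32 *start = p;

	*p++ = xid;		/* XID echoed from the failing Call */
	*p++ = vers;		/* version echoed from the Call */
	*p++ = credits;		/* flow-control credit grant */
	*p++ = rdma_error;	/* message type: RDMA_ERROR */
	*p++ = err_vers;	/* error code: version not supported */
	*p++ = rpcrdma_version;	/* lowest version this server accepts */
	*p++ = rpcrdma_version;	/* highest version this server accepts */

	return (unsigned long)p - (unsigned long)start;	/* bytes used */
}

The svc_rdma_send_error() added by this commit follows exactly this store-and-advance pattern.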
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 599ee03ee3fb..a770d200f607 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -209,9 +209,6 @@ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, /* svc_rdma_marshal.c */ extern int svc_rdma_xdr_decode_req(struct xdr_buf *); -extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, - struct rpcrdma_msg *, - enum rpcrdma_errcode, __be32 *); extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int); extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int, __be32, __be64, u32); @@ -244,8 +241,6 @@ extern int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, struct svc_rdma_op_ctxt *ctxt, int num_sge, u32 inv_rkey); extern int svc_rdma_sendto(struct svc_rqst *); -extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, - int); /* svc_rdma_transport.c */ extern void svc_rdma_wc_send(struct ib_cq *, struct ib_wc *); diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c index 1c4aabf0f657..ae58a897eca0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -167,25 +167,6 @@ out_inval: return -EINVAL; } -int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, - struct rpcrdma_msg *rmsgp, - enum rpcrdma_errcode err, __be32 *va) -{ - __be32 *startp = va; - - *va++ = rmsgp->rm_xid; - *va++ = rmsgp->rm_vers; - *va++ = xprt->sc_fc_credits; - *va++ = rdma_error; - *va++ = cpu_to_be32(err); - if (err == ERR_VERS) { - *va++ = rpcrdma_version; - *va++ = rpcrdma_version; - } - - return (int)((unsigned long)va - (unsigned long)startp); -} - /** * svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header * @rdma_resp: buffer containing Reply transport header diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index f7b2daf72a86..7435cb666f42 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -558,6 +558,56 @@ static void rdma_read_complete(struct svc_rqst *rqstp, rqstp->rq_arg.buflen = head->arg.buflen; } +static void svc_rdma_send_error(struct svcxprt_rdma *xprt, + __be32 *rdma_argp, int status) +{ + struct svc_rdma_op_ctxt *ctxt; + __be32 *p, *err_msgp; + unsigned int length; + struct page *page; + int ret; + + ret = svc_rdma_repost_recv(xprt, GFP_KERNEL); + if (ret) + return; + + page = alloc_page(GFP_KERNEL); + if (!page) + return; + err_msgp = page_address(page); + + p = err_msgp; + *p++ = *rdma_argp; + *p++ = *(rdma_argp + 1); + *p++ = xprt->sc_fc_credits; + *p++ = rdma_error; + if (status == -EPROTONOSUPPORT) { + *p++ = err_vers; + *p++ = rpcrdma_version; + *p++ = rpcrdma_version; + } else { + *p++ = err_chunk; + } + length = (unsigned long)p - (unsigned long)err_msgp; + + /* Map transport header; no RPC message payload */ + ctxt = svc_rdma_get_context(xprt); + ret = svc_rdma_map_reply_hdr(xprt, ctxt, err_msgp, length); + if (ret) { + dprintk("svcrdma: Error %d mapping send for protocol error\n", + ret); + return; + } + + ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0); + if (ret) { + dprintk("svcrdma: Error %d posting send for protocol error\n", + ret); + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + } +} + /* By convention, backchannel calls arrive via rdma_msg type * messages, and never populate the chunk lists. 
This makes * the RPC/RDMA header small and fixed in size, so it is @@ -686,7 +736,7 @@ complete: return ret; out_err: - svc_rdma_send_error(rdma_xprt, rmsgp, ret); + svc_rdma_send_error(rdma_xprt, &rmsgp->rm_xid, ret); svc_rdma_put_context(ctxt, 0); return 0; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index ce62b78e5bc9..0b646e8f23c7 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -710,46 +710,3 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) set_bit(XPT_CLOSE, &xprt->xpt_flags); return -ENOTCONN; } - -void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, - int status) -{ - struct page *p; - struct svc_rdma_op_ctxt *ctxt; - enum rpcrdma_errcode err; - __be32 *va; - int length; - int ret; - - ret = svc_rdma_repost_recv(xprt, GFP_KERNEL); - if (ret) - return; - - p = alloc_page(GFP_KERNEL); - if (!p) - return; - va = page_address(p); - - /* XDR encode an error reply */ - err = ERR_CHUNK; - if (status == -EPROTONOSUPPORT) - err = ERR_VERS; - length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); - - /* Map transport header; no RPC message payload */ - ctxt = svc_rdma_get_context(xprt); - ret = svc_rdma_map_reply_hdr(xprt, ctxt, &rmsgp->rm_xid, length); - if (ret) { - dprintk("svcrdma: Error %d mapping send for protocol error\n", - ret); - return; - } - - ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0); - if (ret) { - dprintk("svcrdma: Error %d posting send for protocol error\n", - ret); - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); - } -} -- cgit v1.2.3 From f5821c76b2c9c2fb98b276c0bf6a101bfe9050a3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:06:49 -0400 Subject: svcrdma: Clean up RPC-over-RDMA backchannel reply processing Replace C structure-based XDR decoding with pointer arithmetic. Pointer arithmetic is considered more portable. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 2 +- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 18 ++++++++++++++---- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 27 +++++++++++++++------------ 3 files changed, 30 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index a770d200f607..44d642bbfce6 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -204,7 +204,7 @@ static inline void svc_rdma_count_mappings(struct svcxprt_rdma *rdma, /* svc_rdma_backchannel.c */ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, - struct rpcrdma_msg *rmsgp, + __be32 *rdma_resp, struct xdr_buf *rcvbuf); /* svc_rdma_marshal.c */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index bf185b79c98f..c676ed0efb5a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -12,7 +12,17 @@ #undef SVCRDMA_BACKCHANNEL_DEBUG -int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, +/** + * svc_rdma_handle_bc_reply - Process incoming backchannel reply + * @xprt: controlling backchannel transport + * @rdma_resp: pointer to incoming transport header + * @rcvbuf: XDR buffer into which to decode the reply + * + * Returns: + * %0 if @rcvbuf is filled in, xprt_complete_rqst called, + * %-EAGAIN if server should call ->recvfrom again. 
+ */ +int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, struct xdr_buf *rcvbuf) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); @@ -27,13 +37,13 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, p = (__be32 *)src->iov_base; len = src->iov_len; - xid = rmsgp->rm_xid; + xid = *rdma_resp; #ifdef SVCRDMA_BACKCHANNEL_DEBUG pr_info("%s: xid=%08x, length=%zu\n", __func__, be32_to_cpu(xid), len); pr_info("%s: RPC/RDMA: %*ph\n", - __func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp); + __func__, (int)RPCRDMA_HDRLEN_MIN, rdma_resp); pr_info("%s: RPC: %*ph\n", __func__, (int)len, p); #endif @@ -53,7 +63,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, goto out_unlock; memcpy(dst->iov_base, p, len); - credits = be32_to_cpu(rmsgp->rm_credit); + credits = be32_to_cpup(rdma_resp + 2); if (credits == 0) credits = 1; /* don't deadlock */ else if (credits > r_xprt->rx_buf.rb_bc_max_requests) diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 7435cb666f42..27a99bf5b1a6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -613,28 +613,30 @@ static void svc_rdma_send_error(struct svcxprt_rdma *xprt, * the RPC/RDMA header small and fixed in size, so it is * straightforward to check the RPC header's direction field. */ -static bool -svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp) +static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, + __be32 *rdma_resp) { - __be32 *p = (__be32 *)rmsgp; + __be32 *p; if (!xprt->xpt_bc_xprt) return false; - if (rmsgp->rm_type != rdma_msg) + p = rdma_resp + 3; + if (*p++ != rdma_msg) return false; - if (rmsgp->rm_body.rm_chunks[0] != xdr_zero) + + if (*p++ != xdr_zero) return false; - if (rmsgp->rm_body.rm_chunks[1] != xdr_zero) + if (*p++ != xdr_zero) return false; - if (rmsgp->rm_body.rm_chunks[2] != xdr_zero) + if (*p++ != xdr_zero) return false; - /* sanity */ - if (p[7] != rmsgp->rm_xid) + /* XID sanity */ + if (*p++ != *rdma_resp) return false; /* call direction */ - if (p[8] == cpu_to_be32(RPC_CALL)) + if (*p == cpu_to_be32(RPC_CALL)) return false; return true; @@ -700,8 +702,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto out_drop; rqstp->rq_xprt_hlen = ret; - if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) { - ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp, + if (svc_rdma_is_backchannel_reply(xprt, &rmsgp->rm_xid)) { + ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, + &rmsgp->rm_xid, &rqstp->rq_arg); svc_rdma_put_context(ctxt, 0); if (ret) -- cgit v1.2.3 From ded8d19641a605232ab48f5d27f542648beba3cc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:06:57 -0400 Subject: svcrdma: Reduce size of sge array in struct svc_rdma_op_ctxt The sge array in struct svc_rdma_op_ctxt is no longer used for sending RDMA Write WRs. It need only accommodate the construction of Send and Receive WRs. The maximum inline size is the largest payload it needs to handle now. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 9 +++++++-- net/sunrpc/xprtrdma/svc_rdma.c | 6 +++--- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 44d642bbfce6..e84b77556784 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -48,6 +48,12 @@ #include #define SVCRDMA_DEBUG +/* Default and maximum inline threshold sizes */ +enum { + RPCRDMA_DEF_INLINE_THRESH = 4096, + RPCRDMA_MAX_INLINE_THRESH = 65536 +}; + /* RPC/RDMA parameters and stats */ extern unsigned int svcrdma_ord; extern unsigned int svcrdma_max_requests; @@ -86,7 +92,7 @@ struct svc_rdma_op_ctxt { int count; unsigned int mapped_sges; struct ib_send_wr send_wr; - struct ib_sge sge[RPCSVC_MAXPAGES]; + struct ib_sge sge[1 + RPCRDMA_MAX_INLINE_THRESH / PAGE_SIZE]; struct page *pages[RPCSVC_MAXPAGES]; }; @@ -186,7 +192,6 @@ struct svcxprt_rdma { * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ. */ #define RPCRDMA_ORD (64/4) #define RPCRDMA_MAX_REQUESTS 32 -#define RPCRDMA_MAX_REQ_SIZE 4096 /* Typical ULP usage of BC requests is NFSv4.1 backchannel. Our * current NFSv4.1 implementation supports one backchannel slot. diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 912444174647..a4a8f6989ee7 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -58,9 +58,9 @@ unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS; unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS; static unsigned int min_max_requests = 4; static unsigned int max_max_requests = 16384; -unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE; -static unsigned int min_max_inline = 4096; -static unsigned int max_max_inline = 65536; +unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH; +static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH; +static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH; atomic_t rdma_stat_recv; atomic_t rdma_stat_read; -- cgit v1.2.3 From 68cc4636bbbca89b9fedcf46d8b6bee444fc5e4e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:07:05 -0400 Subject: svcrdma: Remove unused RDMA Write completion handler Clean up. All RDMA Write completions are now handled by svc_rdma_wc_write_ctx. Signed-off-by: Chuck Lever Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 - net/sunrpc/xprtrdma/svc_rdma_transport.c | 18 ------------------ 2 files changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index e84b77556784..f58c5349beb7 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -249,7 +249,6 @@ extern int svc_rdma_sendto(struct svc_rqst *); /* svc_rdma_transport.c */ extern void svc_rdma_wc_send(struct ib_cq *, struct ib_wc *); -extern void svc_rdma_wc_write(struct ib_cq *, struct ib_wc *); extern void svc_rdma_wc_reg(struct ib_cq *, struct ib_wc *); extern void svc_rdma_wc_read(struct ib_cq *, struct ib_wc *); extern void svc_rdma_wc_inv(struct ib_cq *, struct ib_wc *); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 237c377c1e06..1597ca8ba3c0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -473,24 +473,6 @@ void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) svc_rdma_put_context(ctxt, 1); } -/** - * svc_rdma_wc_write - Invoked by RDMA provider for each polled Write WC - * @cq: completion queue - * @wc: completed WR - * - */ -void svc_rdma_wc_write(struct ib_cq *cq, struct ib_wc *wc) -{ - struct ib_cqe *cqe = wc->wr_cqe; - struct svc_rdma_op_ctxt *ctxt; - - svc_rdma_send_wc_common_put(cq, wc, "write"); - - ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 0); -} - /** * svc_rdma_wc_reg - Invoked by RDMA provider for each polled FASTREG WC * @cq: completion queue -- cgit v1.2.3 From 2cf32924c68a22783e6f630e1b5345a80aa1a376 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:07:13 -0400 Subject: svcrdma: Remove the req_map cache req_maps are no longer used by the send path and can thus be removed. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 34 +------------ net/sunrpc/xprtrdma/svc_rdma_sendto.c | 68 -------------------------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 84 -------------------------------- 3 files changed, 1 insertion(+), 185 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index f58c5349beb7..479bb7f65233 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -96,23 +96,6 @@ struct svc_rdma_op_ctxt { struct page *pages[RPCSVC_MAXPAGES]; }; -/* - * NFS_ requests are mapped on the client side by the chunk lists in - * the RPCRDMA header. During the fetching of the RPC from the client - * and the writing of the reply to the client, the memory in the - * client and the memory in the server must be mapped as contiguous - * vaddr/len for access by the hardware. These data strucures keep - * these mappings. - * - * For an RDMA_WRITE, the 'sge' maps the RPC REPLY. For RDMA_READ, the - * 'sge' in the svc_rdma_req_map maps the server side RPC reply and the - * 'ch' field maps the read-list of the RPCRDMA header to the 'sge' - * mapping of the reply. 
- */ -struct svc_rdma_chunk_sge { - int start; /* sge no for this chunk */ - int count; /* sge count for this chunk */ -}; struct svc_rdma_fastreg_mr { struct ib_mr *mr; struct scatterlist *sg; @@ -121,15 +104,7 @@ struct svc_rdma_fastreg_mr { enum dma_data_direction direction; struct list_head frmr_list; }; -struct svc_rdma_req_map { - struct list_head free; - unsigned long count; - union { - struct kvec sge[RPCSVC_MAXPAGES]; - struct svc_rdma_chunk_sge ch[RPCSVC_MAXPAGES]; - unsigned long lkey[RPCSVC_MAXPAGES]; - }; -}; + #define RDMACTXT_F_LAST_CTXT 2 #define SVCRDMA_DEVCAP_FAST_REG 1 /* fast mr registration */ @@ -160,8 +135,6 @@ struct svcxprt_rdma { int sc_ctxt_used; spinlock_t sc_rw_ctxt_lock; struct list_head sc_rw_ctxts; - spinlock_t sc_map_lock; - struct list_head sc_maps; struct list_head sc_rq_dto_q; spinlock_t sc_rq_dto_lock; @@ -237,8 +210,6 @@ extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, struct xdr_buf *xdr); /* svc_rdma_sendto.c */ -extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, - struct svc_rdma_req_map *, bool); extern int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, struct svc_rdma_op_ctxt *ctxt, __be32 *rdma_resp, unsigned int len); @@ -259,9 +230,6 @@ extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt); -extern struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *); -extern void svc_rdma_put_req_map(struct svcxprt_rdma *, - struct svc_rdma_req_map *); extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *); extern void svc_rdma_put_frmr(struct svcxprt_rdma *, struct svc_rdma_fastreg_mr *); diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index e514f6864a93..1736337f3a55 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -240,74 +240,6 @@ static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch, xdr_encode_write_chunk(p, rp_ch, consumed); } -int svc_rdma_map_xdr(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - struct svc_rdma_req_map *vec, - bool write_chunk_present) -{ - int sge_no; - u32 sge_bytes; - u32 page_bytes; - u32 page_off; - int page_no; - - if (xdr->len != - (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) { - pr_err("svcrdma: %s: XDR buffer length error\n", __func__); - return -EIO; - } - - /* Skip the first sge, this is for the RPCRDMA header */ - sge_no = 1; - - /* Head SGE */ - vec->sge[sge_no].iov_base = xdr->head[0].iov_base; - vec->sge[sge_no].iov_len = xdr->head[0].iov_len; - sge_no++; - - /* pages SGE */ - page_no = 0; - page_bytes = xdr->page_len; - page_off = xdr->page_base; - while (page_bytes) { - vec->sge[sge_no].iov_base = - page_address(xdr->pages[page_no]) + page_off; - sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off)); - page_bytes -= sge_bytes; - vec->sge[sge_no].iov_len = sge_bytes; - - sge_no++; - page_no++; - page_off = 0; /* reset for next time through loop */ - } - - /* Tail SGE */ - if (xdr->tail[0].iov_len) { - unsigned char *base = xdr->tail[0].iov_base; - size_t len = xdr->tail[0].iov_len; - u32 xdr_pad = xdr_padsize(xdr->page_len); - - if (write_chunk_present && xdr_pad) { - base += xdr_pad; - len -= xdr_pad; - } - - if (len) { - vec->sge[sge_no].iov_base = base; - vec->sge[sge_no].iov_len = 
len; - sge_no++; - } - } - - dprintk("svcrdma: %s: sge_no %d page_no %d " - "page_base %u page_len %u head_len %zu tail_len %zu\n", - __func__, sge_no, page_no, xdr->page_base, xdr->page_len, - xdr->head[0].iov_len, xdr->tail[0].iov_len); - - vec->count = sge_no; - return 0; -} - /* Parse the RPC Call's transport header. */ static void svc_rdma_get_write_arrays(__be32 *rdma_argp, diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 1597ca8ba3c0..a9d9cb1ba4c6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -272,85 +272,6 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) } } -static struct svc_rdma_req_map *alloc_req_map(gfp_t flags) -{ - struct svc_rdma_req_map *map; - - map = kmalloc(sizeof(*map), flags); - if (map) - INIT_LIST_HEAD(&map->free); - return map; -} - -static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt) -{ - unsigned int i; - - /* One for each receive buffer on this connection. */ - i = xprt->sc_max_requests; - - while (i--) { - struct svc_rdma_req_map *map; - - map = alloc_req_map(GFP_KERNEL); - if (!map) { - dprintk("svcrdma: No memory for request map\n"); - return false; - } - list_add(&map->free, &xprt->sc_maps); - } - return true; -} - -struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt) -{ - struct svc_rdma_req_map *map = NULL; - - spin_lock(&xprt->sc_map_lock); - if (list_empty(&xprt->sc_maps)) - goto out_empty; - - map = list_first_entry(&xprt->sc_maps, - struct svc_rdma_req_map, free); - list_del_init(&map->free); - spin_unlock(&xprt->sc_map_lock); - -out: - map->count = 0; - return map; - -out_empty: - spin_unlock(&xprt->sc_map_lock); - - /* Pre-allocation amount was incorrect */ - map = alloc_req_map(GFP_NOIO); - if (map) - goto out; - - WARN_ONCE(1, "svcrdma: empty request map list?\n"); - return NULL; -} - -void svc_rdma_put_req_map(struct svcxprt_rdma *xprt, - struct svc_rdma_req_map *map) -{ - spin_lock(&xprt->sc_map_lock); - list_add(&map->free, &xprt->sc_maps); - spin_unlock(&xprt->sc_map_lock); -} - -static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt) -{ - while (!list_empty(&xprt->sc_maps)) { - struct svc_rdma_req_map *map; - - map = list_first_entry(&xprt->sc_maps, - struct svc_rdma_req_map, free); - list_del(&map->free); - kfree(map); - } -} - /* QP event handler */ static void qp_event_handler(struct ib_event *event, void *context) { @@ -544,7 +465,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); INIT_LIST_HEAD(&cma_xprt->sc_ctxts); INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); - INIT_LIST_HEAD(&cma_xprt->sc_maps); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); @@ -552,7 +472,6 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, spin_lock_init(&cma_xprt->sc_frmr_q_lock); spin_lock_init(&cma_xprt->sc_ctxt_lock); spin_lock_init(&cma_xprt->sc_rw_ctxt_lock); - spin_lock_init(&cma_xprt->sc_map_lock); /* * Note that this implies that the underlying transport support @@ -1004,8 +923,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) if (!svc_rdma_prealloc_ctxts(newxprt)) goto errout; - if (!svc_rdma_prealloc_maps(newxprt)) - goto errout; /* * Limit ORD based on client limit, local device limit, and @@ -1237,7 +1154,6 @@ static void __svc_rdma_free(struct work_struct *work) rdma_dealloc_frmr_q(rdma); svc_rdma_destroy_rw_ctxts(rdma); svc_rdma_destroy_ctxts(rdma); - 
svc_rdma_destroy_maps(rdma); /* Destroy the QP if present (not a listener) */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) -- cgit v1.2.3 From dadf3e435debb85dfcf28c157012047153a21a97 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Apr 2017 13:07:21 -0400 Subject: svcrdma: Clean out old XDR encoders Clean up: These have been replaced and are no longer used. Signed-off-by: Chuck Lever Reviewed-by: Sagi Grimberg Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 4 -- net/sunrpc/xprtrdma/svc_rdma_marshal.c | 70 ---------------------------------- 2 files changed, 74 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 479bb7f65233..f3787d800ba4 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -187,10 +187,6 @@ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, /* svc_rdma_marshal.c */ extern int svc_rdma_xdr_decode_req(struct xdr_buf *); -extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int); -extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int, - __be32, __be64, u32); -extern unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp); /* svc_rdma_recvfrom.c */ extern int svc_rdma_recvfrom(struct svc_rqst *); diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c index ae58a897eca0..bdcf7d85a3dc 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -166,73 +166,3 @@ out_inval: dprintk("svcrdma: failed to parse transport header\n"); return -EINVAL; } - -/** - * svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header - * @rdma_resp: buffer containing Reply transport header - * - * Returns length of transport header, in bytes. - */ -unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp) -{ - unsigned int nsegs; - __be32 *p; - - p = rdma_resp; - - /* RPC-over-RDMA V1 replies never have a Read list. */ - p += rpcrdma_fixed_maxsz + 1; - - /* Skip Write list. */ - while (*p++ != xdr_zero) { - nsegs = be32_to_cpup(p++); - p += nsegs * rpcrdma_segment_maxsz; - } - - /* Skip Reply chunk. 
*/ - if (*p++ != xdr_zero) { - nsegs = be32_to_cpup(p++); - p += nsegs * rpcrdma_segment_maxsz; - } - - return (unsigned long)p - (unsigned long)rdma_resp; -} - -void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks) -{ - struct rpcrdma_write_array *ary; - - /* no read-list */ - rmsgp->rm_body.rm_chunks[0] = xdr_zero; - - /* write-array discrim */ - ary = (struct rpcrdma_write_array *) - &rmsgp->rm_body.rm_chunks[1]; - ary->wc_discrim = xdr_one; - ary->wc_nchunks = cpu_to_be32(chunks); - - /* write-list terminator */ - ary->wc_array[chunks].wc_target.rs_handle = xdr_zero; - - /* reply-array discriminator */ - ary->wc_array[chunks].wc_target.rs_length = xdr_zero; -} - -void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary, - int chunks) -{ - ary->wc_discrim = xdr_one; - ary->wc_nchunks = cpu_to_be32(chunks); -} - -void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary, - int chunk_no, - __be32 rs_handle, - __be64 rs_offset, - u32 write_len) -{ - struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target; - seg->rs_handle = rs_handle; - seg->rs_offset = rs_offset; - seg->rs_length = cpu_to_be32(write_len); -} -- cgit v1.2.3 From aee12a0a3727e16fb837367c4755cb6daaf45109 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 20 Apr 2017 00:45:48 +0200 Subject: ebtables: remove nf_hook_register usage Similar to ip_register_table, pass nf_hook_ops to ebt_register_table(). This allows hook registration to be handled via pernet_ops as well, and lets us avoid use of the legacy register_hook API. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge/ebtables.h | 6 ++- net/bridge/netfilter/ebtable_broute.c | 4 +- net/bridge/netfilter/ebtable_filter.c | 15 ++------ net/bridge/netfilter/ebtable_nat.c | 15 ++------ net/bridge/netfilter/ebtables.c | 61 +++++++++++++++++++------------ 5 files changed, 50 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 984b2112c77b..a30efb437e6d 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -109,8 +109,10 @@ struct ebt_table { #define EBT_ALIGN(s) (((s) + (__alignof__(struct _xt_align)-1)) & \ ~(__alignof__(struct _xt_align)-1)) extern struct ebt_table *ebt_register_table(struct net *net, - const struct ebt_table *table); + const struct ebt_table *table, + const struct nf_hook_ops *); extern void ebt_unregister_table(struct net *net, struct ebt_table *table, + const struct nf_hook_ops *); extern unsigned int ebt_do_table(struct sk_buff *skb, const struct nf_hook_state *state, struct ebt_table *table); diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 8fe36dc3aab2..2585b100ebbb 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -65,13 +65,13 @@ static int ebt_broute(struct sk_buff *skb) static int __net_init broute_net_init(struct net *net) { - net->xt.broute_table = ebt_register_table(net, &broute_table); + net->xt.broute_table = ebt_register_table(net, &broute_table, NULL); return PTR_ERR_OR_ZERO(net->xt.broute_table); } static void __net_exit broute_net_exit(struct net *net) { - ebt_unregister_table(net, net->xt.broute_table); + ebt_unregister_table(net, net->xt.broute_table, NULL); } static struct pernet_operations broute_net_ops =
{ diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index 593a1bdc079e..f22ef7c21913 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -93,13 +93,13 @@ static struct nf_hook_ops ebt_ops_filter[] __read_mostly = { static int __net_init frame_filter_net_init(struct net *net) { - net->xt.frame_filter = ebt_register_table(net, &frame_filter); + net->xt.frame_filter = ebt_register_table(net, &frame_filter, ebt_ops_filter); return PTR_ERR_OR_ZERO(net->xt.frame_filter); } static void __net_exit frame_filter_net_exit(struct net *net) { - ebt_unregister_table(net, net->xt.frame_filter); + ebt_unregister_table(net, net->xt.frame_filter, ebt_ops_filter); } static struct pernet_operations frame_filter_net_ops = { @@ -109,20 +109,11 @@ static struct pernet_operations frame_filter_net_ops = { static int __init ebtable_filter_init(void) { - int ret; - - ret = register_pernet_subsys(&frame_filter_net_ops); - if (ret < 0) - return ret; - ret = nf_register_hooks(ebt_ops_filter, ARRAY_SIZE(ebt_ops_filter)); - if (ret < 0) - unregister_pernet_subsys(&frame_filter_net_ops); - return ret; + return register_pernet_subsys(&frame_filter_net_ops); } static void __exit ebtable_filter_fini(void) { - nf_unregister_hooks(ebt_ops_filter, ARRAY_SIZE(ebt_ops_filter)); unregister_pernet_subsys(&frame_filter_net_ops); } diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index eb33919821ee..2f7a4f314406 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -93,13 +93,13 @@ static struct nf_hook_ops ebt_ops_nat[] __read_mostly = { static int __net_init frame_nat_net_init(struct net *net) { - net->xt.frame_nat = ebt_register_table(net, &frame_nat); + net->xt.frame_nat = ebt_register_table(net, &frame_nat, ebt_ops_nat); return PTR_ERR_OR_ZERO(net->xt.frame_nat); } static void __net_exit frame_nat_net_exit(struct net *net) { - ebt_unregister_table(net, net->xt.frame_nat); + ebt_unregister_table(net, net->xt.frame_nat, ebt_ops_nat); } static struct pernet_operations frame_nat_net_ops = { @@ -109,20 +109,11 @@ static struct pernet_operations frame_nat_net_ops = { static int __init ebtable_nat_init(void) { - int ret; - - ret = register_pernet_subsys(&frame_nat_net_ops); - if (ret < 0) - return ret; - ret = nf_register_hooks(ebt_ops_nat, ARRAY_SIZE(ebt_ops_nat)); - if (ret < 0) - unregister_pernet_subsys(&frame_nat_net_ops); - return ret; + return register_pernet_subsys(&frame_nat_net_ops); } static void __exit ebtable_nat_fini(void) { - nf_unregister_hooks(ebt_ops_nat, ARRAY_SIZE(ebt_ops_nat)); unregister_pernet_subsys(&frame_nat_net_ops); } diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index bdc629eb0207..9ec0c9f908fa 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1157,8 +1157,30 @@ free_newinfo: return ret; } +static void __ebt_unregister_table(struct net *net, struct ebt_table *table) +{ + int i; + + mutex_lock(&ebt_mutex); + list_del(&table->list); + mutex_unlock(&ebt_mutex); + EBT_ENTRY_ITERATE(table->private->entries, table->private->entries_size, + ebt_cleanup_entry, net, NULL); + if (table->private->nentries) + module_put(table->me); + vfree(table->private->entries); + if (table->private->chainstack) { + for_each_possible_cpu(i) + vfree(table->private->chainstack[i]); + vfree(table->private->chainstack); + } + vfree(table->private); + kfree(table); +} + struct ebt_table * 
-ebt_register_table(struct net *net, const struct ebt_table *input_table) +ebt_register_table(struct net *net, const struct ebt_table *input_table, + const struct nf_hook_ops *ops) { struct ebt_table_info *newinfo; struct ebt_table *t, *table; @@ -1238,6 +1260,16 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table) } list_add(&table->list, &net->xt.tables[NFPROTO_BRIDGE]); mutex_unlock(&ebt_mutex); + + if (!ops) + return table; + + ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); + if (ret) { + __ebt_unregister_table(net, table); + return ERR_PTR(ret); + } + return table; free_unlock: mutex_unlock(&ebt_mutex); @@ -1256,29 +1288,12 @@ out: return ERR_PTR(ret); } -void ebt_unregister_table(struct net *net, struct ebt_table *table) +void ebt_unregister_table(struct net *net, struct ebt_table *table, + const struct nf_hook_ops *ops) { - int i; - - if (!table) { - BUGPRINT("Request to unregister NULL table!!!\n"); - return; - } - mutex_lock(&ebt_mutex); - list_del(&table->list); - mutex_unlock(&ebt_mutex); - EBT_ENTRY_ITERATE(table->private->entries, table->private->entries_size, - ebt_cleanup_entry, net, NULL); - if (table->private->nentries) - module_put(table->me); - vfree(table->private->entries); - if (table->private->chainstack) { - for_each_possible_cpu(i) - vfree(table->private->chainstack[i]); - vfree(table->private->chainstack); - } - vfree(table->private); - kfree(table); + if (ops) + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + __ebt_unregister_table(net, table); } /* userspace just supplied us with counters */ -- cgit v1.2.3 From 701cac61d0250912b89cbc28589969530179099a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Apr 2017 19:15:53 -0400 Subject: CONFIG_ARCH_HAS_RAW_COPY_USER is unconditional now all architectures converted Signed-off-by: Al Viro --- arch/Kconfig | 3 --- arch/alpha/Kconfig | 1 - arch/arc/Kconfig | 1 - arch/arm/Kconfig | 1 - arch/arm64/Kconfig | 1 - arch/avr32/Kconfig | 1 - arch/blackfin/Kconfig | 1 - arch/c6x/Kconfig | 1 - arch/cris/Kconfig | 1 - arch/frv/Kconfig | 1 - arch/h8300/Kconfig | 1 - arch/hexagon/Kconfig | 1 - arch/ia64/Kconfig | 1 - arch/m32r/Kconfig | 1 - arch/m68k/Kconfig | 1 - arch/metag/Kconfig | 1 - arch/microblaze/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/mn10300/Kconfig | 1 - arch/nios2/Kconfig | 1 - arch/openrisc/Kconfig | 1 - arch/parisc/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/score/Kconfig | 1 - arch/sh/Kconfig | 1 - arch/sparc/Kconfig | 1 - arch/tile/Kconfig | 1 - arch/um/Kconfig.common | 1 - arch/unicore32/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/xtensa/Kconfig | 1 - include/asm-generic/uaccess.h | 41 ----------------------------------------- include/linux/uaccess.h | 7 ++----- lib/Makefile | 4 +--- 35 files changed, 3 insertions(+), 83 deletions(-) (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index 315d37626ddc..cd211a14a88f 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -847,7 +847,4 @@ config STRICT_MODULE_RWX config ARCH_WANT_RELAX_ORDER bool -config ARCH_HAS_RAW_COPY_USER - bool - source "kernel/gcov/Kconfig" diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 1be5f61dc630..0e49d39ea74a 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -26,7 +26,6 @@ config ALPHA select ODD_RT_SIGACTION select OLD_SIGSUSPEND select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67 - select ARCH_HAS_RAW_COPY_USER help The Alpha is a 64-bit general-purpose processor designed and marketed by the Digital Equipment Corporation 
of blessed memory, diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 7e213ff4f01f..c9f30f4763ab 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -44,7 +44,6 @@ config ARC select HAVE_GENERIC_DMA_COHERENT select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZMA - select ARCH_HAS_RAW_COPY_USER config MIGHT_HAVE_PCI bool diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 6fab7f34739c..0d4e71b42c77 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -96,7 +96,6 @@ config ARM select PERF_USE_VMALLOC select RTC_LIB select SYS_SUPPORTS_APM_EMULATION - select ARCH_HAS_RAW_COPY_USER # Above selects are sorted alphabetically; please add new ones # according to that. Thanks. help diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3c833ff3303c..3741859765cf 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -115,7 +115,6 @@ config ARM64 select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK - select ARCH_HAS_RAW_COPY_USER help ARM 64-bit (AArch64) Linux support. diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index 8c349f2a9ebb..7e75d45e20cd 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -19,7 +19,6 @@ config AVR32 select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA select HAVE_NMI - select ARCH_HAS_RAW_COPY_USER help AVR32 is a high-performance 32-bit RISC microprocessor core, designed for cost-sensitive embedded applications, with particular diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index 919dad1436f7..3c1bd640042a 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -41,7 +41,6 @@ config BLACKFIN select MODULES_USE_ELF_RELA select HAVE_DEBUG_STACKOVERFLOW select HAVE_NMI - select ARCH_HAS_RAW_COPY_USER config GENERIC_CSUM def_bool y diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index 3c7bd9a29f90..5aa8ea8bad2d 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -18,7 +18,6 @@ config C6X select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA select ARCH_NO_COHERENT_DMA_MMAP - select ARCH_HAS_RAW_COPY_USER config MMU def_bool n diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 36f94c45e3f9..71b758dc3a96 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -71,7 +71,6 @@ config CRIS select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32 select HAVE_DEBUG_BUGVERBOSE if ETRAX_ARCH_V32 select HAVE_NMI - select ARCH_HAS_RAW_COPY_USER config HZ int diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index e489fef111dd..eefd9a4ed156 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -16,7 +16,6 @@ config FRV select OLD_SIGACTION select HAVE_DEBUG_STACKOVERFLOW select ARCH_NO_COHERENT_DMA_MMAP - select ARCH_HAS_RAW_COPY_USER config ZONE_DMA bool diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 473883417004..3ae852507e57 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -22,7 +22,6 @@ config H8300 select HAVE_ARCH_KGDB select HAVE_ARCH_HASH select CPU_NO_EFFICIENT_FFS - select ARCH_HAS_RAW_COPY_USER config RWSEM_GENERIC_SPINLOCK def_bool y diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 0c536a83ea71..1941e4baaee6 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -26,7 +26,6 @@ config HEXAGON select GENERIC_CLOCKEVENTS_BROADCAST select MODULES_USE_ELF_RELA select GENERIC_CPU_DEVICES - select ARCH_HAS_RAW_COPY_USER ---help--- Qualcomm Hexagon is a processor architecture designed for high performance and low power across a wide variety of applications. 
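An aside on what these mechanical Kconfig hunks buy: with every architecture providing raw_copy_{to,from}_user(), the generic copy_from_user() in lib/usercopy.c can be built unconditionally on top of them. A rough sketch of that generic pattern, simplified here and using an illustrative name (the kernel's real entry points carry additional hardening checks):

/* Sketch only: generic copy_from_user() on top of raw_copy_from_user(),
 * which returns the number of bytes it could NOT copy. The uncopied
 * tail of the kernel buffer is zeroed so callers never see stale
 * kernel memory after a short copy.
 */
static inline unsigned long
copy_from_user_sketch(void *to, const void __user *from, unsigned long n)
{
	unsigned long res = n;

	might_fault();
	if (likely(access_ok(VERIFY_READ, from, n)))
		res = raw_copy_from_user(to, from, n);
	if (unlikely(res))
		memset(to + (n - res), 0, res);
	return res;
}

The memset of the uncopied tail is the detail that makes a short copy safe to expose to callers; the same structure appears in the asm-generic fallback removed later in this patch.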
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 0ed0e44856b2..18ca6a9ce566 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -53,7 +53,6 @@ config IA64 select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HARDENED_USERCOPY - select ARCH_HAS_RAW_COPY_USER default y help The Itanium Processor Family is Intel's 64-bit successor to diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index b3e82bdd6db0..95474460b367 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -19,7 +19,6 @@ config M32R select HAVE_DEBUG_STACKOVERFLOW select CPU_NO_EFFICIENT_FFS select DMA_NOOP_OPS - select ARCH_HAS_RAW_COPY_USER config SBUS bool diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 7d345758ea16..d140206d5d29 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -22,7 +22,6 @@ config M68K select MODULES_USE_ELF_RELA select OLD_SIGSUSPEND3 select OLD_SIGACTION - select ARCH_HAS_RAW_COPY_USER config RWSEM_GENERIC_SPINLOCK bool diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig index ecce0c5ec8e8..5b7a45d99cfb 100644 --- a/arch/metag/Kconfig +++ b/arch/metag/Kconfig @@ -1,6 +1,5 @@ config METAG def_bool y - select ARCH_HAS_RAW_COPY_USER select EMBEDDED select GENERIC_ATOMIC64 select GENERIC_CLOCKEVENTS diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 1aff3658a104..85885a501dce 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -34,7 +34,6 @@ config MICROBLAZE select TRACING_SUPPORT select VIRT_TO_BUS select CPU_NO_EFFICIENT_FFS - select ARCH_HAS_RAW_COPY_USER config SWAP def_bool n diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index aff5633bfe4d..a008a9f03072 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -69,7 +69,6 @@ config MIPS select HAVE_EXIT_THREAD select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_ARCH_HARDENED_USERCOPY - select ARCH_HAS_RAW_COPY_USER menu "Machine selection" diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index a96f3dcb0119..38e3494bfb63 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -16,7 +16,6 @@ config MN10300 select OLD_SIGACTION select HAVE_DEBUG_STACKOVERFLOW select ARCH_NO_COHERENT_DMA_MMAP - select ARCH_HAS_RAW_COPY_USER config AM33_2 def_bool n diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 45b4727e3136..51a56c8b04b4 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -16,7 +16,6 @@ config NIOS2 select SPARSE_IRQ select USB_ARCH_HAS_HCD if USB_SUPPORT select CPU_NO_EFFICIENT_FFS - select ARCH_HAS_RAW_COPY_USER config GENERIC_CSUM def_bool y diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 38954181fa96..1e95920b0737 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -28,7 +28,6 @@ config OPENRISC select OR1K_PIC select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1 select NO_BOOTMEM - select ARCH_HAS_RAW_COPY_USER config MMU def_bool y diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 15b7b279a169..ad294b3fb90b 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -41,7 +41,6 @@ config PARISC select GENERIC_CLOCKEVENTS select ARCH_NO_COHERENT_DMA_MMAP select CPU_NO_EFFICIENT_FFS - select ARCH_HAS_RAW_COPY_USER help The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 839f08887269..97a8bc8a095c 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -87,7 +87,6 @@ config PPC select ARCH_HAS_DMA_SET_COHERENT_MASK select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL - select 
ARCH_HAS_RAW_COPY_USER select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE select ARCH_HAS_SG_CHAIN select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 0724dbf2d8f8..a2dcef0aacc7 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -178,7 +178,6 @@ config S390 select ARCH_HAS_SCALED_CPUTIME select VIRT_TO_BUS select HAVE_NMI - select ARCH_HAS_RAW_COPY_USER config SCHED_OMIT_FRAME_POINTER diff --git a/arch/score/Kconfig b/arch/score/Kconfig index 4d241cfedc3b..507d63181389 100644 --- a/arch/score/Kconfig +++ b/arch/score/Kconfig @@ -15,7 +15,6 @@ config SCORE select MODULES_USE_ELF_REL select CLONE_BACKWARDS select CPU_NO_EFFICIENT_FFS - select ARCH_HAS_RAW_COPY_USER choice prompt "System type" diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index c689645eb8b7..ee086958b2b2 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -48,7 +48,6 @@ config SUPERH select HAVE_ARCH_AUDITSYSCALL select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_NMI - select ARCH_HAS_RAW_COPY_USER help The SuperH is a RISC processor targeted for use in embedded systems and consumer electronics; it was also used in the Sega Dreamcast diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 68353048f24e..68ac5c7cd982 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -45,7 +45,6 @@ config SPARC select HAVE_ARCH_HARDENED_USERCOPY select PROVE_LOCKING_SMALL if PROVE_LOCKING select ARCH_WANT_RELAX_ORDER - select ARCH_HAS_RAW_COPY_USER config SPARC32 def_bool !64BIT diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 27808d86eb57..4583c0320059 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -33,7 +33,6 @@ config TILE select USER_STACKTRACE_SUPPORT select USE_PMC if PERF_EVENTS select VIRT_TO_BUS - select ARCH_HAS_RAW_COPY_USER config MMU def_bool y diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common index e13f763849b1..fd443852103c 100644 --- a/arch/um/Kconfig.common +++ b/arch/um/Kconfig.common @@ -13,7 +13,6 @@ config UML select GENERIC_CLOCKEVENTS select HAVE_GCC_PLUGINS select TTY # Needed for line.c - select ARCH_HAS_RAW_COPY_USER config MMU bool diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index 319519eaed24..0769066929c6 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -18,7 +18,6 @@ config UNICORE32 select ARCH_WANT_FRAME_POINTERS select GENERIC_IOMAP select MODULES_USE_ELF_REL - select ARCH_HAS_RAW_COPY_USER help UniCore-32 is 32-bit Instruction Set Architecture, including a series of low-power-consumption RISC chip diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5f59fc388063..cc98d5a294ee 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -175,7 +175,6 @@ config X86 select USER_STACKTRACE_SUPPORT select VIRT_TO_BUS select X86_FEATURE_NAMES if PROC_FS - select ARCH_HAS_RAW_COPY_USER config INSTRUCTION_DECODER def_bool y diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 043d37d45919..f4126cf997a4 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -29,7 +29,6 @@ config XTENSA select NO_BOOTMEM select PERF_USE_VMALLOC select VIRT_TO_BUS - select ARCH_HAS_RAW_COPY_USER help Xtensa processors are 32-bit RISC machines designed by Tensilica primarily for embedded systems. 
These processors are both diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h index d65c311eb128..bbe4bb438e39 100644 --- a/include/asm-generic/uaccess.h +++ b/include/asm-generic/uaccess.h @@ -86,11 +86,7 @@ static inline int __access_ok(unsigned long addr, unsigned long size) static inline int __put_user_fn(size_t size, void __user *ptr, void *x) { -#ifdef CONFIG_ARCH_HAS_RAW_COPY_USER return unlikely(raw_copy_to_user(ptr, x, size)) ? -EFAULT : 0; -#else - return unlikely(__copy_to_user(ptr, x, size)) ? -EFAULT : 0; -#endif } #define __put_user_fn(sz, u, k) __put_user_fn(sz, u, k) @@ -151,11 +147,7 @@ extern int __put_user_bad(void) __attribute__((noreturn)); #ifndef __get_user_fn static inline int __get_user_fn(size_t size, const void __user *ptr, void *x) { -#ifdef CONFIG_ARCH_HAS_RAW_COPY_USER return unlikely(raw_copy_from_user(x, ptr, size)) ? -EFAULT : 0; -#else - return unlikely(__copy_from_user(x, ptr, size)) ? -EFAULT : 0; -#endif } #define __get_user_fn(sz, u, k) __get_user_fn(sz, u, k) @@ -164,39 +156,6 @@ static inline int __get_user_fn(size_t size, const void __user *ptr, void *x) extern int __get_user_bad(void) __attribute__((noreturn)); -#ifndef CONFIG_ARCH_HAS_RAW_COPY_USER - -#ifndef __copy_from_user_inatomic -#define __copy_from_user_inatomic __copy_from_user -#endif - -#ifndef __copy_to_user_inatomic -#define __copy_to_user_inatomic __copy_to_user -#endif - -static inline long copy_from_user(void *to, - const void __user * from, unsigned long n) -{ - unsigned long res = n; - might_fault(); - if (likely(access_ok(VERIFY_READ, from, n))) - res = __copy_from_user(to, from, n); - if (unlikely(res)) - memset(to + (n - res), 0, res); - return res; -} - -static inline long copy_to_user(void __user *to, - const void *from, unsigned long n) -{ - might_fault(); - if (access_ok(VERIFY_WRITE, to, n)) - return __copy_to_user(to, from, n); - else - return n; -} -#endif - /* * Copy a null terminated string from userspace. */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 7fc2104b88bc..e0cbfb09e60f 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -12,12 +12,10 @@ #include -#ifdef CONFIG_ARCH_HAS_RAW_COPY_USER /* * Architectures should provide two primitives (raw_copy_{to,from}_user()) - * select ARCH_HAS_RAW_COPY_FROM_USER and get rid of their private instances - * of copy_{to,from}_user() and __copy_{to,from}_user{,_inatomic}(). Once - * all of them switch, this part of linux/uaccess.h will become unconditional. + * and get rid of their private instances of copy_{to,from}_user() and + * __copy_{to,from}_user{,_inatomic}(). * * raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and * return the amount left to copy. 
They should assume that access_ok() has @@ -196,7 +194,6 @@ copy_in_user(void __user *to, const void *from, unsigned long n) return n; } #endif -#endif static __always_inline void pagefault_disabled_inc(void) { diff --git a/lib/Makefile b/lib/Makefile index 7d875c389172..b47cf97e1e68 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -41,7 +41,7 @@ obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \ gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \ bsearch.o find_bit.o llist.o memweight.o kfifo.o \ percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \ - once.o refcount.o + once.o refcount.o usercopy.o obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o obj-y += hexdump.o @@ -242,5 +242,3 @@ UBSAN_SANITIZE_ubsan.o := n obj-$(CONFIG_SBITMAP) += sbitmap.o obj-$(CONFIG_PARMAN) += parman.o - -obj-$(CONFIG_ARCH_HAS_RAW_COPY_USER) += usercopy.o -- cgit v1.2.3 From c373fff7bd252ec36e8a895c58a584088f1d38bc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 26 Apr 2017 12:26:22 -0400 Subject: NFSv4: Don't special case "launder" If the client receives a fatal server error from nfs_pageio_add_request(), then we should always truncate the page on which the error occurred. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 2 +- fs/nfs/write.c | 27 +++++++++++---------------- include/linux/nfs_fs.h | 14 +------------- 3 files changed, 13 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index bebed885b6e4..5713eb32a45e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -482,7 +482,7 @@ static int nfs_launder_page(struct page *page) inode->i_ino, (long long)page_offset(page)); nfs_fscache_wait_on_page_write(nfsi, page); - return nfs_wb_launder_page(inode, page); + return nfs_wb_page(inode, page); } static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 2e654940478f..59e21cc0a266 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -586,8 +586,7 @@ nfs_error_is_fatal_on_server(int err) * May return an error if the user signalled nfs_wait_on_request(). */ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page, bool nonblock, - bool launder) + struct page *page, bool nonblock) { struct nfs_page *req; int ret = 0; @@ -610,13 +609,11 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, if (!nfs_pageio_add_request(pgio, req)) { ret = pgio->pg_error; /* - * Remove the problematic req upon fatal errors - * in launder case, while other dirty pages can - * still be around until they get flushed. + * Remove the problematic req upon fatal errors on the server */ if (nfs_error_is_fatal(ret)) { nfs_context_set_write_error(req->wb_context, ret); - if (launder) + if (nfs_error_is_fatal_on_server(ret)) goto out_launder; } nfs_redirty_request(req); @@ -632,13 +629,12 @@ out_launder: } static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, - struct nfs_pageio_descriptor *pgio, bool launder) + struct nfs_pageio_descriptor *pgio) { int ret; nfs_pageio_cond_complete(pgio, page_index(page)); - ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE, - launder); + ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); ret = 0; @@ -650,8 +646,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, * Write an mmapped page to the server. 
*/ static int nfs_writepage_locked(struct page *page, - struct writeback_control *wbc, - bool launder) + struct writeback_control *wbc) { struct nfs_pageio_descriptor pgio; struct inode *inode = page_file_mapping(page)->host; @@ -660,7 +655,7 @@ static int nfs_writepage_locked(struct page *page, nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_pageio_init_write(&pgio, inode, 0, false, &nfs_async_write_completion_ops); - err = nfs_do_writepage(page, wbc, &pgio, launder); + err = nfs_do_writepage(page, wbc, &pgio); nfs_pageio_complete(&pgio); if (err < 0) return err; @@ -673,7 +668,7 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc) { int ret; - ret = nfs_writepage_locked(page, wbc, false); + ret = nfs_writepage_locked(page, wbc); unlock_page(page); return ret; } @@ -682,7 +677,7 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * { int ret; - ret = nfs_do_writepage(page, wbc, data, false); + ret = nfs_do_writepage(page, wbc, data); unlock_page(page); return ret; } @@ -2013,7 +2008,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) /* * Write back all requests on one page - we do this before reading it. */ -int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder) +int nfs_wb_page(struct inode *inode, struct page *page) { loff_t range_start = page_file_offset(page); loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1); @@ -2030,7 +2025,7 @@ int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder) for (;;) { wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { - ret = nfs_writepage_locked(page, &wbc, launder); + ret = nfs_writepage_locked(page, &wbc); if (ret < 0) goto out_error; continue; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 9aa044e76820..bb0eb2c9acca 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -500,24 +500,12 @@ extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned */ extern int nfs_sync_inode(struct inode *inode); extern int nfs_wb_all(struct inode *inode); -extern int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder); +extern int nfs_wb_page(struct inode *inode, struct page *page); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); extern int nfs_commit_inode(struct inode *, int); extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail); extern void nfs_commit_free(struct nfs_commit_data *data); -static inline int -nfs_wb_launder_page(struct inode *inode, struct page *page) -{ - return nfs_wb_single_page(inode, page, true); -} - -static inline int -nfs_wb_page(struct inode *inode, struct page *page) -{ - return nfs_wb_single_page(inode, page, false); -} - static inline int nfs_have_writebacks(struct inode *inode) { -- cgit v1.2.3 From c7e88067c1ae89e7bcbed070fb2c4e30bc39b51f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Apr 2017 16:01:46 -0700 Subject: srcu: Exact tracking of srcu_data structures containing callbacks The current Tree SRCU implementation schedules a workqueue for every srcu_data covered by a given leaf srcu_node structure having callbacks, even if only one of those srcu_data structures actually contains callbacks. This is clearly inefficient for workloads that don't feature callbacks everywhere all the time. This commit therefore adds an array of masks that are used by the leaf srcu_node structures to track exactly which srcu_data structures contain callbacks. Signed-off-by: Paul E. 
McKenney Tested-by: Mike Galbraith --- include/linux/srcutree.h | 4 ++++ kernel/rcu/srcutree.c | 29 +++++++++++++++++++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 0400e211aa44..94515ff226fb 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -47,6 +47,8 @@ struct srcu_data { struct delayed_work work; /* Context for CB invoking. */ struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */ struct srcu_node *mynode; /* Leaf srcu_node. */ + unsigned long grpmask; /* Mask for leaf srcu_node */ + /* ->srcu_data_have_cbs[]. */ int cpu; struct srcu_struct *sp; }; @@ -59,6 +61,8 @@ struct srcu_node { unsigned long srcu_have_cbs[4]; /* GP seq for children */ /* having CBs, but only */ /* is > ->srcu_gq_seq. */ + unsigned long srcu_data_have_cbs[4]; /* Which srcu_data structs */ + /* have CBs for given GP? */ struct srcu_node *srcu_parent; /* Next up in tree. */ int grplo; /* Least CPU for node. */ int grphi; /* Biggest CPU for node. */ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 9ecf0acc18eb..1c2c1004b3b1 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -66,8 +66,12 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) /* Each pass through this loop initializes one srcu_node structure. */ rcu_for_each_node_breadth_first(sp, snp) { spin_lock_init(&snp->lock); - for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) + WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != + ARRAY_SIZE(snp->srcu_data_have_cbs)); + for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { snp->srcu_have_cbs[i] = 0; + snp->srcu_data_have_cbs[i] = 0; + } snp->grplo = -1; snp->grphi = -1; if (snp == &sp->node[0]) { @@ -107,6 +111,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) sdp->cpu = cpu; INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks); sdp->sp = sp; + sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); if (is_static) continue; @@ -434,16 +439,21 @@ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) /* * Schedule callback invocation for all srcu_data structures associated - * with the specified srcu_node structure, if possible, on the corresponding - * CPUs. + * with the specified srcu_node structure that have callbacks for the + * just-completed grace period, the one corresponding to idx. If possible, + * schedule this invocation on the corresponding CPUs. */ -static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp) +static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp, + unsigned long mask) { int cpu; - for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) + for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { + if (!(mask & (1 << (cpu - snp->grplo)))) + continue; srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), atomic_read(&sp->srcu_exp_cnt) ? 0 : SRCU_INTERVAL); + } } /* @@ -461,6 +471,7 @@ static void srcu_gp_end(struct srcu_struct *sp) unsigned long gpseq; int idx; int idxnext; + unsigned long mask; struct srcu_node *snp; /* Prevent more than one additional grace period. */ @@ -486,10 +497,12 @@ static void srcu_gp_end(struct srcu_struct *sp) cbs = snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); + mask = snp->srcu_data_have_cbs[idx]; + snp->srcu_data_have_cbs[idx] = 0; spin_unlock_irq(&snp->lock); if (cbs) { smp_mb(); /* GP end before CB invocation. 
*/ - srcu_schedule_cbs_snp(sp, snp); + srcu_schedule_cbs_snp(sp, snp, mask); } } @@ -536,6 +549,8 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, spin_lock_irqsave(&snp->lock, flags); if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { snp_seq = snp->srcu_have_cbs[idx]; + if (snp == sdp->mynode && snp_seq == s) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; spin_unlock_irqrestore(&snp->lock, flags); if (snp == sdp->mynode && snp_seq != s) { smp_mb(); /* CBs after GP! */ @@ -544,6 +559,8 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, return; } snp->srcu_have_cbs[idx] = s; + if (snp == sdp->mynode) + snp->srcu_data_have_cbs[idx] |= sdp->grpmask; spin_unlock_irqrestore(&snp->lock, flags); } -- cgit v1.2.3 From 7f6733c3c648ddd6cf459c1b80ad388a95452955 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 18 Apr 2017 17:17:35 -0700 Subject: srcu: Make rcutorture writer stalls print SRCU GP state In the past, SRCU was simple enough that there was little point in making the rcutorture writer stall messages print the SRCU grace-period number state. With the advent of Tree SRCU, this has changed. This commit therefore makes Classic, Tiny, and Tree SRCU report this state to rcutorture as needed. Signed-off-by: Paul E. McKenney Tested-by: Mike Galbraith --- include/linux/srcuclassic.h | 14 ++++++++++++++ include/linux/srcutiny.h | 12 ++++++++++++ include/linux/srcutree.h | 4 ++++ kernel/rcu/rcutorture.c | 8 +++++--- kernel/rcu/srcutree.c | 13 +++++++++++++ kernel/rcu/tree.c | 12 ++++-------- 6 files changed, 52 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h index 41cf99930f34..5753f7322262 100644 --- a/include/linux/srcuclassic.h +++ b/include/linux/srcuclassic.h @@ -98,4 +98,18 @@ void synchronize_srcu_expedited(struct srcu_struct *sp); void srcu_barrier(struct srcu_struct *sp); unsigned long srcu_batches_completed(struct srcu_struct *sp); +static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + if (test_type != SRCU_FLAVOR) + return; + *flags = 0; + *completed = sp->completed; + *gpnum = *completed; + if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check1.head) + (*gpnum)++; +} + #endif diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 4f284e4f4d8c..42311ee0334f 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -78,4 +78,16 @@ static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) { return 0; } +static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + if (test_type != SRCU_FLAVOR) + return; + *flags = 0; + *completed = sp->srcu_gp_seq; + *gpnum = *completed; +} + #endif diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 94515ff226fb..3865717df124 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -140,4 +140,8 @@ void synchronize_srcu_expedited(struct srcu_struct *sp); void srcu_barrier(struct srcu_struct *sp); unsigned long srcu_batches_completed(struct srcu_struct *sp); +void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, unsigned long *completed); + #endif diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index e9d4527cdd43..ae6e574d4cf5 100644 ---
a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1360,12 +1360,14 @@ rcu_torture_stats_print(void) cur_ops->stats(); if (rtcv_snap == rcu_torture_current_version && rcu_torture_current != NULL) { - int __maybe_unused flags; - unsigned long __maybe_unused gpnum; - unsigned long __maybe_unused completed; + int __maybe_unused flags = 0; + unsigned long __maybe_unused gpnum = 0; + unsigned long __maybe_unused completed = 0; rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); + srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, + &flags, &gpnum, &completed); wtp = READ_ONCE(writer_task); pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", rcu_torture_writer_state_getname(), diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 1c2c1004b3b1..72b6cce5f591 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1011,3 +1011,16 @@ void process_srcu(struct work_struct *work) srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) ? 0 : SRCU_INTERVAL); } EXPORT_SYMBOL_GPL(process_srcu); + +void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + if (test_type != SRCU_FLAVOR) + return; + *flags = 0; + *completed = rcu_seq_ctr(sp->srcu_gp_seq); + *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed); +} +EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 23aa02587d0f..91fff49d5869 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -704,15 +704,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, default: break; } - if (rsp != NULL) { - *flags = READ_ONCE(rsp->gp_flags); - *gpnum = READ_ONCE(rsp->gpnum); - *completed = READ_ONCE(rsp->completed); + if (rsp == NULL) return; - } - *flags = 0; - *gpnum = 0; - *completed = 0; + *flags = READ_ONCE(rsp->gp_flags); + *gpnum = READ_ONCE(rsp->gpnum); + *completed = READ_ONCE(rsp->completed); } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); -- cgit v1.2.3 From f555f34fdc586a56204cd16d9a7c104ec6cb6650 Mon Sep 17 00:00:00 2001 From: Alexander Kochetkov Date: Thu, 20 Apr 2017 14:00:04 +0300 Subject: net: phy: fix auto-negotiation stall due to unavailable interrupt The Ethernet link on an interrupt driven PHY was not coming up if the Ethernet cable was plugged in before the Ethernet interface was brought up. The patch triggers the PHY state machine to update the link state if the PHY was requested to do auto-negotiation and the auto-negotiation complete flag is already set. During the power-up cycle the PHY does auto-negotiation, generates an interrupt and sets the auto-negotiation complete flag. The interrupt is handled by the PHY state machine but doesn't update the link state, because the PHY is in the PHY_READY state. Some time later the MAC is brought up and started, and requests the PHY to do auto-negotiation. If there are no new settings to advertise, genphy_config_aneg() doesn't start PHY auto-negotiation. The PHY stays in the auto-negotiation complete state and doesn't fire an interrupt. At the same time the PHY state machine expects that the PHY started auto-negotiation, and waits for an interrupt from the PHY that will never arrive. Fixes: 321beec5047a ("net: phy: Use interrupts when available in NOLINK state") Signed-off-by: Alexander Kochetkov Cc: stable # v4.9+ Tested-by: Roger Quadros Tested-by: Alexandre Belloni Signed-off-by: David S.
Miller --- drivers/net/phy/phy.c | 40 ++++++++++++++++++++++++++++++++++++---- include/linux/phy.h | 1 + 2 files changed, 37 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index a2bfc82e95d7..97ff1278167b 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -591,16 +591,18 @@ int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd) EXPORT_SYMBOL(phy_mii_ioctl); /** - * phy_start_aneg - start auto-negotiation for this PHY device + * phy_start_aneg_priv - start auto-negotiation for this PHY device * @phydev: the phy_device struct + * @sync: indicate whether we should wait for the workqueue cancelation * * Description: Sanitizes the settings (if we're not autonegotiating * them), and then calls the driver's config_aneg function. * If the PHYCONTROL Layer is operating, we change the state to * reflect the beginning of Auto-negotiation or forcing. */ -int phy_start_aneg(struct phy_device *phydev) +static int phy_start_aneg_priv(struct phy_device *phydev, bool sync) { + bool trigger = 0; int err; if (!phydev->drv) @@ -628,10 +630,40 @@ int phy_start_aneg(struct phy_device *phydev) } } + /* Re-schedule a PHY state machine to check PHY status because + * negotiation may already be done and aneg interrupt may not be + * generated. + */ + if (phy_interrupt_is_valid(phydev) && (phydev->state == PHY_AN)) { + err = phy_aneg_done(phydev); + if (err > 0) { + trigger = true; + err = 0; + } + } + out_unlock: mutex_unlock(&phydev->lock); + + if (trigger) + phy_trigger_machine(phydev, sync); + return err; } + +/** + * phy_start_aneg - start auto-negotiation for this PHY device + * @phydev: the phy_device struct + * + * Description: Sanitizes the settings (if we're not autonegotiating + * them), and then calls the driver's config_aneg function. + * If the PHYCONTROL Layer is operating, we change the state to + * reflect the beginning of Auto-negotiation or forcing. + */ +int phy_start_aneg(struct phy_device *phydev) +{ + return phy_start_aneg_priv(phydev, true); +} EXPORT_SYMBOL(phy_start_aneg); /** @@ -659,7 +691,7 @@ void phy_start_machine(struct phy_device *phydev) * state machine runs. 
*/ -static void phy_trigger_machine(struct phy_device *phydev, bool sync) +void phy_trigger_machine(struct phy_device *phydev, bool sync) { if (sync) cancel_delayed_work_sync(&phydev->state_queue); @@ -1154,7 +1186,7 @@ void phy_state_machine(struct work_struct *work) mutex_unlock(&phydev->lock); if (needs_aneg) - err = phy_start_aneg(phydev); + err = phy_start_aneg_priv(phydev, false); else if (do_suspend) phy_suspend(phydev); diff --git a/include/linux/phy.h b/include/linux/phy.h index 43a774873aa9..fb3857337151 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -852,6 +852,7 @@ void phy_change_work(struct work_struct *work); void phy_mac_interrupt(struct phy_device *phydev, int new_link); void phy_start_machine(struct phy_device *phydev); void phy_stop_machine(struct phy_device *phydev); +void phy_trigger_machine(struct phy_device *phydev, bool sync); int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd); int phy_ethtool_gset(struct phy_device *phydev, struct ethtool_cmd *cmd); int phy_ethtool_ksettings_get(struct phy_device *phydev, -- cgit v1.2.3 From 038a3e858de4e3ddf42c330a22b7efcddbc0a81a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 25 Apr 2017 11:41:34 +0200 Subject: rhashtable: remove insecure_max_entries param no users in the tree; insecure_max_entries is always set to ht->p.max_size * 2 in rhashtable_init(). Replace the only spot that uses it with a ht->p.max_size check. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 6 ++---- lib/rhashtable.c | 6 ------ 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index ae87dcdf52d2..ae93b65d13d7 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -125,7 +125,6 @@ struct rhashtable; * @key_len: Length of key * @key_offset: Offset of key in struct to be hashed * @head_offset: Offset of rhash_head in struct to be hashed - * @insecure_max_entries: Maximum number of entries (may be exceeded) * @max_size: Maximum size while expanding * @min_size: Minimum size while shrinking * @nulls_base: Base value to generate nulls marker @@ -140,7 +139,6 @@ struct rhashtable_params { size_t key_len; size_t key_offset; size_t head_offset; - unsigned int insecure_max_entries; unsigned int max_size; unsigned int min_size; u32 nulls_base; @@ -329,8 +327,8 @@ static inline bool rht_grow_above_100(const struct rhashtable *ht, static inline bool rht_grow_above_max(const struct rhashtable *ht, const struct bucket_table *tbl) { - return ht->p.insecure_max_entries && - atomic_read(&ht->nelems) >= ht->p.insecure_max_entries; + return ht->p.max_size && + (atomic_read(&ht->nelems) / 2u) >= ht->p.max_size; } /* The bucket lock is selected based on the hash and protects mutations diff --git a/lib/rhashtable.c b/lib/rhashtable.c index d22a5ef109fb..f3b82e0d417b 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -961,12 +961,6 @@ int rhashtable_init(struct rhashtable *ht, if (params->max_size) ht->p.max_size = rounddown_pow_of_two(params->max_size); - if (params->insecure_max_entries) - ht->p.insecure_max_entries = - rounddown_pow_of_two(params->insecure_max_entries); - else - ht->p.insecure_max_entries = ht->p.max_size * 2; - ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE); if (params->nelem_hint) -- cgit v1.2.3 From 69e996c58a35db9ca79b3f021a15bcd22202e1c0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Apr 2017 10:15:32 -0700 Subject: tcp: add
tp->tcp_mstamp field We want to use precise timestamps in the TCP stack, but we do not want to call possibly expensive kernel time services too often. tp->tcp_mstamp is guaranteed to be updated once per incoming packet. We will use it in the following patches, removing specific skb_mstamp_get() calls, and removing ack_time from struct tcp_sacktag_state. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp_input.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index cbe5b602a2d3..99a22f44c32e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -240,6 +240,7 @@ struct tcp_sock { u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ /* RTT measurement */ + struct skb_mstamp tcp_mstamp; /* most recent packet received/sent */ u32 srtt_us; /* smoothed round trip time << 3 in usecs */ u32 mdev_us; /* medium deviation */ u32 mdev_max_us; /* maximal mdev for the last rtt period */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5af2f04f8859..bd18c65df4a9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5362,6 +5362,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); + skb_mstamp_get(&tp->tcp_mstamp); if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); /* @@ -5922,6 +5923,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_SYN_SENT: tp->rx_opt.saw_tstamp = 0; + skb_mstamp_get(&tp->tcp_mstamp); queued = tcp_rcv_synsent_state_process(sk, skb, th); if (queued >= 0) return queued; @@ -5933,6 +5935,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) return 0; } + skb_mstamp_get(&tp->tcp_mstamp); tp->rx_opt.saw_tstamp = 0; req = tp->fastopen_rsk; if (req) { -- cgit v1.2.3 From 645f4c6f2ebd040688cc2a5f626ffc909e66ccf2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Apr 2017 10:15:41 -0700 Subject: tcp: switch rcv_rtt_est and rcvq_space to high resolution timestamps Some devices or distributions use HZ=100 or HZ=250. TCP receive buffer autotuning has poor behavior caused by this choice. Since autotuning happens after 4 ms or 10 ms, short distance flows get their receive buffer tuned to a very high value, but only after an initial period where it was frozen to the (too small) initial value. With the tp->tcp_mstamp introduction, we can switch to high resolution timestamps almost for free (at the expense of 8 additional bytes per TCP structure). Note that some TCP stacks use usec TCP timestamps where this patch makes even more sense: many TCP flows have < 500 usec RTT. Hopefully this finer TS option can be standardized soon. Tested: HZ=100 kernel ./netperf -H lpaa24 -t TCP_RR -l 1000 -- -r 10000,10000 & Peer without patch: lpaa24:~# ss -tmi dst lpaa23 ... skmem:(r0,rb8388608,...) rcv_rtt:10 rcv_space:3210000 minrtt:0.017 Peer with the patch: lpaa23:~# ss -tmi dst lpaa24 ... skmem:(r0,rb428800,...) rcv_rtt:0.069 rcv_space:30000 minrtt:0.017 We can see saner RCVBUF, and more precise rcv_rtt information. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S.
Miller --- include/linux/tcp.h | 12 ++++++------ net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 28 +++++++++++++++++----------- 3 files changed, 24 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 99a22f44c32e..b6d5adcee8fc 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -333,16 +333,16 @@ struct tcp_sock { /* Receiver side RTT estimation */ struct { - u32 rtt; - u32 seq; - u32 time; + u32 rtt_us; + u32 seq; + struct skb_mstamp time; } rcv_rtt_est; /* Receiver queue space */ struct { - int space; - u32 seq; - u32 time; + int space; + u32 seq; + struct skb_mstamp time; } rcvq_space; /* TCP-specific MTU probe information. */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index efc976ae66ae..059dad7deefe 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2853,7 +2853,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_snd_ssthresh = tp->snd_ssthresh; info->tcpi_advmss = tp->advmss; - info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3; + info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3; info->tcpi_rcv_space = tp->rcvq_space.space; info->tcpi_total_retrans = tp->total_retrans; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f475f0b53bfe..9739962bfb3f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -442,7 +442,8 @@ void tcp_init_buffer_space(struct sock *sk) tcp_sndbuf_expand(sk); tp->rcvq_space.space = tp->rcv_wnd; - tp->rcvq_space.time = tcp_time_stamp; + skb_mstamp_get(&tp->tcp_mstamp); + tp->rcvq_space.time = tp->tcp_mstamp; tp->rcvq_space.seq = tp->copied_seq; maxwin = tcp_full_space(sk); @@ -518,7 +519,7 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss); */ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) { - u32 new_sample = tp->rcv_rtt_est.rtt; + u32 new_sample = tp->rcv_rtt_est.rtt_us; long m = sample; if (m == 0) @@ -548,21 +549,23 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) new_sample = m << 3; } - if (tp->rcv_rtt_est.rtt != new_sample) - tp->rcv_rtt_est.rtt = new_sample; + tp->rcv_rtt_est.rtt_us = new_sample; } static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) { - if (tp->rcv_rtt_est.time == 0) + u32 delta_us; + + if (tp->rcv_rtt_est.time.v64 == 0) goto new_measure; if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; - tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1); + delta_us = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcv_rtt_est.time); + tcp_rcv_rtt_update(tp, delta_us, 1); new_measure: tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; - tp->rcv_rtt_est.time = tcp_time_stamp; + tp->rcv_rtt_est.time = tp->tcp_mstamp; } static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, @@ -572,7 +575,10 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, if (tp->rx_opt.rcv_tsecr && (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) - tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); + tcp_rcv_rtt_update(tp, + jiffies_to_usecs(tcp_time_stamp - + tp->rx_opt.rcv_tsecr), + 0); } /* @@ -585,8 +591,8 @@ void tcp_rcv_space_adjust(struct sock *sk) int time; int copied; - time = tcp_time_stamp - tp->rcvq_space.time; - if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) + time = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcvq_space.time); + if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) return; /* Number of bytes copied to user in last RTT */ @@ -642,7 +648,7 @@ void 
tcp_rcv_space_adjust(struct sock *sk) new_measure: tp->rcvq_space.seq = tp->copied_seq; - tp->rcvq_space.time = tcp_time_stamp; + tp->rcvq_space.time = tp->tcp_mstamp; } /* There is something which you must keep in mind when you analyze the -- cgit v1.2.3 From 2836ee4b1acbe7b396219d0677426885f14cd792 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 26 Apr 2017 13:47:56 -0700 Subject: blk-mq: Add blk_mq_ops.show_rq() This new callback function will be used in the next patch to show more information about SCSI requests. Signed-off-by: Bart Van Assche Reviewed-by: Omar Sandoval Cc: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 6 +++++- include/linux/blk-mq.h | 8 ++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index ac39093c4ef7..bcd2a7d4a3a5 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -311,6 +311,7 @@ static const char *const rqf_name[] = { static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v) { struct request *rq = list_entry_rq(v); + const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; const unsigned int op = rq->cmd_flags & REQ_OP_MASK; seq_printf(m, "%p {.op=", rq); @@ -324,8 +325,11 @@ static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v) seq_puts(m, ", .rq_flags="); blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, ARRAY_SIZE(rqf_name)); - seq_printf(m, ", .tag=%d, .internal_tag=%d}\n", rq->tag, + seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, rq->internal_tag); + if (mq_ops->show_rq) + mq_ops->show_rq(m, rq); + seq_puts(m, "}\n"); return 0; } diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 0c4dadb85f62..32bd8eb5ba67 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -142,6 +142,14 @@ struct blk_mq_ops { reinit_request_fn *reinit_request; map_queues_fn *map_queues; + +#ifdef CONFIG_BLK_DEBUG_FS + /* + * Used by the debugfs implementation to show driver-specific + * information about a request. + */ + void (*show_rq)(struct seq_file *m, struct request *rq); +#endif }; enum { -- cgit v1.2.3 From 1cbf41dbacb6c8decdf8d838bbf5ca5b448a269f Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Wed, 26 Apr 2017 10:58:46 +0300 Subject: ieee80211: add SUITE_B AKM selectors Add the definitions for SUITE_B and SUITE_B_192 AKM selectors as defined in IEEE802.11REVmc_D5.0, table 9-132. 
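For illustration only, not part of the patch: with the SUITE() encoding used above, code that has parsed an RSN AKM selector into a u32 could classify Suite B along these lines. The helper name is invented here; only the two WLAN_AKM_SUITE_* selectors come from the hunk below.

	/* Hypothetical helper, not in the tree; matches both Suite B selectors. */
	static bool wlan_akm_is_suite_b(u32 akm)
	{
		return akm == WLAN_AKM_SUITE_8021X_SUITE_B ||
		       akm == WLAN_AKM_SUITE_8021X_SUITE_B_192;
	}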
Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 294fa6273a62..23e095fa6701 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2356,18 +2356,20 @@ enum ieee80211_sa_query_action { #define WLAN_CIPHER_SUITE_SMS4 SUITE(0x001472, 1) /* AKM suite selectors */ -#define WLAN_AKM_SUITE_8021X SUITE(0x000FAC, 1) -#define WLAN_AKM_SUITE_PSK SUITE(0x000FAC, 2) -#define WLAN_AKM_SUITE_FT_PSK SUITE(0x000FAC, 4) -#define WLAN_AKM_SUITE_8021X_SHA256 SUITE(0x000FAC, 5) -#define WLAN_AKM_SUITE_PSK_SHA256 SUITE(0x000FAC, 6) -#define WLAN_AKM_SUITE_TDLS SUITE(0x000FAC, 7) -#define WLAN_AKM_SUITE_SAE SUITE(0x000FAC, 8) -#define WLAN_AKM_SUITE_FT_OVER_SAE SUITE(0x000FAC, 9) -#define WLAN_AKM_SUITE_FILS_SHA256 SUITE(0x000FAC, 14) -#define WLAN_AKM_SUITE_FILS_SHA384 SUITE(0x000FAC, 15) -#define WLAN_AKM_SUITE_FT_FILS_SHA256 SUITE(0x000FAC, 16) -#define WLAN_AKM_SUITE_FT_FILS_SHA384 SUITE(0x000FAC, 17) +#define WLAN_AKM_SUITE_8021X SUITE(0x000FAC, 1) +#define WLAN_AKM_SUITE_PSK SUITE(0x000FAC, 2) +#define WLAN_AKM_SUITE_FT_PSK SUITE(0x000FAC, 4) +#define WLAN_AKM_SUITE_8021X_SHA256 SUITE(0x000FAC, 5) +#define WLAN_AKM_SUITE_PSK_SHA256 SUITE(0x000FAC, 6) +#define WLAN_AKM_SUITE_TDLS SUITE(0x000FAC, 7) +#define WLAN_AKM_SUITE_SAE SUITE(0x000FAC, 8) +#define WLAN_AKM_SUITE_FT_OVER_SAE SUITE(0x000FAC, 9) +#define WLAN_AKM_SUITE_8021X_SUITE_B SUITE(0x000FAC, 11) +#define WLAN_AKM_SUITE_8021X_SUITE_B_192 SUITE(0x000FAC, 12) +#define WLAN_AKM_SUITE_FILS_SHA256 SUITE(0x000FAC, 14) +#define WLAN_AKM_SUITE_FILS_SHA384 SUITE(0x000FAC, 15) +#define WLAN_AKM_SUITE_FT_FILS_SHA256 SUITE(0x000FAC, 16) +#define WLAN_AKM_SUITE_FT_FILS_SHA384 SUITE(0x000FAC, 17) #define WLAN_MAX_KEY_LEN 32 -- cgit v1.2.3 From 2ead3235fd7128347a60a3942b3e2048834d62aa Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Wed, 26 Apr 2017 10:58:48 +0300 Subject: ieee80211: add FT-802.1X AKM suite selector Add the definition for the FT-802.1X AKM selector as defined in IEEE Std 802.11-2016, table 9-133. Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 23e095fa6701..52abfbcd5975 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2358,6 +2358,7 @@ enum ieee80211_sa_query_action { /* AKM suite selectors */ #define WLAN_AKM_SUITE_8021X SUITE(0x000FAC, 1) #define WLAN_AKM_SUITE_PSK SUITE(0x000FAC, 2) +#define WLAN_AKM_SUITE_FT_8021X SUITE(0x000FAC, 3) #define WLAN_AKM_SUITE_FT_PSK SUITE(0x000FAC, 4) #define WLAN_AKM_SUITE_8021X_SHA256 SUITE(0x000FAC, 5) #define WLAN_AKM_SUITE_PSK_SHA256 SUITE(0x000FAC, 6) -- cgit v1.2.3 From f6601e176c8b01bc545959c091778343a8c66951 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 26 Apr 2017 10:58:52 +0300 Subject: ieee80211: fix kernel-doc parsing errors Some of the enum definitions are unnamed but there's still an attempt at documenting them - that doesn't work. Name them to make that work.
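For illustration, the kernel-doc convention this fix relies on: the parser can only bind a /** ... */ comment to a named enum, so a sketch like the one below (an invented example, mirroring the shape of the mesh enums in the hunk that follows) is documentable, while the same comment on an anonymous enum { ... } gives kernel-doc nothing to attach the documentation to.

	/**
	 * enum example_method - example of a documentable enum
	 * @EXAMPLE_METHOD_DEFAULT: the default method
	 * @EXAMPLE_METHOD_VENDOR: a vendor specific method
	 */
	enum example_method {
		EXAMPLE_METHOD_DEFAULT = 1,
		EXAMPLE_METHOD_VENDOR = 255,
	};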
Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 52abfbcd5975..639e77abf064 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2177,37 +2177,37 @@ enum ieee80211_tdls_actioncode { #define WLAN_BSS_COEX_INFORMATION_REQUEST BIT(0) /** - * enum - mesh synchronization method identifier + * enum ieee80211_mesh_sync_method - mesh synchronization method identifier * * @IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET: the default synchronization method * @IEEE80211_SYNC_METHOD_VENDOR: a vendor specific synchronization method * that will be specified in a vendor specific information element */ -enum { +enum ieee80211_mesh_sync_method { IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET = 1, IEEE80211_SYNC_METHOD_VENDOR = 255, }; /** - * enum - mesh path selection protocol identifier + * enum ieee80211_mesh_path_protocol - mesh path selection protocol identifier * * @IEEE80211_PATH_PROTOCOL_HWMP: the default path selection protocol * @IEEE80211_PATH_PROTOCOL_VENDOR: a vendor specific protocol that will * be specified in a vendor specific information element */ -enum { +enum ieee80211_mesh_path_protocol { IEEE80211_PATH_PROTOCOL_HWMP = 1, IEEE80211_PATH_PROTOCOL_VENDOR = 255, }; /** - * enum - mesh path selection metric identifier + * enum ieee80211_mesh_path_metric - mesh path selection metric identifier * * @IEEE80211_PATH_METRIC_AIRTIME: the default path selection metric * @IEEE80211_PATH_METRIC_VENDOR: a vendor specific metric that will be * specified in a vendor specific information element */ -enum { +enum ieee80211_mesh_path_metric { IEEE80211_PATH_METRIC_AIRTIME = 1, IEEE80211_PATH_METRIC_VENDOR = 255, }; -- cgit v1.2.3 From bfd20f1cc85010d2f2d77e544da05cd8c149ba9b Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 26 Apr 2017 09:18:35 -0700 Subject: x86, iommu/vt-d: Add an option to disable Intel IOMMU force on IOMMU harms performance significantly when we run very fast networking workloads. It's 40Gbit networking doing an XDP test. Software overhead is almost invisible, but it's the IOTLB miss (based on our analysis) which kills the performance. We observed the same performance issue even with software passthrough (identity mapping); only the hardware passthrough survives. The pps with iommu (with software passthrough) is only about 30% of that without it. This is a limitation in hardware based on our observation, so we'd like to disable the IOMMU force on, but we do want to use TBOOT and we can sacrifice the DMA security bought by IOMMU. I must admit I know nothing about TBOOT, but the TBOOT guys (cc-ed) think not enabling IOMMU is totally ok. So introduce a new boot option to disable the force on. It's kind of silly that we need to run into intel_iommu_init even without force on, but we need to disable the TBOOT PMR registers. For systems without the boot option, nothing is changed.
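As the intel_iommu_setup() hunk below shows, the new flag is parsed as a sub-option of the existing intel_iommu= kernel parameter, so a tboot system would opt out of the forced-on IOMMU with a command line along these lines (a usage sketch only; the authoritative description is the kernel-parameters.txt hunk in this patch):

	intel_iommu=tboot_noforce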
Signed-off-by: Shaohua Li Signed-off-by: Joerg Roedel --- Documentation/admin-guide/kernel-parameters.txt | 9 +++++++++ arch/x86/kernel/tboot.c | 3 +++ drivers/iommu/intel-iommu.c | 18 ++++++++++++++++++ include/linux/dma_remapping.h | 1 + 4 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2ba45caabada..17135bfade6a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1578,6 +1578,15 @@ extended tables themselves, and also PASID support. With this option set, extended tables will not be used even on hardware which claims to support them. + tboot_noforce [Default Off] + Do not force the Intel IOMMU enabled under tboot. + By default, tboot will force Intel IOMMU on, which + could harm performance of some high-throughput + devices like 40GBit network cards, even if identity + mapping is enabled. + Note that using this option lowers the security + provided by tboot because it makes the system + vulnerable to DMA attacks. intel_idle.max_cstate= [KNL,HW,ACPI,X86] 0 disables intel_idle and fall back on acpi_idle. diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index b868fa1b812b..edbdfe6ab60a 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -510,6 +510,9 @@ int tboot_force_iommu(void) if (!tboot_enabled()) return 0; + if (!intel_iommu_tboot_noforce) + return 1; + if (no_iommu || swiotlb || dmar_disabled) pr_warning("Forcing Intel-IOMMU to enabled\n"); diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 5f08ba13972b..b0ced1c13713 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -183,6 +183,7 @@ static int rwbf_quirk; * (used when kernel is launched w/ TXT) */ static int force_on = 0; +int intel_iommu_tboot_noforce; /* * 0: Present @@ -607,6 +608,10 @@ static int __init intel_iommu_setup(char *str) "Intel-IOMMU: enable pre-production PASID support\n"); intel_iommu_pasid28 = 1; iommu_identity_mapping |= IDENTMAP_GFX; + } else if (!strncmp(str, "tboot_noforce", 13)) { + printk(KERN_INFO + "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n"); + intel_iommu_tboot_noforce = 1; } str += strcspn(str, ","); @@ -4850,6 +4855,19 @@ int __init intel_iommu_init(void) } if (no_iommu || dmar_disabled) { + /* + * We exit the function here to ensure IOMMU's remapping and + * mempool aren't setup, which means that the IOMMU's PMRs + * won't be disabled via the call to init_dmars(). So disable + * it explicitly here. The PMRs were setup by tboot prior to + * calling SENTER, but the kernel is expected to reset/tear + * down the PMRs. 
+ */ + if (intel_iommu_tboot_noforce) { + for_each_iommu(iommu, drhd) + iommu_disable_protect_mem_regions(iommu); + } + /* * Make sure the IOMMUs are switched off, even when we * boot into a kexec kernel and the previous kernel left diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h index 187c10299722..90884072fa73 100644 --- a/include/linux/dma_remapping.h +++ b/include/linux/dma_remapping.h @@ -39,6 +39,7 @@ extern int iommu_calculate_agaw(struct intel_iommu *iommu); extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu); extern int dmar_disabled; extern int intel_iommu_enabled; +extern int intel_iommu_tboot_noforce; #else static inline int iommu_calculate_agaw(struct intel_iommu *iommu) { -- cgit v1.2.3 From 1e9a038b7fe9a8c10ef1238f4e695d5fbe0dd594 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 Apr 2017 16:02:09 -0700 Subject: srcu: Expedited grace periods with reduced memory contention Commit f60d231a87c5 ("srcu: Crude control of expedited grace periods") introduced a per-srcu_struct atomic counter to track outstanding requests for grace periods. This works, but represents a memory-contention bottleneck. This commit therefore uses the srcu_node combining tree to remove this bottleneck. This commit adds new ->srcu_gp_seq_needed_exp fields to the srcu_data, srcu_node, and srcu_struct structures, which track the farthest-in-the-future grace period that must be expedited, which in turn requires that all nearer-term grace periods also be expedited. Requests for expediting start with the srcu_data structure, run up through the srcu_node tree, and end at the srcu_struct structure. Note that it may be necessary to expedite a grace period that just now started, and this is handled by a new srcu_funnel_exp_start() function, which is invoked when the grace period itself is already in its way, but when that grace period was not marked as expedited. A new srcu_get_delay() function returns zero if there is at least one expedited SRCU grace period in flight, or SRCU_INTERVAL otherwise. This function is used to calculate delays: Normal grace periods are allowed to extend in order to cover more requests with a given grace-period computation, which decreases per-request overhead. Signed-off-by: Paul E. McKenney Tested-by: Mike Galbraith --- include/linux/srcutree.h | 4 +- kernel/rcu/srcutree.c | 135 +++++++++++++++++++++++++++++++++-------------- 2 files changed, 98 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 3865717df124..86df48d3e97b 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -43,6 +43,7 @@ struct srcu_data { spinlock_t lock ____cacheline_internodealigned_in_smp; struct rcu_segcblist srcu_cblist; /* List of callbacks.*/ unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */ + unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ bool srcu_cblist_invoking; /* Invoking these CBs? */ struct delayed_work work; /* Context for CB invoking. */ struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */ @@ -63,6 +64,7 @@ struct srcu_node { /* is > ->srcu_gq_seq. */ unsigned long srcu_data_have_cbs[4]; /* Which srcu_data structs */ /* have CBs for given GP? */ + unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ struct srcu_node *srcu_parent; /* Next up in tree. */ int grplo; /* Least CPU for node. */ int grphi; /* Biggest CPU for node. 
*/ @@ -81,7 +83,7 @@ struct srcu_struct { unsigned int srcu_idx; /* Current rdr array element. */ unsigned long srcu_gp_seq; /* Grace-period seq #. */ unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ - atomic_t srcu_exp_cnt; /* # ongoing expedited GPs. */ + unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 72b6cce5f591..4b98e6f45166 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -72,6 +72,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) snp->srcu_have_cbs[i] = 0; snp->srcu_data_have_cbs[i] = 0; } + snp->srcu_gp_seq_needed_exp = 0; snp->grplo = -1; snp->grphi = -1; if (snp == &sp->node[0]) { @@ -102,6 +103,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) rcu_segcblist_init(&sdp->srcu_cblist); sdp->srcu_cblist_invoking = false; sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; + sdp->srcu_gp_seq_needed_exp = sp->srcu_gp_seq; sdp->mynode = &snp_first[cpu / levelspread[level]]; for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { if (snp->grplo < 0) @@ -135,7 +137,6 @@ static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static) mutex_init(&sp->srcu_gp_mutex); sp->srcu_idx = 0; sp->srcu_gp_seq = 0; - atomic_set(&sp->srcu_exp_cnt, 0); sp->srcu_barrier_seq = 0; mutex_init(&sp->srcu_barrier_mutex); atomic_set(&sp->srcu_barrier_cpu_cnt, 0); @@ -143,6 +144,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static) if (!is_static) sp->sda = alloc_percpu(struct srcu_data); init_srcu_struct_nodes(sp, is_static); + sp->srcu_gp_seq_needed_exp = 0; smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */ return sp->sda ? 0 : -ENOMEM; } @@ -307,6 +309,18 @@ static bool srcu_readers_active(struct srcu_struct *sp) #define SRCU_INTERVAL 1 +/* + * Return grace-period delay, zero if there are expedited grace + * periods pending, SRCU_INTERVAL otherwise. + */ +static unsigned long srcu_get_delay(struct srcu_struct *sp) +{ + if (ULONG_CMP_LT(READ_ONCE(sp->srcu_gp_seq), + READ_ONCE(sp->srcu_gp_seq_needed_exp))) + return 0; + return SRCU_INTERVAL; +} + /** * cleanup_srcu_struct - deconstruct a sleep-RCU structure * @sp: structure to clean up. @@ -318,7 +332,8 @@ void cleanup_srcu_struct(struct srcu_struct *sp) { int cpu; - WARN_ON_ONCE(atomic_read(&sp->srcu_exp_cnt)); + if (WARN_ON(!srcu_get_delay(sp))) + return; /* Leakage unless caller handles error. */ if (WARN_ON(srcu_readers_active(sp))) return; /* Leakage unless caller handles error. */ flush_delayed_work(&sp->work); @@ -444,15 +459,14 @@ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) * schedule this invocation on the corresponding CPUs. */ static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp, - unsigned long mask) + unsigned long mask, unsigned long delay) { int cpu; for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { if (!(mask & (1 << (cpu - snp->grplo)))) continue; - srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), - atomic_read(&sp->srcu_exp_cnt) ? 
0 : SRCU_INTERVAL); + srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), delay); } } @@ -467,6 +481,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp, */ static void srcu_gp_end(struct srcu_struct *sp) { + unsigned long cbdelay; bool cbs; unsigned long gpseq; int idx; @@ -481,8 +496,11 @@ static void srcu_gp_end(struct srcu_struct *sp) spin_lock_irq(&sp->gp_lock); idx = rcu_seq_state(sp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); + cbdelay = srcu_get_delay(sp); rcu_seq_end(&sp->srcu_gp_seq); gpseq = rcu_seq_current(&sp->srcu_gp_seq); + if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) + sp->srcu_gp_seq_needed_exp = gpseq; spin_unlock_irq(&sp->gp_lock); mutex_unlock(&sp->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ @@ -497,12 +515,14 @@ static void srcu_gp_end(struct srcu_struct *sp) cbs = snp->srcu_have_cbs[idx] == gpseq; snp->srcu_have_cbs[idx] = gpseq; rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); + if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) + snp->srcu_gp_seq_needed_exp = gpseq; mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; spin_unlock_irq(&snp->lock); if (cbs) { smp_mb(); /* GP end before CB invocation. */ - srcu_schedule_cbs_snp(sp, snp, mask); + srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); } } @@ -517,15 +537,43 @@ static void srcu_gp_end(struct srcu_struct *sp) srcu_gp_start(sp); spin_unlock_irq(&sp->gp_lock); /* Throttle expedited grace periods: Should be rare! */ - srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) && - rcu_seq_ctr(gpseq) & 0xf - ? 0 - : SRCU_INTERVAL); + srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff + ? 0 : SRCU_INTERVAL); } else { spin_unlock_irq(&sp->gp_lock); } } +/* + * Funnel-locking scheme to scalably mediate many concurrent expedited + * grace-period requests. This function is invoked for the first known + * expedited request for a grace period that has already been requested, + * but without expediting. To start a completely new grace period, + * whether expedited or not, use srcu_funnel_gp_start() instead. + */ +static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, + unsigned long s) +{ + unsigned long flags; + + for (; snp != NULL; snp = snp->srcu_parent) { + if (rcu_seq_done(&sp->srcu_gp_seq, s) || + ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) + return; + spin_lock_irqsave(&snp->lock, flags); + if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { + spin_unlock_irqrestore(&snp->lock, flags); + return; + } + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); + spin_unlock_irqrestore(&snp->lock, flags); + } + spin_lock_irqsave(&sp->gp_lock, flags); + if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) + sp->srcu_gp_seq_needed_exp = s; + spin_unlock_irqrestore(&sp->gp_lock, flags); +} + /* * Funnel-locking scheme to scalably mediate many concurrent grace-period * requests. The winner has to do the work of actually starting grace @@ -533,9 +581,8 @@ static void srcu_gp_end(struct srcu_struct *sp) * number is recorded on at least their leaf srcu_node structure, or they * must take steps to invoke their own callbacks. 
*/ -static void srcu_funnel_gp_start(struct srcu_struct *sp, - struct srcu_data *sdp, - unsigned long s) +static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, + unsigned long s, bool do_norm) { unsigned long flags; int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); @@ -554,13 +601,20 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, spin_unlock_irqrestore(&snp->lock, flags); if (snp == sdp->mynode && snp_seq != s) { smp_mb(); /* CBs after GP! */ - srcu_schedule_cbs_sdp(sdp, 0); + srcu_schedule_cbs_sdp(sdp, do_norm + ? SRCU_INTERVAL + : 0); + return; } + if (!do_norm) + srcu_funnel_exp_start(sp, snp, s); return; } snp->srcu_have_cbs[idx] = s; if (snp == sdp->mynode) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; + if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) + snp->srcu_gp_seq_needed_exp = s; spin_unlock_irqrestore(&snp->lock, flags); } @@ -573,6 +627,8 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, */ smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/ } + if (!do_norm && ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) + sp->srcu_gp_seq_needed_exp = s; /* If grace period not already done and none in progress, start it. */ if (!rcu_seq_done(&sp->srcu_gp_seq, s) && @@ -580,9 +636,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); srcu_gp_start(sp); queue_delayed_work(system_power_efficient_wq, &sp->work, - atomic_read(&sp->srcu_exp_cnt) - ? 0 - : SRCU_INTERVAL); + srcu_get_delay(sp)); } spin_unlock_irqrestore(&sp->gp_lock, flags); } @@ -597,7 +651,7 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) for (;;) { if (srcu_readers_active_idx_check(sp, idx)) return true; - if (--trycount + !!atomic_read(&sp->srcu_exp_cnt) <= 0) + if (--trycount + !srcu_get_delay(sp) <= 0) return false; udelay(SRCU_RETRY_CHECK_DELAY); } @@ -650,10 +704,11 @@ static void srcu_flip(struct srcu_struct *sp) * srcu_read_lock(), and srcu_read_unlock() that are all passed the same * srcu_struct structure. */ -void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, - rcu_callback_t func) +void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, + rcu_callback_t func, bool do_norm) { unsigned long flags; + bool needexp = false; bool needgp = false; unsigned long s; struct srcu_data *sdp; @@ -672,16 +727,28 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, sdp->srcu_gp_seq_needed = s; needgp = true; } + if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) { + sdp->srcu_gp_seq_needed_exp = s; + needexp = true; + } spin_unlock_irqrestore(&sdp->lock, flags); if (needgp) - srcu_funnel_gp_start(sp, sdp, s); + srcu_funnel_gp_start(sp, sdp, s, do_norm); + else if (needexp) + srcu_funnel_exp_start(sp, sdp->mynode, s); +} + +void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, + rcu_callback_t func) +{ + __call_srcu(sp, rhp, func, true); } EXPORT_SYMBOL_GPL(call_srcu); /* * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 
*/ -static void __synchronize_srcu(struct srcu_struct *sp) +static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) { struct rcu_synchronize rcu; @@ -697,7 +764,7 @@ static void __synchronize_srcu(struct srcu_struct *sp) check_init_srcu_struct(sp); init_completion(&rcu.completion); init_rcu_head_on_stack(&rcu.head); - call_srcu(sp, &rcu.head, wakeme_after_rcu); + __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm); wait_for_completion(&rcu.completion); destroy_rcu_head_on_stack(&rcu.head); } @@ -714,18 +781,7 @@ static void __synchronize_srcu(struct srcu_struct *sp) */ void synchronize_srcu_expedited(struct srcu_struct *sp) { - bool do_norm = rcu_gp_is_normal(); - - check_init_srcu_struct(sp); - if (!do_norm) { - atomic_inc(&sp->srcu_exp_cnt); - smp_mb__after_atomic(); /* increment before GP. */ - } - __synchronize_srcu(sp); - if (!do_norm) { - smp_mb__before_atomic(); /* GP before decrement. */ - WARN_ON_ONCE(atomic_dec_return(&sp->srcu_exp_cnt) < 0); - } + __synchronize_srcu(sp, rcu_gp_is_normal()); } EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); @@ -773,7 +829,7 @@ void synchronize_srcu(struct srcu_struct *sp) if (rcu_gp_is_expedited()) synchronize_srcu_expedited(sp); else - __synchronize_srcu(sp); + __synchronize_srcu(sp, true); } EXPORT_SYMBOL_GPL(synchronize_srcu); @@ -1008,14 +1064,13 @@ void process_srcu(struct work_struct *work) sp = container_of(work, struct srcu_struct, work.work); srcu_advance_state(sp); - srcu_reschedule(sp, atomic_read(&sp->srcu_exp_cnt) ? 0 : SRCU_INTERVAL); + srcu_reschedule(sp, srcu_get_delay(sp)); } EXPORT_SYMBOL_GPL(process_srcu); void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, - unsigned long *gpnum, - unsigned long *completed) + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, unsigned long *completed) { if (test_type != SRCU_FLAVOR) return; -- cgit v1.2.3 From 22607d66bbc3e81140d3bcf08894f4378eb36428 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 Apr 2017 14:03:11 -0700 Subject: srcu: Specify auto-expedite holdoff time On small systems, in the absence of readers, expedited SRCU grace periods can complete in less than a microsecond. This means that an eight-CPU system can have all CPUs doing synchronize_srcu() in a tight loop and almost always expedite. This might actually be desirable in some situations, but in general it is a good way to needlessly burn CPU cycles. And in those situations where it is desirable, your friend is the function synchronize_srcu_expedited(). For other situations, this commit adds a kernel parameter that specifies a holdoff between completing the last SRCU grace period and auto-expediting the next. If the next grace period starts before the holdoff expires, auto-expediting is disabled. The holdoff is 50 microseconds by default, and can be tuned to the desired number of nanoseconds. A value of zero disables auto-expediting. Signed-off-by: Paul E. 
McKenney Tested-by: Mike Galbraith --- Documentation/admin-guide/kernel-parameters.txt | 8 ++++++++ include/linux/srcutree.h | 1 + kernel/rcu/srcutree.c | 18 +++++++++++++++++- 3 files changed, 26 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index facc20a3f962..4a4b9266c4de 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3779,6 +3779,14 @@ spia_pedr= spia_peddr= + srcutree.exp_holdoff [KNL] + Specifies how many nanoseconds must elapse + since the end of the last SRCU grace period for + a given srcu_struct until the next normal SRCU + grace period will be considered for automatic + expediting. Set to zero to disable automatic + expediting. + stacktrace [FTRACE] Enabled the stack tracer on boot up. diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 86df48d3e97b..32e86d85fd11 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -84,6 +84,7 @@ struct srcu_struct { unsigned long srcu_gp_seq; /* Grace-period seq #. */ unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ + unsigned long srcu_last_gp_end; /* Last GP end timestamp (ns) */ struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 2286e06fd159..74c283f9d15e 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -34,10 +34,14 @@ #include #include #include +#include #include #include "rcu.h" +ulong exp_holdoff = 50 * 1000; /* Holdoff (ns) for auto-expediting. */ +module_param(exp_holdoff, ulong, 0444); + static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); @@ -145,6 +149,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static) sp->sda = alloc_percpu(struct srcu_data); init_srcu_struct_nodes(sp, is_static); sp->srcu_gp_seq_needed_exp = 0; + sp->srcu_last_gp_end = ktime_get_mono_fast_ns(); smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */ return sp->sda ? 0 : -ENOMEM; } @@ -498,6 +503,7 @@ static void srcu_gp_end(struct srcu_struct *sp) idx = rcu_seq_state(sp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); cbdelay = srcu_get_delay(sp); + sp->srcu_last_gp_end = ktime_get_mono_fast_ns(); rcu_seq_end(&sp->srcu_gp_seq); gpseq = rcu_seq_current(&sp->srcu_gp_seq); if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) @@ -700,9 +706,10 @@ static void srcu_flip(struct srcu_struct *sp) */ static bool srcu_might_be_idle(struct srcu_struct *sp) { + unsigned long curseq; unsigned long flags; struct srcu_data *sdp; - unsigned long curseq; + unsigned long t; /* If the local srcu_data structure has callbacks, not idle. */ local_irq_save(flags); @@ -718,6 +725,15 @@ static bool srcu_might_be_idle(struct srcu_struct *sp) * Exact information would require acquiring locks, which would * kill scalability, hence the probabalistic nature of the probe. */ + + /* First, see if enough time has passed since the last GP. */ + t = ktime_get_mono_fast_ns(); + if (exp_holdoff == 0 || + time_in_range_open(t, sp->srcu_last_gp_end, + sp->srcu_last_gp_end + exp_holdoff)) + return false; /* Too soon after last GP. 
*/ + + /* Next, check for probable idleness. */ curseq = rcu_seq_current(&sp->srcu_gp_seq); smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */ if (ULONG_CMP_LT(curseq, READ_ONCE(sp->srcu_gp_seq_needed))) -- cgit v1.2.3 From d2d58e0e0d6c750941147e505f4263239427e359 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Sat, 15 Apr 2017 08:54:56 +0200 Subject: fs/affs: import amigaffs.h Have that file in global include/linux is not needed. Signed-off-by: Fabian Frederick Signed-off-by: Al Viro --- fs/affs/affs.h | 2 +- fs/affs/amigaffs.h | 144 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/amigaffs.h | 144 ----------------------------------------------- 3 files changed, 145 insertions(+), 145 deletions(-) create mode 100644 fs/affs/amigaffs.h delete mode 100644 include/linux/amigaffs.h (limited to 'include/linux') diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 2f8bab390d13..ba26a316f46e 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include "amigaffs.h" #include #include diff --git a/fs/affs/amigaffs.h b/fs/affs/amigaffs.h new file mode 100644 index 000000000000..43b41c06aa37 --- /dev/null +++ b/fs/affs/amigaffs.h @@ -0,0 +1,144 @@ +#ifndef AMIGAFFS_H +#define AMIGAFFS_H + +#include +#include + +#define FS_OFS 0x444F5300 +#define FS_FFS 0x444F5301 +#define FS_INTLOFS 0x444F5302 +#define FS_INTLFFS 0x444F5303 +#define FS_DCOFS 0x444F5304 +#define FS_DCFFS 0x444F5305 +#define MUFS_FS 0x6d754653 /* 'muFS' */ +#define MUFS_OFS 0x6d754600 /* 'muF\0' */ +#define MUFS_FFS 0x6d754601 /* 'muF\1' */ +#define MUFS_INTLOFS 0x6d754602 /* 'muF\2' */ +#define MUFS_INTLFFS 0x6d754603 /* 'muF\3' */ +#define MUFS_DCOFS 0x6d754604 /* 'muF\4' */ +#define MUFS_DCFFS 0x6d754605 /* 'muF\5' */ + +#define T_SHORT 2 +#define T_LIST 16 +#define T_DATA 8 + +#define ST_LINKFILE -4 +#define ST_FILE -3 +#define ST_ROOT 1 +#define ST_USERDIR 2 +#define ST_SOFTLINK 3 +#define ST_LINKDIR 4 + +#define AFFS_ROOT_BMAPS 25 + +struct affs_date { + __be32 days; + __be32 mins; + __be32 ticks; +}; + +struct affs_short_date { + __be16 days; + __be16 mins; + __be16 ticks; +}; + +struct affs_root_head { + __be32 ptype; + __be32 spare1; + __be32 spare2; + __be32 hash_size; + __be32 spare3; + __be32 checksum; + __be32 hashtable[1]; +}; + +struct affs_root_tail { + __be32 bm_flag; + __be32 bm_blk[AFFS_ROOT_BMAPS]; + __be32 bm_ext; + struct affs_date root_change; + u8 disk_name[32]; + __be32 spare1; + __be32 spare2; + struct affs_date disk_change; + struct affs_date disk_create; + __be32 spare3; + __be32 spare4; + __be32 dcache; + __be32 stype; +}; + +struct affs_head { + __be32 ptype; + __be32 key; + __be32 block_count; + __be32 spare1; + __be32 first_data; + __be32 checksum; + __be32 table[1]; +}; + +struct affs_tail { + __be32 spare1; + __be16 uid; + __be16 gid; + __be32 protect; + __be32 size; + u8 comment[92]; + struct affs_date change; + u8 name[32]; + __be32 spare2; + __be32 original; + __be32 link_chain; + __be32 spare[5]; + __be32 hash_chain; + __be32 parent; + __be32 extension; + __be32 stype; +}; + +struct slink_front +{ + __be32 ptype; + __be32 key; + __be32 spare1[3]; + __be32 checksum; + u8 symname[1]; /* depends on block size */ +}; + +struct affs_data_head +{ + __be32 ptype; + __be32 key; + __be32 sequence; + __be32 size; + __be32 next; + __be32 checksum; + u8 data[1]; /* depends on block size */ +}; + +/* Permission bits */ + +#define FIBF_OTR_READ 0x8000 +#define FIBF_OTR_WRITE 0x4000 +#define FIBF_OTR_EXECUTE 0x2000 +#define 
FIBF_OTR_DELETE 0x1000 +#define FIBF_GRP_READ 0x0800 +#define FIBF_GRP_WRITE 0x0400 +#define FIBF_GRP_EXECUTE 0x0200 +#define FIBF_GRP_DELETE 0x0100 + +#define FIBF_HIDDEN 0x0080 +#define FIBF_SCRIPT 0x0040 +#define FIBF_PURE 0x0020 /* no use under linux */ +#define FIBF_ARCHIVED 0x0010 /* never set, always cleared on write */ +#define FIBF_NOREAD 0x0008 /* 0 means allowed */ +#define FIBF_NOWRITE 0x0004 /* 0 means allowed */ +#define FIBF_NOEXECUTE 0x0002 /* 0 means allowed, ignored under linux */ +#define FIBF_NODELETE 0x0001 /* 0 means allowed */ + +#define FIBF_OWNER 0x000F /* Bits pertaining to owner */ +#define FIBF_MASK 0xEE0E /* Bits modified by Linux */ + +#endif diff --git a/include/linux/amigaffs.h b/include/linux/amigaffs.h deleted file mode 100644 index 43b41c06aa37..000000000000 --- a/include/linux/amigaffs.h +++ /dev/null @@ -1,144 +0,0 @@ -#ifndef AMIGAFFS_H -#define AMIGAFFS_H - -#include -#include - -#define FS_OFS 0x444F5300 -#define FS_FFS 0x444F5301 -#define FS_INTLOFS 0x444F5302 -#define FS_INTLFFS 0x444F5303 -#define FS_DCOFS 0x444F5304 -#define FS_DCFFS 0x444F5305 -#define MUFS_FS 0x6d754653 /* 'muFS' */ -#define MUFS_OFS 0x6d754600 /* 'muF\0' */ -#define MUFS_FFS 0x6d754601 /* 'muF\1' */ -#define MUFS_INTLOFS 0x6d754602 /* 'muF\2' */ -#define MUFS_INTLFFS 0x6d754603 /* 'muF\3' */ -#define MUFS_DCOFS 0x6d754604 /* 'muF\4' */ -#define MUFS_DCFFS 0x6d754605 /* 'muF\5' */ - -#define T_SHORT 2 -#define T_LIST 16 -#define T_DATA 8 - -#define ST_LINKFILE -4 -#define ST_FILE -3 -#define ST_ROOT 1 -#define ST_USERDIR 2 -#define ST_SOFTLINK 3 -#define ST_LINKDIR 4 - -#define AFFS_ROOT_BMAPS 25 - -struct affs_date { - __be32 days; - __be32 mins; - __be32 ticks; -}; - -struct affs_short_date { - __be16 days; - __be16 mins; - __be16 ticks; -}; - -struct affs_root_head { - __be32 ptype; - __be32 spare1; - __be32 spare2; - __be32 hash_size; - __be32 spare3; - __be32 checksum; - __be32 hashtable[1]; -}; - -struct affs_root_tail { - __be32 bm_flag; - __be32 bm_blk[AFFS_ROOT_BMAPS]; - __be32 bm_ext; - struct affs_date root_change; - u8 disk_name[32]; - __be32 spare1; - __be32 spare2; - struct affs_date disk_change; - struct affs_date disk_create; - __be32 spare3; - __be32 spare4; - __be32 dcache; - __be32 stype; -}; - -struct affs_head { - __be32 ptype; - __be32 key; - __be32 block_count; - __be32 spare1; - __be32 first_data; - __be32 checksum; - __be32 table[1]; -}; - -struct affs_tail { - __be32 spare1; - __be16 uid; - __be16 gid; - __be32 protect; - __be32 size; - u8 comment[92]; - struct affs_date change; - u8 name[32]; - __be32 spare2; - __be32 original; - __be32 link_chain; - __be32 spare[5]; - __be32 hash_chain; - __be32 parent; - __be32 extension; - __be32 stype; -}; - -struct slink_front -{ - __be32 ptype; - __be32 key; - __be32 spare1[3]; - __be32 checksum; - u8 symname[1]; /* depends on block size */ -}; - -struct affs_data_head -{ - __be32 ptype; - __be32 key; - __be32 sequence; - __be32 size; - __be32 next; - __be32 checksum; - u8 data[1]; /* depends on block size */ -}; - -/* Permission bits */ - -#define FIBF_OTR_READ 0x8000 -#define FIBF_OTR_WRITE 0x4000 -#define FIBF_OTR_EXECUTE 0x2000 -#define FIBF_OTR_DELETE 0x1000 -#define FIBF_GRP_READ 0x0800 -#define FIBF_GRP_WRITE 0x0400 -#define FIBF_GRP_EXECUTE 0x0200 -#define FIBF_GRP_DELETE 0x0100 - -#define FIBF_HIDDEN 0x0080 -#define FIBF_SCRIPT 0x0040 -#define FIBF_PURE 0x0020 /* no use under linux */ -#define FIBF_ARCHIVED 0x0010 /* never set, always cleared on write */ -#define FIBF_NOREAD 0x0008 /* 0 means allowed */ 
-#define FIBF_NOWRITE 0x0004 /* 0 means allowed */ -#define FIBF_NOEXECUTE 0x0002 /* 0 means allowed, ignored under linux */ -#define FIBF_NODELETE 0x0001 /* 0 means allowed */ - -#define FIBF_OWNER 0x000F /* Bits pertaining to owner */ -#define FIBF_MASK 0xEE0E /* Bits modified by Linux */ - -#endif -- cgit v1.2.3 From a0c111b49bbe11c3970bc668600e3b61fbbb7fca Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 9 Apr 2017 09:32:14 +0800 Subject: fs: drop duplicate header percpu-rwsem.h Drop duplicate header percpu-rwsem.h from linux/fs.h. Signed-off-by: Geliang Tang Signed-off-by: Al Viro --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7251f7bb45e8..dee12c171e07 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -29,7 +29,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From cda37124f4e95ad5ccb11394a5802b0972668b32 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 25 Mar 2017 21:15:37 -0700 Subject: fs: constify tree_descr arrays passed to simple_fill_super() simple_fill_super() is passed an array of tree_descr structures which describe the files to create in the filesystem's root directory. Since these arrays are never modified intentionally, they should be 'const' so that they are placed in .rodata and benefit from memory protection. This patch updates the function signature and all users, and also constifies tree_descr.name. Signed-off-by: Eric Biggers Signed-off-by: Al Viro --- drivers/infiniband/hw/qib/qib_fs.c | 2 +- drivers/xen/xenfs/super.c | 4 ++-- fs/binfmt_misc.c | 2 +- fs/debugfs/inode.c | 2 +- fs/fuse/control.c | 2 +- fs/libfs.c | 2 +- fs/nfsd/nfsctl.c | 2 +- fs/tracefs/inode.c | 2 +- include/linux/fs.h | 5 +++-- kernel/bpf/inode.c | 2 +- security/inode.c | 2 +- security/selinux/selinuxfs.c | 4 ++-- security/smack/smackfs.c | 2 +- 13 files changed, 17 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c index f1e66efea98a..1d940a2885c9 100644 --- a/drivers/infiniband/hw/qib/qib_fs.c +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -512,7 +512,7 @@ static int qibfs_fill_super(struct super_block *sb, void *data, int silent) unsigned long flags; int ret; - static struct tree_descr files[] = { + static const struct tree_descr files[] = { [2] = {"driver_stats", &driver_ops[0], S_IRUGO}, [3] = {"driver_stats_names", &driver_ops[1], S_IRUGO}, {""}, diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 328c3987b112..967f069385d0 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -44,14 +44,14 @@ static const struct file_operations capabilities_file_ops = { static int xenfs_fill_super(struct super_block *sb, void *data, int silent) { - static struct tree_descr xenfs_files[] = { + static const struct tree_descr xenfs_files[] = { [2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, { "capabilities", &capabilities_file_ops, S_IRUGO }, { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, {""}, }; - static struct tree_descr xenfs_init_files[] = { + static const struct tree_descr xenfs_init_files[] = { [2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, { "capabilities", &capabilities_file_ops, S_IRUGO }, { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index bee1a36bc2ec..f4718098ac31 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -818,7 +818,7 @@ static const 
struct super_operations s_ops = { static int bm_fill_super(struct super_block *sb, void *data, int silent) { int err; - static struct tree_descr bm_files[] = { + static const struct tree_descr bm_files[] = { [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO}, [3] = {"register", &bm_register_operations, S_IWUSR}, /* last one */ {""} diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 7fd4ec4bb214..e892ae7d89f8 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -199,7 +199,7 @@ static const struct dentry_operations debugfs_dops = { static int debug_fill_super(struct super_block *sb, void *data, int silent) { - static struct tree_descr debug_files[] = {{""}}; + static const struct tree_descr debug_files[] = {{""}}; struct debugfs_fs_info *fsi; int err; diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 6e22748b0704..b9ea99c5b5b3 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -292,7 +292,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) { - struct tree_descr empty_descr = {""}; + static const struct tree_descr empty_descr = {""}; struct fuse_conn *fc; int err; diff --git a/fs/libfs.c b/fs/libfs.c index a8b62e5d43a9..a04395334bb1 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -507,7 +507,7 @@ EXPORT_SYMBOL(simple_write_end); * to pass it an appropriate max_reserved value to avoid collisions. */ int simple_fill_super(struct super_block *s, unsigned long magic, - struct tree_descr *files) + const struct tree_descr *files) { struct inode *inode; struct dentry *root; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 8bf8f667a8cf..6493df6b1bd5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1146,7 +1146,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) static int nfsd_fill_super(struct super_block * sb, void * data, int silent) { - static struct tree_descr nfsd_files[] = { + static const struct tree_descr nfsd_files[] = { [NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO}, [NFSD_Export_features] = {"export_features", &export_features_operations, S_IRUGO}, diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 21d36d284735..328e89c2cf83 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -266,7 +266,7 @@ static const struct super_operations tracefs_super_operations = { static int trace_fill_super(struct super_block *sb, void *data, int silent) { - static struct tree_descr trace_files[] = {{""}}; + static const struct tree_descr trace_files[] = {{""}}; struct tracefs_fs_info *fsi; int err; diff --git a/include/linux/fs.h b/include/linux/fs.h index dee12c171e07..fc1b4faa6272 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2995,9 +2995,10 @@ extern const struct file_operations simple_dir_operations; extern const struct inode_operations simple_dir_inode_operations; extern void make_empty_dir_inode(struct inode *inode); extern bool is_empty_dir_inode(struct inode *inode); -struct tree_descr { char *name; const struct file_operations *ops; int mode; }; +struct tree_descr { const char *name; const struct file_operations *ops; int mode; }; struct dentry *d_alloc_name(struct dentry *, const char *); -extern int simple_fill_super(struct super_block *, unsigned long, struct tree_descr *); +extern int simple_fill_super(struct super_block *, unsigned long, + const struct tree_descr *); extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); extern void simple_release_fs(struct 
vfsmount **mount, int *count); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index fddcae801724..9bbd33497d3d 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -429,7 +429,7 @@ static int bpf_parse_options(char *data, struct bpf_mount_opts *opts) static int bpf_fill_super(struct super_block *sb, void *data, int silent) { - static struct tree_descr bpf_rfiles[] = { { "" } }; + static const struct tree_descr bpf_rfiles[] = { { "" } }; struct bpf_mount_opts opts; struct inode *inode; int ret; diff --git a/security/inode.c b/security/inode.c index 2cb14162ff8d..eccd58ef2ae8 100644 --- a/security/inode.c +++ b/security/inode.c @@ -28,7 +28,7 @@ static int mount_count; static int fill_super(struct super_block *sb, void *data, int silent) { - static struct tree_descr files[] = {{""}}; + static const struct tree_descr files[] = {{""}}; return simple_fill_super(sb, SECURITYFS_MAGIC, files); } diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index cb3fd98fb05a..6a9efedf7eb2 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -1496,7 +1496,7 @@ static const struct file_operations sel_avc_cache_stats_ops = { static int sel_make_avc_files(struct dentry *dir) { int i; - static struct tree_descr files[] = { + static const struct tree_descr files[] = { { "cache_threshold", &sel_avc_cache_threshold_ops, S_IRUGO|S_IWUSR }, { "hash_stats", &sel_avc_hash_stats_ops, S_IRUGO }, @@ -1805,7 +1805,7 @@ static int sel_fill_super(struct super_block *sb, void *data, int silent) struct inode *inode; struct inode_security_struct *isec; - static struct tree_descr selinux_files[] = { + static const struct tree_descr selinux_files[] = { [SEL_LOAD] = {"load", &sel_load_ops, S_IRUSR|S_IWUSR}, [SEL_ENFORCE] = {"enforce", &sel_enforce_ops, S_IRUGO|S_IWUSR}, [SEL_CONTEXT] = {"context", &transaction_ops, S_IRUGO|S_IWUGO}, diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index 366b8356f75b..f6482e53d55a 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -2855,7 +2855,7 @@ static int smk_fill_super(struct super_block *sb, void *data, int silent) int rc; struct inode *root_inode; - static struct tree_descr smack_files[] = { + static const struct tree_descr smack_files[] = { [SMK_LOAD] = { "load", &smk_load_ops, S_IRUGO|S_IWUSR}, [SMK_CIPSO] = { -- cgit v1.2.3 From 020c2833dbc76b4069c9a9886b71511052d160df Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 25 Mar 2017 21:02:18 -0700 Subject: fs: remove _submit_bh() _submit_bh() allowed submitting a buffer_head for I/O using custom bio_flags. It used to be used by jbd to set BIO_SNAP_STABLE, introduced by commit 713685111774 ("mm: make snapshotting pages for stable writes a per-bio operation"). However, the code and flag has since been removed and no _submit_bh() users remain. These days, bio_flags are mostly used internally by the block layer to track the state of bio's. As such, it doesn't really make sense for filesystems to use them instead of op_flags when wanting special behavior for block requests. Therefore, remove _submit_bh() and trim the bio_flags argument from submit_bh_wbc(). Cc: Darrick J. 
Wong Signed-off-by: Eric Biggers Signed-off-by: Al Viro --- fs/buffer.c | 19 +++++-------------- include/linux/buffer_head.h | 2 -- 2 files changed, 5 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index 9196f2a270da..68dc05ce06a5 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -49,7 +49,6 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - unsigned long bio_flags, struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -1830,7 +1829,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); nr_underway++; } bh = next; @@ -1884,7 +1883,7 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); nr_underway++; } bh = next; @@ -3095,7 +3094,7 @@ void guard_bio_eod(int op, struct bio *bio) } static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - unsigned long bio_flags, struct writeback_control *wbc) + struct writeback_control *wbc) { struct bio *bio; @@ -3130,7 +3129,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; - bio->bi_flags |= bio_flags; /* Take care of bh's that straddle the end of the device */ guard_bio_eod(op, bio); @@ -3145,16 +3143,9 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, return 0; } -int _submit_bh(int op, int op_flags, struct buffer_head *bh, - unsigned long bio_flags) +int submit_bh(int op, int op_flags, struct buffer_head *bh) { - return submit_bh_wbc(op, op_flags, bh, bio_flags, NULL); -} -EXPORT_SYMBOL_GPL(_submit_bh); - -int submit_bh(int op, int op_flags, struct buffer_head *bh) -{ - return submit_bh_wbc(op, op_flags, bh, 0, NULL); + return submit_bh_wbc(op, op_flags, bh, NULL); } EXPORT_SYMBOL(submit_bh); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 79591c3660cc..bd029e52ef5e 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -196,8 +196,6 @@ void ll_rw_block(int, int, int, struct buffer_head * bh[]); int sync_dirty_buffer(struct buffer_head *bh); int __sync_dirty_buffer(struct buffer_head *bh, int op_flags); void write_dirty_buffer(struct buffer_head *bh, int op_flags); -int _submit_bh(int op, int op_flags, struct buffer_head *bh, - unsigned long bio_flags); int submit_bh(int, int, struct buffer_head *); void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize); -- cgit v1.2.3 From e04653a9dcf4d98defe2149c885382e5cc72082f Mon Sep 17 00:00:00 2001 From: Archana Patni Date: Wed, 1 Feb 2017 17:22:03 +0100 Subject: mfd: cros_ec: Add ACPI GPE handler for LID0 devices This patch installs an ACPI GPE handler for the LID0 ACPI device to indicate to the ACPI core that this GPE should stay enabled for the lid to work in the suspend-to-idle path.
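(Editorial aside, not part of the original series: a minimal sketch of the GPE mechanism this message relies on. The handler name below is hypothetical; the point is the return value, which tells ACPICA both that the event was consumed and that the GPE should be re-armed so lid events keep firing across suspend-to-idle.)

static u32 lid_gpe_handler(acpi_handle gpe_device, u32 gpe_number, void *data)
{
	/* Event consumed; ask ACPICA to immediately re-enable the GPE. */
	return ACPI_INTERRUPT_HANDLED | ACPI_REENABLE_GPE;
}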
Signed-off-by: Archana Patni Signed-off-by: Thierry Escande Signed-off-by: Lee Jones --- drivers/mfd/Makefile | 4 +- drivers/mfd/cros_ec.c | 15 ++++-- drivers/mfd/cros_ec_acpi_gpe.c | 103 +++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/cros_ec.h | 18 +++++++ 4 files changed, 136 insertions(+), 4 deletions(-) create mode 100644 drivers/mfd/cros_ec_acpi_gpe.c (limited to 'include/linux') diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 790698a892ba..e314ba1058dc 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -10,7 +10,9 @@ obj-$(CONFIG_MFD_ACT8945A) += act8945a.o obj-$(CONFIG_MFD_SM501) += sm501.o obj-$(CONFIG_MFD_ASIC3) += asic3.o tmio_core.o obj-$(CONFIG_MFD_BCM590XX) += bcm590xx.o -obj-$(CONFIG_MFD_CROS_EC) += cros_ec.o +cros_ec_core-objs := cros_ec.o +cros_ec_core-$(CONFIG_ACPI) += cros_ec_acpi_gpe.o +obj-$(CONFIG_MFD_CROS_EC) += cros_ec_core.o obj-$(CONFIG_MFD_CROS_EC_I2C) += cros_ec_i2c.o obj-$(CONFIG_MFD_CROS_EC_SPI) += cros_ec_spi.o obj-$(CONFIG_MFD_EXYNOS_LPASS) += exynos-lpass.o diff --git a/drivers/mfd/cros_ec.c b/drivers/mfd/cros_ec.c index 9b66a98ba4bf..d4a407e466b5 100644 --- a/drivers/mfd/cros_ec.c +++ b/drivers/mfd/cros_ec.c @@ -166,6 +166,8 @@ int cros_ec_register(struct cros_ec_device *ec_dev) dev_info(dev, "Chrome EC device registered\n"); + cros_ec_acpi_install_gpe_handler(dev); + return 0; fail_mfd: @@ -179,6 +181,8 @@ int cros_ec_remove(struct cros_ec_device *ec_dev) { mfd_remove_devices(ec_dev->dev); + cros_ec_acpi_remove_gpe_handler(); + return 0; } EXPORT_SYMBOL(cros_ec_remove); @@ -190,9 +194,14 @@ int cros_ec_suspend(struct cros_ec_device *ec_dev) int ret; u8 sleep_event; - sleep_event = (!IS_ENABLED(CONFIG_ACPI) || pm_suspend_via_firmware()) ? - HOST_SLEEP_EVENT_S3_RESUME : - HOST_SLEEP_EVENT_S0IX_RESUME; + if (!IS_ENABLED(CONFIG_ACPI) || pm_suspend_via_firmware()) { + sleep_event = HOST_SLEEP_EVENT_S3_SUSPEND; + } else { + sleep_event = HOST_SLEEP_EVENT_S0IX_SUSPEND; + + /* Clearing the GPE status for any pending event */ + cros_ec_acpi_clear_gpe(); + } ret = cros_ec_sleep_event(ec_dev, sleep_event); if (ret < 0) diff --git a/drivers/mfd/cros_ec_acpi_gpe.c b/drivers/mfd/cros_ec_acpi_gpe.c new file mode 100644 index 000000000000..56d305dab2d4 --- /dev/null +++ b/drivers/mfd/cros_ec_acpi_gpe.c @@ -0,0 +1,103 @@ +/* + * ChromeOS EC multi-function device + * + * Copyright (C) 2017 Google, Inc + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * The ChromeOS EC multi function device is used to mux all the requests + * to the EC device for its multiple features: keyboard controller, + * battery charging and regulator control, firmware update. + */ +#include + +#define ACPI_LID_DEVICE "LID0" + +static int ec_wake_gpe = -EINVAL; + +/* + * This handler indicates to ACPI core that this GPE should stay enabled for + * lid to work in suspend to idle path. + */ +static u32 cros_ec_gpe_handler(acpi_handle gpe_device, u32 gpe_number, + void *data) +{ + return ACPI_INTERRUPT_HANDLED | ACPI_REENABLE_GPE; +} + +/* + * Get ACPI GPE for LID0 device. 
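+ * + * (Editorial note, an inference from the lookup below rather than a statement in the original patch: the EC's ACPI companion is expected to share a parent with the LID0 node, so the walk goes parent handle -> "LID0" -> the wakeup GPE number that ACPI recorded from that device's _PRW method.)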
+ */ +static int cros_ec_get_ec_wake_gpe(struct device *dev) +{ + struct acpi_device *cros_acpi_dev; + struct acpi_device *adev; + acpi_handle handle; + acpi_status status; + int ret; + + cros_acpi_dev = ACPI_COMPANION(dev); + + if (!cros_acpi_dev || !cros_acpi_dev->parent || + !cros_acpi_dev->parent->handle) + return -EINVAL; + + status = acpi_get_handle(cros_acpi_dev->parent->handle, ACPI_LID_DEVICE, + &handle); + if (ACPI_FAILURE(status)) + return -EINVAL; + + ret = acpi_bus_get_device(handle, &adev); + if (ret) + return ret; + + return adev->wakeup.gpe_number; +} + +int cros_ec_acpi_install_gpe_handler(struct device *dev) +{ + acpi_status status; + + ec_wake_gpe = cros_ec_get_ec_wake_gpe(dev); + + if (ec_wake_gpe < 0) + return ec_wake_gpe; + + status = acpi_install_gpe_handler(NULL, ec_wake_gpe, + ACPI_GPE_EDGE_TRIGGERED, + &cros_ec_gpe_handler, NULL); + if (ACPI_FAILURE(status)) + return -ENODEV; + + dev_info(dev, "Initialized, GPE = 0x%x\n", ec_wake_gpe); + + return 0; +} + +void cros_ec_acpi_remove_gpe_handler(void) +{ + acpi_status status; + + if (ec_wake_gpe < 0) + return; + + status = acpi_remove_gpe_handler(NULL, ec_wake_gpe, + &cros_ec_gpe_handler); + if (ACPI_FAILURE(status)) + pr_err("failed to remove gpe handler\n"); +} + +void cros_ec_acpi_clear_gpe(void) +{ + if (ec_wake_gpe < 0) + return; + + acpi_clear_gpe(NULL, ec_wake_gpe); +} diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h index 7a01c94496f1..b3d04de684d4 100644 --- a/include/linux/mfd/cros_ec.h +++ b/include/linux/mfd/cros_ec.h @@ -304,4 +304,22 @@ extern struct attribute_group cros_ec_attr_group; extern struct attribute_group cros_ec_lightbar_attr_group; extern struct attribute_group cros_ec_vbc_attr_group; +/* ACPI GPE handler */ +#ifdef CONFIG_ACPI + +int cros_ec_acpi_install_gpe_handler(struct device *dev); +void cros_ec_acpi_remove_gpe_handler(void); +void cros_ec_acpi_clear_gpe(void); + +#else /* CONFIG_ACPI */ + +static inline int cros_ec_acpi_install_gpe_handler(struct device *dev) +{ + return -ENODEV; +} +static inline void cros_ec_acpi_remove_gpe_handler(void) {} +static inline void cros_ec_acpi_clear_gpe(void) {} + +#endif /* CONFIG_ACPI */ + #endif /* __LINUX_MFD_CROS_EC_H */ -- cgit v1.2.3 From d5aa11bfe9cebb4a3912b11748fd84aa15454229 Mon Sep 17 00:00:00 2001 From: Milo Kim Date: Tue, 28 Feb 2017 15:45:15 +0900 Subject: mfd: Add TI LMU driver TI LMU (Lighting Management Unit) driver supports lighting devices below. LM3532, LM3631, LM3632, LM3633, LM3695 and LM3697. LMU devices have common features. - I2C interface for accessing device registers - Hardware enable pin control - Backlight brightness control - Notifier for hardware fault monitoring - Regulators for LCD display bias It contains fault monitor, backlight, LED and regulator driver. LMU fault monitor ----------------- LM3633 and LM3697 provide hardware monitoring feature. It enables open or short circuit detection. After monitoring is done, each device should be re-initialized. Notifier is used for this case. Separate patch for 'ti-lmu-fault-monitor' will be sent later. Backlight --------- It's handled by TI LMU backlight consolidated driver and chip dependent data. Separate patchset will be sent later. LED indicator ------------- LM3633 has 6 indicator LEDs. Programmable dimming pattern is also supported. Separate patch for 'leds-lm3633' will be sent later. Regulator --------- LM3631 has 5 regulators for the display bias. LM3632 supports 3 regulators. One consolidated driver enables it. 
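(Editorial aside, not part of this commit: a hedged sketch of how one of the child drivers named above might consume the fault-monitor notifier that the core driver initializes in its probe below. The callback and its registration site are assumptions; only the chain head and the LMU_EVENT_MONITOR_DONE event come from this series.)

static int lm3633_fault_notifier_cb(struct notifier_block *nb,
				    unsigned long event, void *unused)
{
	if (event != LMU_EVENT_MONITOR_DONE)
		return NOTIFY_DONE;

	/* Hypothetical: re-initialize registers after open/short detection. */
	return NOTIFY_OK;
}

static struct notifier_block lm3633_fault_nb = {
	.notifier_call = lm3633_fault_notifier_cb,
};

/* In a child driver's probe, with lmu = dev_get_drvdata(pdev->dev.parent): */
blocking_notifier_chain_register(&lmu->notifier, &lm3633_fault_nb);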
The lm363x regulator driver is already upstreamed. Signed-off-by: Milo Kim Tested-by: Tony Lindgren Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 12 ++ drivers/mfd/Makefile | 2 + drivers/mfd/ti-lmu.c | 259 +++++++++++++++++++++++++++++++++ include/linux/mfd/ti-lmu-register.h | 280 ++++++++++++++++++++++++++++++++++++ include/linux/mfd/ti-lmu.h | 87 +++++++++++ 5 files changed, 640 insertions(+) create mode 100644 drivers/mfd/ti-lmu.c create mode 100644 include/linux/mfd/ti-lmu-register.h create mode 100644 include/linux/mfd/ti-lmu.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 8bbc91b5186e..add0c35c38c5 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -1181,6 +1181,18 @@ config MFD_LP8788 TI LP8788 PMU supports regulators, battery charger, RTC, ADC, backlight driver and current sinks. +config MFD_TI_LMU + tristate "TI Lighting Management Unit driver" + depends on I2C + select MFD_CORE + select REGMAP_I2C + help + Say yes here to enable support for TI LMU chips. + + TI LMU MFD supports LM3532, LM3631, LM3632, LM3633, LM3695 and LM3697. + It consists of backlight, LED and regulator driver. + It provides consistent device controls for lighting functions. + config MFD_OMAP_USB_HOST bool "TI OMAP USBHS core and TLL driver" depends on USB_EHCI_HCD_OMAP || USB_OHCI_HCD_OMAP3 diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index e314ba1058dc..294a0525aeb1 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -127,6 +127,8 @@ obj-$(CONFIG_MFD_AXP20X_RSB) += axp20x-rsb.o obj-$(CONFIG_MFD_LP3943) += lp3943.o obj-$(CONFIG_MFD_LP8788) += lp8788.o lp8788-irq.o +obj-$(CONFIG_MFD_TI_LMU) += ti-lmu.o + da9055-objs := da9055-core.o da9055-i2c.o obj-$(CONFIG_MFD_DA9055) += da9055.o obj-$(CONFIG_MFD_DA9062) += da9062-core.o diff --git a/drivers/mfd/ti-lmu.c b/drivers/mfd/ti-lmu.c new file mode 100644 index 000000000000..cfb411cde51c --- /dev/null +++ b/drivers/mfd/ti-lmu.c @@ -0,0 +1,259 @@ +/* + * TI LMU (Lighting Management Unit) Core Driver + * + * Copyright 2017 Texas Instruments + * + * Author: Milo Kim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ti_lmu_data { + struct mfd_cell *cells; + int num_cells; + unsigned int max_register; +}; + +static int ti_lmu_enable_hw(struct ti_lmu *lmu, enum ti_lmu_id id) +{ + int ret; + + if (gpio_is_valid(lmu->en_gpio)) { + ret = devm_gpio_request_one(lmu->dev, lmu->en_gpio, + GPIOF_OUT_INIT_HIGH, "lmu_hwen"); + if (ret) { + dev_err(lmu->dev, "Can not request enable GPIO: %d\n", + ret); + return ret; + } + } + + /* Delay about 1ms after HW enable pin control */ + usleep_range(1000, 1500); + + /* LM3631 has additional power up sequence - enable LCD_EN bit. 
*/ + if (id == LM3631) { + return regmap_update_bits(lmu->regmap, LM3631_REG_DEVCTRL, + LM3631_LCD_EN_MASK, + LM3631_LCD_EN_MASK); + } + + return 0; +} + +static void ti_lmu_disable_hw(struct ti_lmu *lmu) +{ + if (gpio_is_valid(lmu->en_gpio)) + gpio_set_value(lmu->en_gpio, 0); +} + +static struct mfd_cell lm3532_devices[] = { + { + .name = "ti-lmu-backlight", + .id = LM3532, + .of_compatible = "ti,lm3532-backlight", + }, +}; + +#define LM363X_REGULATOR(_id) \ +{ \ + .name = "lm363x-regulator", \ + .id = _id, \ + .of_compatible = "ti,lm363x-regulator", \ +} \ + +static struct mfd_cell lm3631_devices[] = { + LM363X_REGULATOR(LM3631_BOOST), + LM363X_REGULATOR(LM3631_LDO_CONT), + LM363X_REGULATOR(LM3631_LDO_OREF), + LM363X_REGULATOR(LM3631_LDO_POS), + LM363X_REGULATOR(LM3631_LDO_NEG), + { + .name = "ti-lmu-backlight", + .id = LM3631, + .of_compatible = "ti,lm3631-backlight", + }, +}; + +static struct mfd_cell lm3632_devices[] = { + LM363X_REGULATOR(LM3632_BOOST), + LM363X_REGULATOR(LM3632_LDO_POS), + LM363X_REGULATOR(LM3632_LDO_NEG), + { + .name = "ti-lmu-backlight", + .id = LM3632, + .of_compatible = "ti,lm3632-backlight", + }, +}; + +static struct mfd_cell lm3633_devices[] = { + { + .name = "ti-lmu-backlight", + .id = LM3633, + .of_compatible = "ti,lm3633-backlight", + }, + { + .name = "lm3633-leds", + .of_compatible = "ti,lm3633-leds", + }, + /* Monitoring driver for open/short circuit detection */ + { + .name = "ti-lmu-fault-monitor", + .id = LM3633, + .of_compatible = "ti,lm3633-fault-monitor", + }, +}; + +static struct mfd_cell lm3695_devices[] = { + { + .name = "ti-lmu-backlight", + .id = LM3695, + .of_compatible = "ti,lm3695-backlight", + }, +}; + +static struct mfd_cell lm3697_devices[] = { + { + .name = "ti-lmu-backlight", + .id = LM3697, + .of_compatible = "ti,lm3697-backlight", + }, + /* Monitoring driver for open/short circuit detection */ + { + .name = "ti-lmu-fault-monitor", + .id = LM3697, + .of_compatible = "ti,lm3697-fault-monitor", + }, +}; + +#define TI_LMU_DATA(chip, max_reg) \ +static const struct ti_lmu_data chip##_data = \ +{ \ + .cells = chip##_devices, \ + .num_cells = ARRAY_SIZE(chip##_devices),\ + .max_register = max_reg, \ +} \ + +TI_LMU_DATA(lm3532, LM3532_MAX_REG); +TI_LMU_DATA(lm3631, LM3631_MAX_REG); +TI_LMU_DATA(lm3632, LM3632_MAX_REG); +TI_LMU_DATA(lm3633, LM3633_MAX_REG); +TI_LMU_DATA(lm3695, LM3695_MAX_REG); +TI_LMU_DATA(lm3697, LM3697_MAX_REG); + +static const struct of_device_id ti_lmu_of_match[] = { + { .compatible = "ti,lm3532", .data = &lm3532_data }, + { .compatible = "ti,lm3631", .data = &lm3631_data }, + { .compatible = "ti,lm3632", .data = &lm3632_data }, + { .compatible = "ti,lm3633", .data = &lm3633_data }, + { .compatible = "ti,lm3695", .data = &lm3695_data }, + { .compatible = "ti,lm3697", .data = &lm3697_data }, + { } +}; +MODULE_DEVICE_TABLE(of, ti_lmu_of_match); + +static int ti_lmu_probe(struct i2c_client *cl, const struct i2c_device_id *id) +{ + struct device *dev = &cl->dev; + const struct of_device_id *match; + const struct ti_lmu_data *data; + struct regmap_config regmap_cfg; + struct ti_lmu *lmu; + int ret; + + match = of_match_device(ti_lmu_of_match, dev); + if (!match) + return -ENODEV; + /* + * Get device specific data from of_match table. + * This data is defined by using TI_LMU_DATA() macro. 
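+ * + * (Editorial illustration, not in the original patch: matching "ti,lm3633" here yields &lm3633_data, whose cells array makes mfd_add_devices() below spawn the backlight, LED and fault-monitor sub-devices.)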
+ */ + data = (struct ti_lmu_data *)match->data; + + lmu = devm_kzalloc(dev, sizeof(*lmu), GFP_KERNEL); + if (!lmu) + return -ENOMEM; + + lmu->dev = &cl->dev; + + /* Setup regmap */ + memset(®map_cfg, 0, sizeof(struct regmap_config)); + regmap_cfg.reg_bits = 8; + regmap_cfg.val_bits = 8; + regmap_cfg.name = id->name; + regmap_cfg.max_register = data->max_register; + + lmu->regmap = devm_regmap_init_i2c(cl, ®map_cfg); + if (IS_ERR(lmu->regmap)) + return PTR_ERR(lmu->regmap); + + /* HW enable pin control and additional power up sequence if required */ + lmu->en_gpio = of_get_named_gpio(dev->of_node, "enable-gpios", 0); + ret = ti_lmu_enable_hw(lmu, id->driver_data); + if (ret) + return ret; + + /* + * Fault circuit(open/short) can be detected by ti-lmu-fault-monitor. + * After fault detection is done, some devices should re-initialize + * configuration. The notifier enables such kind of handling. + */ + BLOCKING_INIT_NOTIFIER_HEAD(&lmu->notifier); + + i2c_set_clientdata(cl, lmu); + + return mfd_add_devices(lmu->dev, 0, data->cells, + data->num_cells, NULL, 0, NULL); +} + +static int ti_lmu_remove(struct i2c_client *cl) +{ + struct ti_lmu *lmu = i2c_get_clientdata(cl); + + ti_lmu_disable_hw(lmu); + mfd_remove_devices(lmu->dev); + return 0; +} + +static const struct i2c_device_id ti_lmu_ids[] = { + { "lm3532", LM3532 }, + { "lm3631", LM3631 }, + { "lm3632", LM3632 }, + { "lm3633", LM3633 }, + { "lm3695", LM3695 }, + { "lm3697", LM3697 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, ti_lmu_ids); + +static struct i2c_driver ti_lmu_driver = { + .probe = ti_lmu_probe, + .remove = ti_lmu_remove, + .driver = { + .name = "ti-lmu", + .of_match_table = ti_lmu_of_match, + }, + .id_table = ti_lmu_ids, +}; + +module_i2c_driver(ti_lmu_driver); + +MODULE_DESCRIPTION("TI LMU MFD Core Driver"); +MODULE_AUTHOR("Milo Kim"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/mfd/ti-lmu-register.h b/include/linux/mfd/ti-lmu-register.h new file mode 100644 index 000000000000..2125c7c02818 --- /dev/null +++ b/include/linux/mfd/ti-lmu-register.h @@ -0,0 +1,280 @@ +/* + * TI LMU (Lighting Management Unit) Device Register Map + * + * Copyright 2017 Texas Instruments + * + * Author: Milo Kim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef __MFD_TI_LMU_REGISTER_H__ +#define __MFD_TI_LMU_REGISTER_H__ + +#include + +/* LM3532 */ +#define LM3532_REG_OUTPUT_CFG 0x10 +#define LM3532_ILED1_CFG_MASK 0x03 +#define LM3532_ILED2_CFG_MASK 0x0C +#define LM3532_ILED3_CFG_MASK 0x30 +#define LM3532_ILED1_CFG_SHIFT 0 +#define LM3532_ILED2_CFG_SHIFT 2 +#define LM3532_ILED3_CFG_SHIFT 4 + +#define LM3532_REG_RAMPUP 0x12 +#define LM3532_REG_RAMPDN LM3532_REG_RAMPUP +#define LM3532_RAMPUP_MASK 0x07 +#define LM3532_RAMPUP_SHIFT 0 +#define LM3532_RAMPDN_MASK 0x38 +#define LM3532_RAMPDN_SHIFT 3 + +#define LM3532_REG_ENABLE 0x1D + +#define LM3532_REG_PWM_A_CFG 0x13 +#define LM3532_PWM_A_MASK 0x05 /* zone 0 */ +#define LM3532_PWM_ZONE_0 BIT(2) + +#define LM3532_REG_PWM_B_CFG 0x14 +#define LM3532_PWM_B_MASK 0x09 /* zone 1 */ +#define LM3532_PWM_ZONE_1 BIT(3) + +#define LM3532_REG_PWM_C_CFG 0x15 +#define LM3532_PWM_C_MASK 0x11 /* zone 2 */ +#define LM3532_PWM_ZONE_2 BIT(4) + +#define LM3532_REG_ZONE_CFG_A 0x16 +#define LM3532_REG_ZONE_CFG_B 0x18 +#define LM3532_REG_ZONE_CFG_C 0x1A +#define LM3532_ZONE_MASK (BIT(2) | BIT(3) | BIT(4)) +#define LM3532_ZONE_0 0 +#define LM3532_ZONE_1 BIT(2) +#define LM3532_ZONE_2 BIT(3) + +#define LM3532_REG_BRT_A 0x70 /* zone 0 */ +#define LM3532_REG_BRT_B 0x76 /* zone 1 */ +#define LM3532_REG_BRT_C 0x7C /* zone 2 */ + +#define LM3532_MAX_REG 0x7E + +/* LM3631 */ +#define LM3631_REG_DEVCTRL 0x00 +#define LM3631_LCD_EN_MASK BIT(1) +#define LM3631_BL_EN_MASK BIT(0) + +#define LM3631_REG_BRT_LSB 0x01 +#define LM3631_REG_BRT_MSB 0x02 + +#define LM3631_REG_BL_CFG 0x06 +#define LM3631_BL_CHANNEL_MASK BIT(3) +#define LM3631_BL_DUAL_CHANNEL 0 +#define LM3631_BL_SINGLE_CHANNEL BIT(3) +#define LM3631_MAP_MASK BIT(5) +#define LM3631_EXPONENTIAL_MAP 0 + +#define LM3631_REG_BRT_MODE 0x08 +#define LM3631_MODE_MASK (BIT(1) | BIT(2) | BIT(3)) +#define LM3631_DEFAULT_MODE (BIT(1) | BIT(3)) + +#define LM3631_REG_SLOPE 0x09 +#define LM3631_SLOPE_MASK 0xF0 +#define LM3631_SLOPE_SHIFT 4 + +#define LM3631_REG_LDO_CTRL1 0x0A +#define LM3631_EN_OREF_MASK BIT(0) +#define LM3631_EN_VNEG_MASK BIT(1) +#define LM3631_EN_VPOS_MASK BIT(2) + +#define LM3631_REG_LDO_CTRL2 0x0B +#define LM3631_EN_CONT_MASK BIT(0) + +#define LM3631_REG_VOUT_CONT 0x0C +#define LM3631_VOUT_CONT_MASK (BIT(6) | BIT(7)) + +#define LM3631_REG_VOUT_BOOST 0x0C +#define LM3631_REG_VOUT_POS 0x0D +#define LM3631_REG_VOUT_NEG 0x0E +#define LM3631_REG_VOUT_OREF 0x0F +#define LM3631_VOUT_MASK 0x3F + +#define LM3631_REG_ENTIME_VCONT 0x0B +#define LM3631_ENTIME_CONT_MASK 0x70 + +#define LM3631_REG_ENTIME_VOREF 0x0F +#define LM3631_REG_ENTIME_VPOS 0x10 +#define LM3631_REG_ENTIME_VNEG 0x11 +#define LM3631_ENTIME_MASK 0xF0 +#define LM3631_ENTIME_SHIFT 4 + +#define LM3631_MAX_REG 0x16 + +/* LM3632 */ +#define LM3632_REG_CONFIG1 0x02 +#define LM3632_OVP_MASK (BIT(5) | BIT(6) | BIT(7)) +#define LM3632_OVP_25V BIT(6) + +#define LM3632_REG_CONFIG2 0x03 +#define LM3632_SWFREQ_MASK BIT(7) +#define LM3632_SWFREQ_1MHZ BIT(7) + +#define LM3632_REG_BRT_LSB 0x04 +#define LM3632_REG_BRT_MSB 0x05 + +#define LM3632_REG_IO_CTRL 0x09 +#define LM3632_PWM_MASK BIT(6) +#define LM3632_I2C_MODE 0 +#define LM3632_PWM_MODE BIT(6) + +#define LM3632_REG_ENABLE 0x0A +#define LM3632_BL_EN_MASK BIT(0) +#define LM3632_BL_CHANNEL_MASK (BIT(3) | BIT(4)) +#define LM3632_BL_SINGLE_CHANNEL BIT(4) +#define LM3632_BL_DUAL_CHANNEL BIT(3) + +#define LM3632_REG_BIAS_CONFIG 0x0C +#define LM3632_EXT_EN_MASK BIT(0) +#define LM3632_EN_VNEG_MASK BIT(1) +#define LM3632_EN_VPOS_MASK BIT(2) + +#define LM3632_REG_VOUT_BOOST 
0x0D +#define LM3632_REG_VOUT_POS 0x0E +#define LM3632_REG_VOUT_NEG 0x0F +#define LM3632_VOUT_MASK 0x3F + +#define LM3632_MAX_REG 0x10 + +/* LM3633 */ +#define LM3633_REG_HVLED_OUTPUT_CFG 0x10 +#define LM3633_HVLED1_CFG_MASK BIT(0) +#define LM3633_HVLED2_CFG_MASK BIT(1) +#define LM3633_HVLED3_CFG_MASK BIT(2) +#define LM3633_HVLED1_CFG_SHIFT 0 +#define LM3633_HVLED2_CFG_SHIFT 1 +#define LM3633_HVLED3_CFG_SHIFT 2 + +#define LM3633_REG_BANK_SEL 0x11 + +#define LM3633_REG_BL0_RAMP 0x12 +#define LM3633_REG_BL1_RAMP 0x13 +#define LM3633_BL_RAMPUP_MASK 0xF0 +#define LM3633_BL_RAMPUP_SHIFT 4 +#define LM3633_BL_RAMPDN_MASK 0x0F +#define LM3633_BL_RAMPDN_SHIFT 0 + +#define LM3633_REG_BL_RAMP_CONF 0x1B +#define LM3633_BL_RAMP_MASK 0x0F +#define LM3633_BL_RAMP_EACH 0x05 + +#define LM3633_REG_PTN0_RAMP 0x1C +#define LM3633_REG_PTN1_RAMP 0x1D +#define LM3633_PTN_RAMPUP_MASK 0x70 +#define LM3633_PTN_RAMPUP_SHIFT 4 +#define LM3633_PTN_RAMPDN_MASK 0x07 +#define LM3633_PTN_RAMPDN_SHIFT 0 + +#define LM3633_REG_LED_MAPPING_MODE 0x1F +#define LM3633_LED_EXPONENTIAL BIT(1) + +#define LM3633_REG_IMAX_HVLED_A 0x20 +#define LM3633_REG_IMAX_HVLED_B 0x21 +#define LM3633_REG_IMAX_LVLED_BASE 0x22 + +#define LM3633_REG_BL_FEEDBACK_ENABLE 0x28 + +#define LM3633_REG_ENABLE 0x2B +#define LM3633_LED_BANK_OFFSET 2 + +#define LM3633_REG_PATTERN 0x2C + +#define LM3633_REG_BOOST_CFG 0x2D +#define LM3633_OVP_MASK (BIT(1) | BIT(2)) +#define LM3633_OVP_40V 0x6 + +#define LM3633_REG_PWM_CFG 0x2F +#define LM3633_PWM_A_MASK BIT(0) +#define LM3633_PWM_B_MASK BIT(1) + +#define LM3633_REG_BRT_HVLED_A_LSB 0x40 +#define LM3633_REG_BRT_HVLED_A_MSB 0x41 +#define LM3633_REG_BRT_HVLED_B_LSB 0x42 +#define LM3633_REG_BRT_HVLED_B_MSB 0x43 + +#define LM3633_REG_BRT_LVLED_BASE 0x44 + +#define LM3633_REG_PTN_DELAY 0x50 + +#define LM3633_REG_PTN_LOWTIME 0x51 + +#define LM3633_REG_PTN_HIGHTIME 0x52 + +#define LM3633_REG_PTN_LOWBRT 0x53 + +#define LM3633_REG_PTN_HIGHBRT LM3633_REG_BRT_LVLED_BASE + +#define LM3633_REG_BL_OPEN_FAULT_STATUS 0xB0 + +#define LM3633_REG_BL_SHORT_FAULT_STATUS 0xB2 + +#define LM3633_REG_MONITOR_ENABLE 0xB4 + +#define LM3633_MAX_REG 0xB4 + +/* LM3695 */ +#define LM3695_REG_GP 0x10 +#define LM3695_BL_CHANNEL_MASK BIT(3) +#define LM3695_BL_DUAL_CHANNEL 0 +#define LM3695_BL_SINGLE_CHANNEL BIT(3) +#define LM3695_BRT_RW_MASK BIT(2) +#define LM3695_BL_EN_MASK BIT(0) + +#define LM3695_REG_BRT_LSB 0x13 +#define LM3695_REG_BRT_MSB 0x14 + +#define LM3695_MAX_REG 0x14 + +/* LM3697 */ +#define LM3697_REG_HVLED_OUTPUT_CFG 0x10 +#define LM3697_HVLED1_CFG_MASK BIT(0) +#define LM3697_HVLED2_CFG_MASK BIT(1) +#define LM3697_HVLED3_CFG_MASK BIT(2) +#define LM3697_HVLED1_CFG_SHIFT 0 +#define LM3697_HVLED2_CFG_SHIFT 1 +#define LM3697_HVLED3_CFG_SHIFT 2 + +#define LM3697_REG_BL0_RAMP 0x11 +#define LM3697_REG_BL1_RAMP 0x12 +#define LM3697_RAMPUP_MASK 0xF0 +#define LM3697_RAMPUP_SHIFT 4 +#define LM3697_RAMPDN_MASK 0x0F +#define LM3697_RAMPDN_SHIFT 0 + +#define LM3697_REG_RAMP_CONF 0x14 +#define LM3697_RAMP_MASK 0x0F +#define LM3697_RAMP_EACH 0x05 + +#define LM3697_REG_PWM_CFG 0x1C +#define LM3697_PWM_A_MASK BIT(0) +#define LM3697_PWM_B_MASK BIT(1) + +#define LM3697_REG_IMAX_A 0x17 +#define LM3697_REG_IMAX_B 0x18 + +#define LM3697_REG_FEEDBACK_ENABLE 0x19 + +#define LM3697_REG_BRT_A_LSB 0x20 +#define LM3697_REG_BRT_A_MSB 0x21 +#define LM3697_REG_BRT_B_LSB 0x22 +#define LM3697_REG_BRT_B_MSB 0x23 + +#define LM3697_REG_ENABLE 0x24 + +#define LM3697_REG_OPEN_FAULT_STATUS 0xB0 + +#define LM3697_REG_SHORT_FAULT_STATUS 0xB2 + +#define 
LM3697_REG_MONITOR_ENABLE 0xB4 + +#define LM3697_MAX_REG 0xB4 +#endif diff --git a/include/linux/mfd/ti-lmu.h b/include/linux/mfd/ti-lmu.h new file mode 100644 index 000000000000..09d5f30384e5 --- /dev/null +++ b/include/linux/mfd/ti-lmu.h @@ -0,0 +1,87 @@ +/* + * TI LMU (Lighting Management Unit) Devices + * + * Copyright 2017 Texas Instruments + * + * Author: Milo Kim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __MFD_TI_LMU_H__ +#define __MFD_TI_LMU_H__ + +#include +#include +#include + +/* Notifier event */ +#define LMU_EVENT_MONITOR_DONE 0x01 + +enum ti_lmu_id { + LM3532, + LM3631, + LM3632, + LM3633, + LM3695, + LM3697, + LMU_MAX_ID, +}; + +enum ti_lmu_max_current { + LMU_IMAX_5mA, + LMU_IMAX_6mA, + LMU_IMAX_7mA = 0x03, + LMU_IMAX_8mA, + LMU_IMAX_9mA, + LMU_IMAX_10mA = 0x07, + LMU_IMAX_11mA, + LMU_IMAX_12mA, + LMU_IMAX_13mA, + LMU_IMAX_14mA, + LMU_IMAX_15mA = 0x0D, + LMU_IMAX_16mA, + LMU_IMAX_17mA, + LMU_IMAX_18mA, + LMU_IMAX_19mA, + LMU_IMAX_20mA = 0x13, + LMU_IMAX_21mA, + LMU_IMAX_22mA, + LMU_IMAX_23mA = 0x17, + LMU_IMAX_24mA, + LMU_IMAX_25mA, + LMU_IMAX_26mA, + LMU_IMAX_27mA = 0x1C, + LMU_IMAX_28mA, + LMU_IMAX_29mA, + LMU_IMAX_30mA, +}; + +enum lm363x_regulator_id { + LM3631_BOOST, /* Boost output */ + LM3631_LDO_CONT, /* Display panel controller */ + LM3631_LDO_OREF, /* Gamma reference */ + LM3631_LDO_POS, /* Positive display bias output */ + LM3631_LDO_NEG, /* Negative display bias output */ + LM3632_BOOST, /* Boost output */ + LM3632_LDO_POS, /* Positive display bias output */ + LM3632_LDO_NEG, /* Negative display bias output */ +}; + +/** + * struct ti_lmu + * + * @dev: Parent device pointer + * @regmap: Used for i2c communcation on accessing registers + * @en_gpio: GPIO for HWEN pin [Optional] + * @notifier: Notifier for reporting hwmon event + */ +struct ti_lmu { + struct device *dev; + struct regmap *regmap; + int en_gpio; + struct blocking_notifier_head notifier; +}; +#endif -- cgit v1.2.3 From ed7311f0d089553f39ff3e1e2d9f55f94324c42f Mon Sep 17 00:00:00 2001 From: Quentin Schulz Date: Mon, 20 Mar 2017 09:16:45 +0100 Subject: mfd: axp20x: Correct name of temperature data ADC registers The registers 0x56 and 0x57 of AXP22X PMIC store the value of the internal temperature of the PMIC. This patch modifies the name of these registers from AXP22X_PMIC_ADC_H/L to AXP22X_PMIC_TEMP_H/L so their purpose is clearer. 
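(Editorial aside, not part of the original patch: a hedged sketch of how a consumer might combine the renamed register pair. The 12-bit split, 8 MSBs in AXP22X_PMIC_TEMP_H and 4 LSBs in AXP22X_PMIC_TEMP_L, follows the usual AXP ADC layout and is an assumption here, as are the axp20x handle and the omitted error handling.)

unsigned int hi, lo, raw;

regmap_read(axp20x->regmap, AXP22X_PMIC_TEMP_H, &hi);
regmap_read(axp20x->regmap, AXP22X_PMIC_TEMP_L, &lo);
raw = (hi << 4) | (lo & 0xf);	/* assumed 12-bit ADC code; see the datasheet for scaling */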
Signed-off-by: Quentin Schulz Acked-by: Chen-Yu Tsai Acked-by: Maxime Ripard Signed-off-by: Lee Jones --- drivers/mfd/axp20x.c | 2 +- include/linux/mfd/axp20x.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c index 05129004ed90..e02edf1a9ad3 100644 --- a/drivers/mfd/axp20x.c +++ b/drivers/mfd/axp20x.c @@ -101,7 +101,7 @@ static const struct regmap_range axp22x_volatile_ranges[] = { regmap_reg_range(AXP20X_PWR_INPUT_STATUS, AXP20X_PWR_OP_MODE), regmap_reg_range(AXP20X_IRQ1_EN, AXP20X_IRQ5_STATE), regmap_reg_range(AXP22X_GPIO_STATE, AXP22X_GPIO_STATE), - regmap_reg_range(AXP22X_PMIC_ADC_H, AXP20X_IPSOUT_V_HIGH_L), + regmap_reg_range(AXP22X_PMIC_TEMP_H, AXP20X_IPSOUT_V_HIGH_L), regmap_reg_range(AXP20X_FG_RES, AXP20X_FG_RES), }; diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h index 0d9a1ff38393..dc8798cf2a24 100644 --- a/include/linux/mfd/axp20x.h +++ b/include/linux/mfd/axp20x.h @@ -228,8 +228,8 @@ enum axp20x_variants { #define AXP20X_OCV_MAX 0xf /* AXP22X specific registers */ -#define AXP22X_PMIC_ADC_H 0x56 -#define AXP22X_PMIC_ADC_L 0x57 +#define AXP22X_PMIC_TEMP_H 0x56 +#define AXP22X_PMIC_TEMP_L 0x57 #define AXP22X_TS_ADC_H 0x58 #define AXP22X_TS_ADC_L 0x59 #define AXP22X_BATLOW_THRES1 0xe6 -- cgit v1.2.3 From f1e34ad849ad78770af067fd8e409e61b018f9d0 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 17 Mar 2017 17:37:14 +0200 Subject: mfd: intel_soc_pmic_bxtwc: Move inclusion to c-file There is no need to include intel_soc_pmic.h into header which doesn't require it. Signed-off-by: Andy Shevchenko Signed-off-by: Lee Jones --- drivers/mfd/intel_soc_pmic_bxtwc.c | 1 + include/linux/mfd/intel_bxtwc.h | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/intel_soc_pmic_bxtwc.c b/drivers/mfd/intel_soc_pmic_bxtwc.c index 699c8c7c9052..c71db687b7ca 100644 --- a/drivers/mfd/intel_soc_pmic_bxtwc.c +++ b/drivers/mfd/intel_soc_pmic_bxtwc.c @@ -21,6 +21,7 @@ #include #include #include +#include #include /* PMIC device registers */ diff --git a/include/linux/mfd/intel_bxtwc.h b/include/linux/mfd/intel_bxtwc.h index 1a0ee9d6efe9..240d6752ec64 100644 --- a/include/linux/mfd/intel_bxtwc.h +++ b/include/linux/mfd/intel_bxtwc.h @@ -13,8 +13,6 @@ * more details. */ -#include - #ifndef __INTEL_BXTWC_H__ #define __INTEL_BXTWC_H__ -- cgit v1.2.3 From 0c227c51b98c03c6e7fb4f342f930cf576292064 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 17 Mar 2017 17:37:15 +0200 Subject: mfd: intel_soc_pmic_bxtwc: Rename header to follow c-file For better understanding of relationship between headers and modules rename: intel_bxtwc.h -> intel_soc_pmic_bxtwc.h While here, remove file name from the file itself. 
Signed-off-by: Andy Shevchenko Signed-off-by: Lee Jones --- drivers/mfd/intel_soc_pmic_bxtwc.c | 2 +- include/linux/mfd/intel_bxtwc.h | 67 -------------------------------- include/linux/mfd/intel_soc_pmic_bxtwc.h | 67 ++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 68 deletions(-) delete mode 100644 include/linux/mfd/intel_bxtwc.h create mode 100644 include/linux/mfd/intel_soc_pmic_bxtwc.h (limited to 'include/linux') diff --git a/drivers/mfd/intel_soc_pmic_bxtwc.c b/drivers/mfd/intel_soc_pmic_bxtwc.c index c71db687b7ca..1496a862baa6 100644 --- a/drivers/mfd/intel_soc_pmic_bxtwc.c +++ b/drivers/mfd/intel_soc_pmic_bxtwc.c @@ -20,8 +20,8 @@ #include #include #include -#include #include +#include #include /* PMIC device registers */ diff --git a/include/linux/mfd/intel_bxtwc.h b/include/linux/mfd/intel_bxtwc.h deleted file mode 100644 index 240d6752ec64..000000000000 --- a/include/linux/mfd/intel_bxtwc.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * intel_bxtwc.h - Header file for Intel Broxton Whiskey Cove PMIC - * - * Copyright (C) 2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - */ - -#ifndef __INTEL_BXTWC_H__ -#define __INTEL_BXTWC_H__ - -/* BXT WC devices */ -#define BXTWC_DEVICE1_ADDR 0x4E -#define BXTWC_DEVICE2_ADDR 0x4F -#define BXTWC_DEVICE3_ADDR 0x5E - -/* device1 Registers */ -#define BXTWC_CHIPID 0x4E00 -#define BXTWC_CHIPVER 0x4E01 - -#define BXTWC_SCHGRIRQ0_ADDR 0x5E1A -#define BXTWC_CHGRCTRL0_ADDR 0x5E16 -#define BXTWC_CHGRCTRL1_ADDR 0x5E17 -#define BXTWC_CHGRCTRL2_ADDR 0x5E18 -#define BXTWC_CHGRSTATUS_ADDR 0x5E19 -#define BXTWC_THRMBATZONE_ADDR 0x4F22 - -#define BXTWC_USBPATH_ADDR 0x5E19 -#define BXTWC_USBPHYCTRL_ADDR 0x5E07 -#define BXTWC_USBIDCTRL_ADDR 0x5E05 -#define BXTWC_USBIDEN_MASK 0x01 -#define BXTWC_USBIDSTAT_ADDR 0x00FF -#define BXTWC_USBSRCDETSTATUS_ADDR 0x5E29 - -#define BXTWC_DBGUSBBC1_ADDR 0x5FE0 -#define BXTWC_DBGUSBBC2_ADDR 0x5FE1 -#define BXTWC_DBGUSBBCSTAT_ADDR 0x5FE2 - -#define BXTWC_WAKESRC_ADDR 0x4E22 -#define BXTWC_WAKESRC2_ADDR 0x4EE5 -#define BXTWC_CHRTTADDR_ADDR 0x5E22 -#define BXTWC_CHRTTDATA_ADDR 0x5E23 - -#define BXTWC_STHRMIRQ0_ADDR 0x4F19 -#define WC_MTHRMIRQ1_ADDR 0x4E12 -#define WC_STHRMIRQ1_ADDR 0x4F1A -#define WC_STHRMIRQ2_ADDR 0x4F1B - -#define BXTWC_THRMZN0H_ADDR 0x4F44 -#define BXTWC_THRMZN0L_ADDR 0x4F45 -#define BXTWC_THRMZN1H_ADDR 0x4F46 -#define BXTWC_THRMZN1L_ADDR 0x4F47 -#define BXTWC_THRMZN2H_ADDR 0x4F48 -#define BXTWC_THRMZN2L_ADDR 0x4F49 -#define BXTWC_THRMZN3H_ADDR 0x4F4A -#define BXTWC_THRMZN3L_ADDR 0x4F4B -#define BXTWC_THRMZN4H_ADDR 0x4F4C -#define BXTWC_THRMZN4L_ADDR 0x4F4D - -#endif diff --git a/include/linux/mfd/intel_soc_pmic_bxtwc.h b/include/linux/mfd/intel_soc_pmic_bxtwc.h new file mode 100644 index 000000000000..0c351bc85d2d --- /dev/null +++ b/include/linux/mfd/intel_soc_pmic_bxtwc.h @@ -0,0 +1,67 @@ +/* + * Header file for Intel Broxton Whiskey Cove PMIC + * + * Copyright (C) 2015 Intel Corporation. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef __INTEL_BXTWC_H__ +#define __INTEL_BXTWC_H__ + +/* BXT WC devices */ +#define BXTWC_DEVICE1_ADDR 0x4E +#define BXTWC_DEVICE2_ADDR 0x4F +#define BXTWC_DEVICE3_ADDR 0x5E + +/* device1 Registers */ +#define BXTWC_CHIPID 0x4E00 +#define BXTWC_CHIPVER 0x4E01 + +#define BXTWC_SCHGRIRQ0_ADDR 0x5E1A +#define BXTWC_CHGRCTRL0_ADDR 0x5E16 +#define BXTWC_CHGRCTRL1_ADDR 0x5E17 +#define BXTWC_CHGRCTRL2_ADDR 0x5E18 +#define BXTWC_CHGRSTATUS_ADDR 0x5E19 +#define BXTWC_THRMBATZONE_ADDR 0x4F22 + +#define BXTWC_USBPATH_ADDR 0x5E19 +#define BXTWC_USBPHYCTRL_ADDR 0x5E07 +#define BXTWC_USBIDCTRL_ADDR 0x5E05 +#define BXTWC_USBIDEN_MASK 0x01 +#define BXTWC_USBIDSTAT_ADDR 0x00FF +#define BXTWC_USBSRCDETSTATUS_ADDR 0x5E29 + +#define BXTWC_DBGUSBBC1_ADDR 0x5FE0 +#define BXTWC_DBGUSBBC2_ADDR 0x5FE1 +#define BXTWC_DBGUSBBCSTAT_ADDR 0x5FE2 + +#define BXTWC_WAKESRC_ADDR 0x4E22 +#define BXTWC_WAKESRC2_ADDR 0x4EE5 +#define BXTWC_CHRTTADDR_ADDR 0x5E22 +#define BXTWC_CHRTTDATA_ADDR 0x5E23 + +#define BXTWC_STHRMIRQ0_ADDR 0x4F19 +#define WC_MTHRMIRQ1_ADDR 0x4E12 +#define WC_STHRMIRQ1_ADDR 0x4F1A +#define WC_STHRMIRQ2_ADDR 0x4F1B + +#define BXTWC_THRMZN0H_ADDR 0x4F44 +#define BXTWC_THRMZN0L_ADDR 0x4F45 +#define BXTWC_THRMZN1H_ADDR 0x4F46 +#define BXTWC_THRMZN1L_ADDR 0x4F47 +#define BXTWC_THRMZN2H_ADDR 0x4F48 +#define BXTWC_THRMZN2L_ADDR 0x4F49 +#define BXTWC_THRMZN3H_ADDR 0x4F4A +#define BXTWC_THRMZN3L_ADDR 0x4F4B +#define BXTWC_THRMZN4H_ADDR 0x4F4C +#define BXTWC_THRMZN4L_ADDR 0x4F4D + +#endif -- cgit v1.2.3 From fe9d7cb22ef3a26a74e49730c0efdbdae4b17d4b Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Thu, 16 Mar 2017 09:30:28 +0100 Subject: mfd: syscon: atmel-smc: Add new helpers to ease SMC regs manipulation These new helpers + macro definitions are meant to replace the old ones which are unpractical to use. Note that the macros and function prefixes have been intentionally changed to ATMEL_[H]SMC_XX and atmel_[h]smc_ to reflect the fact that this IP is also embedded in avr32 SoCs (and not only in at91 ones). Signed-off-by: Boris Brezillon Acked-by: Nicolas Ferre Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 4 + drivers/mfd/Makefile | 1 + drivers/mfd/atmel-smc.c | 314 +++++++++++++++++++++++++++++++++++ include/linux/mfd/syscon/atmel-smc.h | 87 ++++++++++ 4 files changed, 406 insertions(+) create mode 100644 drivers/mfd/atmel-smc.c (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index add0c35c38c5..ce3a9180a765 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -121,6 +121,10 @@ config MFD_ATMEL_HLCDC additional drivers must be enabled in order to use the functionality of the device. 
+config MFD_ATMEL_SMC + bool + select MFD_SYSCON + config MFD_BCM590XX tristate "Broadcom BCM590xx PMUs" select MFD_CORE diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 294a0525aeb1..fa86dbe65e52 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -185,6 +185,7 @@ obj-$(CONFIG_MFD_TPS65090) += tps65090.o obj-$(CONFIG_MFD_AAT2870_CORE) += aat2870-core.o obj-$(CONFIG_MFD_ATMEL_FLEXCOM) += atmel-flexcom.o obj-$(CONFIG_MFD_ATMEL_HLCDC) += atmel-hlcdc.o +obj-$(CONFIG_MFD_ATMEL_SMC) += atmel-smc.o obj-$(CONFIG_MFD_INTEL_LPSS) += intel-lpss.o obj-$(CONFIG_MFD_INTEL_LPSS_PCI) += intel-lpss-pci.o obj-$(CONFIG_MFD_INTEL_LPSS_ACPI) += intel-lpss-acpi.o diff --git a/drivers/mfd/atmel-smc.c b/drivers/mfd/atmel-smc.c new file mode 100644 index 000000000000..954cf0f66a31 --- /dev/null +++ b/drivers/mfd/atmel-smc.c @@ -0,0 +1,314 @@ +/* + * Atmel SMC (Static Memory Controller) helper functions. + * + * Copyright (C) 2017 Atmel + * Copyright (C) 2017 Free Electrons + * + * Author: Boris Brezillon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include + +/** + * atmel_smc_cs_conf_init - initialize a SMC CS conf + * @conf: the SMC CS conf to initialize + * + * Set all fields to 0 so that one can start defining a new config. + */ +void atmel_smc_cs_conf_init(struct atmel_smc_cs_conf *conf) +{ + memset(conf, 0, sizeof(*conf)); +} +EXPORT_SYMBOL_GPL(atmel_smc_cs_conf_init); + +/** + * atmel_smc_cs_encode_ncycles - encode a number of MCK clk cycles in the + * format expected by the SMC engine + * @ncycles: number of MCK clk cycles + * @msbpos: position of the MSB part of the timing field + * @msbwidth: width of the MSB part of the timing field + * @msbfactor: factor applied to the MSB + * @encodedval: param used to store the encoding result + * + * This function encodes the @ncycles value as described in the datasheet + * (section "SMC Setup/Pulse/Cycle/Timings Register"). This is a generic + * helper which is called with different parameters depending on the encoding + * scheme. + * + * If the @ncycles value is too big to be encoded, -ERANGE is returned and + * @encodedval contains the maximum value. Otherwise, 0 is returned. + */ +static int atmel_smc_cs_encode_ncycles(unsigned int ncycles, + unsigned int msbpos, + unsigned int msbwidth, + unsigned int msbfactor, + unsigned int *encodedval) +{ + unsigned int lsbmask = GENMASK(msbpos - 1, 0); + unsigned int msbmask = GENMASK(msbwidth - 1, 0); + unsigned int msb, lsb; + int ret = 0; + + msb = ncycles / msbfactor; + lsb = ncycles % msbfactor; + + if (lsb > lsbmask) { + lsb = 0; + msb++; + } + + /* + * Let's just put the maximum we can if the requested setting does + * not fit in the register field. + * We still return -ERANGE in case the caller cares. + */ + if (msb > msbmask) { + msb = msbmask; + lsb = lsbmask; + ret = -ERANGE; + } + + *encodedval = (msb << msbpos) | lsb; + + return ret; +} + +/** + * atmel_smc_cs_conf_set_timing - set the SMC CS conf Txx parameter to a + * specific value + * @conf: SMC CS conf descriptor + * @shift: the position of the Txx field in the TIMINGS register + * @ncycles: value (expressed in MCK clk cycles) to assign to this Txx + * parameter + * + * This function encodes the @ncycles value as described in the datasheet + * (section "SMC Timings Register"), and then stores the result in the + * @conf->timings field at @shift position.
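+ * + * (Editorial worked example, not in the original patch: with the "ncycles = (Txx[3] * 64) + Txx[2:0]" encoding used here, requesting 70 cycles gives msb = 70 / 64 = 1 and lsb = 70 % 64 = 6, both of which fit their fields, so the stored value is (1 << 3) | 6 = 0xe, which the SMC decodes back to 64 + 6 = 70 cycles.)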
+ *
+ * Returns -EINVAL if @shift is invalid, -ERANGE if @ncycles does not fit in
+ * the field, and 0 otherwise.
+ */
+int atmel_smc_cs_conf_set_timing(struct atmel_smc_cs_conf *conf,
+				 unsigned int shift, unsigned int ncycles)
+{
+	unsigned int val;
+	int ret;
+
+	if (shift != ATMEL_HSMC_TIMINGS_TCLR_SHIFT &&
+	    shift != ATMEL_HSMC_TIMINGS_TADL_SHIFT &&
+	    shift != ATMEL_HSMC_TIMINGS_TAR_SHIFT &&
+	    shift != ATMEL_HSMC_TIMINGS_TRR_SHIFT &&
+	    shift != ATMEL_HSMC_TIMINGS_TWB_SHIFT)
+		return -EINVAL;
+
+	/*
+	 * The formula described in atmel datasheets (section "HSMC Timings
+	 * Register"):
+	 *
+	 * ncycles = (Txx[3] * 64) + Txx[2:0]
+	 */
+	ret = atmel_smc_cs_encode_ncycles(ncycles, 3, 1, 64, &val);
+	conf->timings &= ~GENMASK(shift + 3, shift);
+	conf->timings |= val << shift;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(atmel_smc_cs_conf_set_timing);
+
+/**
+ * atmel_smc_cs_conf_set_setup - set the SMC CS conf xx_SETUP parameter to a
+ *				 specific value
+ * @conf: SMC CS conf descriptor
+ * @shift: the position of the xx_SETUP field in the SETUP register
+ * @ncycles: value (expressed in MCK clk cycles) to assign to this xx_SETUP
+ *	     parameter
+ *
+ * This function encodes the @ncycles value as described in the datasheet
+ * (section "SMC Setup Register"), and then stores the result in the
+ * @conf->setup field at @shift position.
+ *
+ * Returns -EINVAL if @shift is invalid, -ERANGE if @ncycles does not fit in
+ * the field, and 0 otherwise.
+ */
+int atmel_smc_cs_conf_set_setup(struct atmel_smc_cs_conf *conf,
+				unsigned int shift, unsigned int ncycles)
+{
+	unsigned int val;
+	int ret;
+
+	if (shift != ATMEL_SMC_NWE_SHIFT && shift != ATMEL_SMC_NCS_WR_SHIFT &&
+	    shift != ATMEL_SMC_NRD_SHIFT && shift != ATMEL_SMC_NCS_RD_SHIFT)
+		return -EINVAL;
+
+	/*
+	 * The formula described in atmel datasheets (section "SMC Setup
+	 * Register"):
+	 *
+	 * ncycles = (128 * xx_SETUP[5]) + xx_SETUP[4:0]
+	 */
+	ret = atmel_smc_cs_encode_ncycles(ncycles, 5, 1, 128, &val);
+	conf->setup &= ~GENMASK(shift + 7, shift);
+	conf->setup |= val << shift;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(atmel_smc_cs_conf_set_setup);
+
+/**
+ * atmel_smc_cs_conf_set_pulse - set the SMC CS conf xx_PULSE parameter to a
+ *				 specific value
+ * @conf: SMC CS conf descriptor
+ * @shift: the position of the xx_PULSE field in the PULSE register
+ * @ncycles: value (expressed in MCK clk cycles) to assign to this xx_PULSE
+ *	     parameter
+ *
+ * This function encodes the @ncycles value as described in the datasheet
+ * (section "SMC Pulse Register"), and then stores the result in the
+ * @conf->pulse field at @shift position.
+ *
+ * Returns -EINVAL if @shift is invalid, -ERANGE if @ncycles does not fit in
+ * the field, and 0 otherwise.
+ */
+int atmel_smc_cs_conf_set_pulse(struct atmel_smc_cs_conf *conf,
+				unsigned int shift, unsigned int ncycles)
+{
+	unsigned int val;
+	int ret;
+
+	if (shift != ATMEL_SMC_NWE_SHIFT && shift != ATMEL_SMC_NCS_WR_SHIFT &&
+	    shift != ATMEL_SMC_NRD_SHIFT && shift != ATMEL_SMC_NCS_RD_SHIFT)
+		return -EINVAL;
+
+	/*
+	 * The formula described in atmel datasheets (section "SMC Pulse
+	 * Register"):
+	 *
+	 * ncycles = (256 * xx_PULSE[6]) + xx_PULSE[5:0]
+	 */
+	ret = atmel_smc_cs_encode_ncycles(ncycles, 6, 1, 256, &val);
+	conf->pulse &= ~GENMASK(shift + 7, shift);
+	conf->pulse |= val << shift;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(atmel_smc_cs_conf_set_pulse);
+
+/**
+ * atmel_smc_cs_conf_set_cycle - set the SMC CS conf xx_CYCLE parameter to a
+ *				 specific value
+ * @conf: SMC CS conf descriptor
+ * @shift: the position of the xx_CYCLE field in the CYCLE register
+ * @ncycles: value (expressed in MCK clk cycles) to assign to this xx_CYCLE
+ *	     parameter
+ *
+ * This function encodes the @ncycles value as described in the datasheet
+ * (section "SMC Cycle Register"), and then stores the result in the
+ * @conf->cycle field at @shift position.
+ *
+ * Returns -EINVAL if @shift is invalid, -ERANGE if @ncycles does not fit in
+ * the field, and 0 otherwise.
+ */
+int atmel_smc_cs_conf_set_cycle(struct atmel_smc_cs_conf *conf,
+				unsigned int shift, unsigned int ncycles)
+{
+	unsigned int val;
+	int ret;
+
+	if (shift != ATMEL_SMC_NWE_SHIFT && shift != ATMEL_SMC_NRD_SHIFT)
+		return -EINVAL;
+
+	/*
+	 * The formula described in atmel datasheets (section "SMC Cycle
+	 * Register"):
+	 *
+	 * ncycles = (xx_CYCLE[8:7] * 256) + xx_CYCLE[6:0]
+	 */
+	ret = atmel_smc_cs_encode_ncycles(ncycles, 7, 2, 256, &val);
+	conf->cycle &= ~GENMASK(shift + 15, shift);
+	conf->cycle |= val << shift;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(atmel_smc_cs_conf_set_cycle);
+
+/**
+ * atmel_smc_cs_conf_apply - apply an SMC CS conf
+ * @regmap: the SMC regmap
+ * @cs: the CS id
+ * @conf: the SMC CS conf to apply
+ *
+ * Applies an SMC CS configuration.
+ * Only valid on at91sam9/avr32 SoCs.
+ */
+void atmel_smc_cs_conf_apply(struct regmap *regmap, int cs,
+			     const struct atmel_smc_cs_conf *conf)
+{
+	regmap_write(regmap, ATMEL_SMC_SETUP(cs), conf->setup);
+	regmap_write(regmap, ATMEL_SMC_PULSE(cs), conf->pulse);
+	regmap_write(regmap, ATMEL_SMC_CYCLE(cs), conf->cycle);
+	regmap_write(regmap, ATMEL_SMC_MODE(cs), conf->mode);
+}
+EXPORT_SYMBOL_GPL(atmel_smc_cs_conf_apply);
+
+/**
+ * atmel_hsmc_cs_conf_apply - apply an SMC CS conf
+ * @regmap: the HSMC regmap
+ * @cs: the CS id
+ * @conf: the SMC CS conf to apply
+ *
+ * Applies an SMC CS configuration.
+ * Only valid on post-sama5 SoCs.
+ */
+void atmel_hsmc_cs_conf_apply(struct regmap *regmap, int cs,
+			      const struct atmel_smc_cs_conf *conf)
+{
+	regmap_write(regmap, ATMEL_HSMC_SETUP(cs), conf->setup);
+	regmap_write(regmap, ATMEL_HSMC_PULSE(cs), conf->pulse);
+	regmap_write(regmap, ATMEL_HSMC_CYCLE(cs), conf->cycle);
+	regmap_write(regmap, ATMEL_HSMC_TIMINGS(cs), conf->timings);
+	regmap_write(regmap, ATMEL_HSMC_MODE(cs), conf->mode);
+}
+EXPORT_SYMBOL_GPL(atmel_hsmc_cs_conf_apply);
+
+/**
+ * atmel_smc_cs_conf_get - retrieve the current SMC CS conf
+ * @regmap: the SMC regmap
+ * @cs: the CS id
+ * @conf: the SMC CS conf object to store the current conf
+ *
+ * Retrieve the SMC CS configuration.
+ * Only valid on at91sam9/avr32 SoCs.
+ */ +void atmel_smc_cs_conf_get(struct regmap *regmap, int cs, + struct atmel_smc_cs_conf *conf) +{ + regmap_read(regmap, ATMEL_SMC_SETUP(cs), &conf->setup); + regmap_read(regmap, ATMEL_SMC_PULSE(cs), &conf->pulse); + regmap_read(regmap, ATMEL_SMC_CYCLE(cs), &conf->cycle); + regmap_read(regmap, ATMEL_SMC_MODE(cs), &conf->mode); +} +EXPORT_SYMBOL_GPL(atmel_smc_cs_conf_get); + +/** + * atmel_hsmc_cs_conf_get - retrieve the current SMC CS conf + * @regmap: the HSMC regmap + * @cs: the CS id + * @conf: the SMC CS conf object to store the current conf + * + * Retrieve the SMC CS configuration. + * Only valid on post-sama5 SoCs. + */ +void atmel_hsmc_cs_conf_get(struct regmap *regmap, int cs, + struct atmel_smc_cs_conf *conf) +{ + regmap_read(regmap, ATMEL_HSMC_SETUP(cs), &conf->setup); + regmap_read(regmap, ATMEL_HSMC_PULSE(cs), &conf->pulse); + regmap_read(regmap, ATMEL_HSMC_CYCLE(cs), &conf->cycle); + regmap_read(regmap, ATMEL_HSMC_TIMINGS(cs), &conf->timings); + regmap_read(regmap, ATMEL_HSMC_MODE(cs), &conf->mode); +} +EXPORT_SYMBOL_GPL(atmel_hsmc_cs_conf_get); diff --git a/include/linux/mfd/syscon/atmel-smc.h b/include/linux/mfd/syscon/atmel-smc.h index be6ebe64eebe..00e6e3c8ee6f 100644 --- a/include/linux/mfd/syscon/atmel-smc.h +++ b/include/linux/mfd/syscon/atmel-smc.h @@ -69,6 +69,93 @@ #define AT91_SMC_PS_16 (2 << 28) #define AT91_SMC_PS_32 (3 << 28) +#define ATMEL_SMC_SETUP(cs) (((cs) * 0x10)) +#define ATMEL_HSMC_SETUP(cs) (0x600 + ((cs) * 0x14)) +#define ATMEL_SMC_PULSE(cs) (((cs) * 0x10) + 0x4) +#define ATMEL_HSMC_PULSE(cs) (0x600 + ((cs) * 0x14) + 0x4) +#define ATMEL_SMC_CYCLE(cs) (((cs) * 0x10) + 0x8) +#define ATMEL_HSMC_CYCLE(cs) (0x600 + ((cs) * 0x14) + 0x8) +#define ATMEL_SMC_NWE_SHIFT 0 +#define ATMEL_SMC_NCS_WR_SHIFT 8 +#define ATMEL_SMC_NRD_SHIFT 16 +#define ATMEL_SMC_NCS_RD_SHIFT 24 + +#define ATMEL_SMC_MODE(cs) (((cs) * 0x10) + 0xc) +#define ATMEL_HSMC_MODE(cs) (0x600 + ((cs) * 0x14) + 0x10) +#define ATMEL_SMC_MODE_READMODE_MASK BIT(0) +#define ATMEL_SMC_MODE_READMODE_NCS (0 << 0) +#define ATMEL_SMC_MODE_READMODE_NRD (1 << 0) +#define ATMEL_SMC_MODE_WRITEMODE_MASK BIT(1) +#define ATMEL_SMC_MODE_WRITEMODE_NCS (0 << 1) +#define ATMEL_SMC_MODE_WRITEMODE_NWE (1 << 1) +#define ATMEL_SMC_MODE_EXNWMODE_MASK GENMASK(5, 4) +#define ATMEL_SMC_MODE_EXNWMODE_DISABLE (0 << 4) +#define ATMEL_SMC_MODE_EXNWMODE_FROZEN (2 << 4) +#define ATMEL_SMC_MODE_EXNWMODE_READY (3 << 4) +#define ATMEL_SMC_MODE_BAT_MASK BIT(8) +#define ATMEL_SMC_MODE_BAT_SELECT (0 << 8) +#define ATMEL_SMC_MODE_BAT_WRITE (1 << 8) +#define ATMEL_SMC_MODE_DBW_MASK GENMASK(13, 12) +#define ATMEL_SMC_MODE_DBW_8 (0 << 12) +#define ATMEL_SMC_MODE_DBW_16 (1 << 12) +#define ATMEL_SMC_MODE_DBW_32 (2 << 12) +#define ATMEL_SMC_MODE_TDF_MASK GENMASK(19, 16) +#define ATMEL_SMC_MODE_TDF(x) (((x) - 1) << 16) +#define ATMEL_SMC_MODE_TDF_MAX 16 +#define ATMEL_SMC_MODE_TDF_MIN 1 +#define ATMEL_SMC_MODE_TDFMODE_OPTIMIZED BIT(20) +#define ATMEL_SMC_MODE_PMEN BIT(24) +#define ATMEL_SMC_MODE_PS_MASK GENMASK(29, 28) +#define ATMEL_SMC_MODE_PS_4 (0 << 28) +#define ATMEL_SMC_MODE_PS_8 (1 << 28) +#define ATMEL_SMC_MODE_PS_16 (2 << 28) +#define ATMEL_SMC_MODE_PS_32 (3 << 28) + +#define ATMEL_HSMC_TIMINGS(cs) (0x600 + ((cs) * 0x14) + 0xc) +#define ATMEL_HSMC_TIMINGS_OCMS BIT(12) +#define ATMEL_HSMC_TIMINGS_RBNSEL(x) ((x) << 28) +#define ATMEL_HSMC_TIMINGS_NFSEL BIT(31) +#define ATMEL_HSMC_TIMINGS_TCLR_SHIFT 0 +#define ATMEL_HSMC_TIMINGS_TADL_SHIFT 4 +#define ATMEL_HSMC_TIMINGS_TAR_SHIFT 8 +#define ATMEL_HSMC_TIMINGS_TRR_SHIFT 16 +#define 
ATMEL_HSMC_TIMINGS_TWB_SHIFT 24
+
+/**
+ * struct atmel_smc_cs_conf - SMC CS config as described in the datasheet.
+ * @setup: NCS/NWE/NRD setup timings (not applicable to at91rm9200)
+ * @pulse: NCS/NWE/NRD pulse timings (not applicable to at91rm9200)
+ * @cycle: NWE/NRD cycle timings (not applicable to at91rm9200)
+ * @timings: advanced NAND related timings (only applicable to HSMC)
+ * @mode: all kinds of config parameters (see the field definitions above).
+ *	  The mode fields are different on at91rm9200
+ */
+struct atmel_smc_cs_conf {
+	u32 setup;
+	u32 pulse;
+	u32 cycle;
+	u32 timings;
+	u32 mode;
+};
+
+void atmel_smc_cs_conf_init(struct atmel_smc_cs_conf *conf);
+int atmel_smc_cs_conf_set_timing(struct atmel_smc_cs_conf *conf,
+				 unsigned int shift,
+				 unsigned int ncycles);
+int atmel_smc_cs_conf_set_setup(struct atmel_smc_cs_conf *conf,
+				unsigned int shift, unsigned int ncycles);
+int atmel_smc_cs_conf_set_pulse(struct atmel_smc_cs_conf *conf,
+				unsigned int shift, unsigned int ncycles);
+int atmel_smc_cs_conf_set_cycle(struct atmel_smc_cs_conf *conf,
+				unsigned int shift, unsigned int ncycles);
+void atmel_smc_cs_conf_apply(struct regmap *regmap, int cs,
+			     const struct atmel_smc_cs_conf *conf);
+void atmel_hsmc_cs_conf_apply(struct regmap *regmap, int cs,
+			      const struct atmel_smc_cs_conf *conf);
+void atmel_smc_cs_conf_get(struct regmap *regmap, int cs,
+			   struct atmel_smc_cs_conf *conf);
+void atmel_hsmc_cs_conf_get(struct regmap *regmap, int cs,
+			    struct atmel_smc_cs_conf *conf);

 /*
  * This function converts a setup timing expressed in nanoseconds into an
-- cgit v1.2.3

From 0d69080d9e01d5d60f1887def2080ce3f66f5856 Mon Sep 17 00:00:00 2001
From: Boris Brezillon
Date: Thu, 16 Mar 2017 09:30:31 +0100
Subject: mfd: syscon: atmel-smc: Remove unused helpers/macros

All macros prefixed with AT91[SAM9]_SMC have been replaced by equivalent
definitions prefixed with ATMEL_SMC, and the at91sam9_smc_xxxx() helpers
are no longer used.

Drop these definitions before someone starts using them again.
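For reference, the replacement API added by the previous patch is used
along these lines (a minimal sketch, not part of either patch; the
chip-select number, cycle counts and mode bits are made-up values, and
the -EINVAL/-ERANGE return codes are ignored for brevity):

#include <linux/mfd/syscon/atmel-smc.h>
#include <linux/regmap.h>

/* Sketch: program chip select 3 of an at91sam9/avr32 SMC. */
static void example_smc_apply(struct regmap *smc)
{
	struct atmel_smc_cs_conf conf;

	atmel_smc_cs_conf_init(&conf);

	/* All ncycles arguments are expressed in MCK clk cycles. */
	atmel_smc_cs_conf_set_setup(&conf, ATMEL_SMC_NWE_SHIFT, 1);
	atmel_smc_cs_conf_set_pulse(&conf, ATMEL_SMC_NWE_SHIFT, 4);
	atmel_smc_cs_conf_set_cycle(&conf, ATMEL_SMC_NWE_SHIFT, 7);

	/* Mode bits come straight from the definitions in atmel-smc.h. */
	conf.mode = ATMEL_SMC_MODE_READMODE_NRD | ATMEL_SMC_MODE_WRITEMODE_NWE;

	atmel_smc_cs_conf_apply(smc, 3, &conf);
}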
Signed-off-by: Boris Brezillon Acked-by: Nicolas Ferre Signed-off-by: Lee Jones --- include/linux/mfd/syscon/atmel-smc.h | 152 ----------------------------------- 1 file changed, 152 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/syscon/atmel-smc.h b/include/linux/mfd/syscon/atmel-smc.h index 00e6e3c8ee6f..afa266169800 100644 --- a/include/linux/mfd/syscon/atmel-smc.h +++ b/include/linux/mfd/syscon/atmel-smc.h @@ -17,58 +17,6 @@ #include #include -#define AT91SAM9_SMC_GENERIC 0x00 -#define AT91SAM9_SMC_GENERIC_BLK_SZ 0x10 - -#define SAMA5_SMC_GENERIC 0x600 -#define SAMA5_SMC_GENERIC_BLK_SZ 0x14 - -#define AT91SAM9_SMC_SETUP(o) ((o) + 0x00) -#define AT91SAM9_SMC_NWESETUP(x) (x) -#define AT91SAM9_SMC_NCS_WRSETUP(x) ((x) << 8) -#define AT91SAM9_SMC_NRDSETUP(x) ((x) << 16) -#define AT91SAM9_SMC_NCS_NRDSETUP(x) ((x) << 24) - -#define AT91SAM9_SMC_PULSE(o) ((o) + 0x04) -#define AT91SAM9_SMC_NWEPULSE(x) (x) -#define AT91SAM9_SMC_NCS_WRPULSE(x) ((x) << 8) -#define AT91SAM9_SMC_NRDPULSE(x) ((x) << 16) -#define AT91SAM9_SMC_NCS_NRDPULSE(x) ((x) << 24) - -#define AT91SAM9_SMC_CYCLE(o) ((o) + 0x08) -#define AT91SAM9_SMC_NWECYCLE(x) (x) -#define AT91SAM9_SMC_NRDCYCLE(x) ((x) << 16) - -#define AT91SAM9_SMC_MODE(o) ((o) + 0x0c) -#define SAMA5_SMC_MODE(o) ((o) + 0x10) -#define AT91_SMC_READMODE BIT(0) -#define AT91_SMC_READMODE_NCS (0 << 0) -#define AT91_SMC_READMODE_NRD (1 << 0) -#define AT91_SMC_WRITEMODE BIT(1) -#define AT91_SMC_WRITEMODE_NCS (0 << 1) -#define AT91_SMC_WRITEMODE_NWE (1 << 1) -#define AT91_SMC_EXNWMODE GENMASK(5, 4) -#define AT91_SMC_EXNWMODE_DISABLE (0 << 4) -#define AT91_SMC_EXNWMODE_FROZEN (2 << 4) -#define AT91_SMC_EXNWMODE_READY (3 << 4) -#define AT91_SMC_BAT BIT(8) -#define AT91_SMC_BAT_SELECT (0 << 8) -#define AT91_SMC_BAT_WRITE (1 << 8) -#define AT91_SMC_DBW GENMASK(13, 12) -#define AT91_SMC_DBW_8 (0 << 12) -#define AT91_SMC_DBW_16 (1 << 12) -#define AT91_SMC_DBW_32 (2 << 12) -#define AT91_SMC_TDF GENMASK(19, 16) -#define AT91_SMC_TDF_(x) ((((x) - 1) << 16) & AT91_SMC_TDF) -#define AT91_SMC_TDF_MAX 16 -#define AT91_SMC_TDFMODE_OPTIMIZED BIT(20) -#define AT91_SMC_PMEN BIT(24) -#define AT91_SMC_PS GENMASK(29, 28) -#define AT91_SMC_PS_4 (0 << 28) -#define AT91_SMC_PS_8 (1 << 28) -#define AT91_SMC_PS_16 (2 << 28) -#define AT91_SMC_PS_32 (3 << 28) - #define ATMEL_SMC_SETUP(cs) (((cs) * 0x10)) #define ATMEL_HSMC_SETUP(cs) (0x600 + ((cs) * 0x14)) #define ATMEL_SMC_PULSE(cs) (((cs) * 0x10) + 0x4) @@ -157,104 +105,4 @@ void atmel_smc_cs_conf_get(struct regmap *regmap, int cs, void atmel_hsmc_cs_conf_get(struct regmap *regmap, int cs, struct atmel_smc_cs_conf *conf); -/* - * This function converts a setup timing expressed in nanoseconds into an - * encoded value that can be written in the SMC_SETUP register. - * - * The following formula is described in atmel datasheets (section - * "SMC Setup Register"): - * - * setup length = (128* SETUP[5] + SETUP[4:0]) - * - * where setup length is the timing expressed in cycles. - */ -static inline u32 at91sam9_smc_setup_ns_to_cycles(unsigned int clk_rate, - u32 timing_ns) -{ - u32 clk_period = DIV_ROUND_UP(NSEC_PER_SEC, clk_rate); - u32 coded_cycles = 0; - u32 cycles; - - cycles = DIV_ROUND_UP(timing_ns, clk_period); - if (cycles / 32) { - coded_cycles |= 1 << 5; - if (cycles < 128) - cycles = 0; - } - - coded_cycles |= cycles % 32; - - return coded_cycles; -} - -/* - * This function converts a pulse timing expressed in nanoseconds into an - * encoded value that can be written in the SMC_PULSE register. 
- * - * The following formula is described in atmel datasheets (section - * "SMC Pulse Register"): - * - * pulse length = (256* PULSE[6] + PULSE[5:0]) - * - * where pulse length is the timing expressed in cycles. - */ -static inline u32 at91sam9_smc_pulse_ns_to_cycles(unsigned int clk_rate, - u32 timing_ns) -{ - u32 clk_period = DIV_ROUND_UP(NSEC_PER_SEC, clk_rate); - u32 coded_cycles = 0; - u32 cycles; - - cycles = DIV_ROUND_UP(timing_ns, clk_period); - if (cycles / 64) { - coded_cycles |= 1 << 6; - if (cycles < 256) - cycles = 0; - } - - coded_cycles |= cycles % 64; - - return coded_cycles; -} - -/* - * This function converts a cycle timing expressed in nanoseconds into an - * encoded value that can be written in the SMC_CYCLE register. - * - * The following formula is described in atmel datasheets (section - * "SMC Cycle Register"): - * - * cycle length = (CYCLE[8:7]*256 + CYCLE[6:0]) - * - * where cycle length is the timing expressed in cycles. - */ -static inline u32 at91sam9_smc_cycle_ns_to_cycles(unsigned int clk_rate, - u32 timing_ns) -{ - u32 clk_period = DIV_ROUND_UP(NSEC_PER_SEC, clk_rate); - u32 coded_cycles = 0; - u32 cycles; - - cycles = DIV_ROUND_UP(timing_ns, clk_period); - if (cycles / 128) { - coded_cycles = cycles / 256; - cycles %= 256; - if (cycles >= 128) { - coded_cycles++; - cycles = 0; - } - - if (coded_cycles > 0x3) { - coded_cycles = 0x3; - cycles = 0x7f; - } - - coded_cycles <<= 7; - } - - coded_cycles |= cycles % 128; - - return coded_cycles; -} - #endif /* _LINUX_MFD_SYSCON_ATMEL_SMC_H_ */ -- cgit v1.2.3 From 656211b1dfb9e0b68d4e634931432e29a6facf46 Mon Sep 17 00:00:00 2001 From: Steve Twiss Date: Mon, 3 Apr 2017 15:46:40 +0100 Subject: mfd: Add support for DA9061 MFD support for DA9061 is provided as part of the DA9062 device driver. The registers header file adds two new chip variant IDs defined in DA9061 and DA9062 hardware. The core header file adds new software enumerations for listing the valid DA9061 IRQs and a da9062_compatible_types enumeration for distinguishing between DA9061/62 devices in software. The core source code adds a new .compatible of_device_id entry. This is extended from DA9062 to support both "dlg,da9061" and "dlg,da9062". The .data entry now holds a reference to the enumerated device type. A new regmap_irq_chip model is added for DA9061 and this supports the new list of regmap_irq entries. A new mfd_cell da9061_devs[] array lists the new sub system components for DA9061. Support is added for a new DA9061 regmap_config which lists the correct readable, writable and volatile ranges for this chip. The probe function uses the device tree compatible string to switch on the da9062_compatible_types and configure the correct mfd cells, irq chip and regmap config. Kconfig is updated to reflect support for DA9061 and DA9062 PMICs. 
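The mfd_cell entries in the diff below export their interrupts as named
IORESOURCE_IRQ resources, which mfd_add_devices() maps against irq_base;
each sub-device driver then looks its IRQ up by name. A hypothetical
sketch of that consumer side (illustrative only; the real sub-device
drivers live outside this diff, and the handler body is a placeholder):

#include <linux/interrupt.h>
#include <linux/platform_device.h>

static irqreturn_t example_onkey_irq(int irq, void *data)
{
	/* Read the event registers and report the key press here. */
	return IRQ_HANDLED;
}

static int example_onkey_probe(struct platform_device *pdev)
{
	/* "ONKEY" matches the DEFINE_RES_IRQ_NAMED() entry in the cell. */
	int irq = platform_get_irq_byname(pdev, "ONKEY");

	if (irq < 0)
		return irq;

	return devm_request_threaded_irq(&pdev->dev, irq, NULL,
					 example_onkey_irq, IRQF_ONESHOT,
					 "example-onkey", NULL);
}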
Signed-off-by: Steve Twiss Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 5 +- drivers/mfd/da9062-core.c | 427 +++++++++++++++++++++++++++++++++-- include/linux/mfd/da9062/core.h | 29 ++- include/linux/mfd/da9062/registers.h | 5 +- 4 files changed, 443 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index ce3a9180a765..de68b5ba8741 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -267,13 +267,14 @@ config MFD_DA9055 called "da9055" config MFD_DA9062 - tristate "Dialog Semiconductor DA9062 PMIC Support" + tristate "Dialog Semiconductor DA9062/61 PMIC Support" select MFD_CORE select REGMAP_I2C select REGMAP_IRQ depends on I2C help - Say yes here for support for the Dialog Semiconductor DA9062 PMIC. + Say yes here for support for the Dialog Semiconductor DA9061 and + DA9062 PMICs. This includes the I2C driver and core APIs. Additional drivers must be enabled in order to use the functionality of the device. diff --git a/drivers/mfd/da9062-core.c b/drivers/mfd/da9062-core.c index 8f873866ea60..7f5e8be0a9ea 100644 --- a/drivers/mfd/da9062-core.c +++ b/drivers/mfd/da9062-core.c @@ -1,6 +1,6 @@ /* - * Core, IRQ and I2C device driver for DA9062 PMIC - * Copyright (C) 2015 Dialog Semiconductor Ltd. + * Core, IRQ and I2C device driver for DA9061 and DA9062 PMICs + * Copyright (C) 2015-2017 Dialog Semiconductor * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -30,6 +30,70 @@ #define DA9062_REG_EVENT_B_OFFSET 1 #define DA9062_REG_EVENT_C_OFFSET 2 +static struct regmap_irq da9061_irqs[] = { + /* EVENT A */ + [DA9061_IRQ_ONKEY] = { + .reg_offset = DA9062_REG_EVENT_A_OFFSET, + .mask = DA9062AA_M_NONKEY_MASK, + }, + [DA9061_IRQ_WDG_WARN] = { + .reg_offset = DA9062_REG_EVENT_A_OFFSET, + .mask = DA9062AA_M_WDG_WARN_MASK, + }, + [DA9061_IRQ_SEQ_RDY] = { + .reg_offset = DA9062_REG_EVENT_A_OFFSET, + .mask = DA9062AA_M_SEQ_RDY_MASK, + }, + /* EVENT B */ + [DA9061_IRQ_TEMP] = { + .reg_offset = DA9062_REG_EVENT_B_OFFSET, + .mask = DA9062AA_M_TEMP_MASK, + }, + [DA9061_IRQ_LDO_LIM] = { + .reg_offset = DA9062_REG_EVENT_B_OFFSET, + .mask = DA9062AA_M_LDO_LIM_MASK, + }, + [DA9061_IRQ_DVC_RDY] = { + .reg_offset = DA9062_REG_EVENT_B_OFFSET, + .mask = DA9062AA_M_DVC_RDY_MASK, + }, + [DA9061_IRQ_VDD_WARN] = { + .reg_offset = DA9062_REG_EVENT_B_OFFSET, + .mask = DA9062AA_M_VDD_WARN_MASK, + }, + /* EVENT C */ + [DA9061_IRQ_GPI0] = { + .reg_offset = DA9062_REG_EVENT_C_OFFSET, + .mask = DA9062AA_M_GPI0_MASK, + }, + [DA9061_IRQ_GPI1] = { + .reg_offset = DA9062_REG_EVENT_C_OFFSET, + .mask = DA9062AA_M_GPI1_MASK, + }, + [DA9061_IRQ_GPI2] = { + .reg_offset = DA9062_REG_EVENT_C_OFFSET, + .mask = DA9062AA_M_GPI2_MASK, + }, + [DA9061_IRQ_GPI3] = { + .reg_offset = DA9062_REG_EVENT_C_OFFSET, + .mask = DA9062AA_M_GPI3_MASK, + }, + [DA9061_IRQ_GPI4] = { + .reg_offset = DA9062_REG_EVENT_C_OFFSET, + .mask = DA9062AA_M_GPI4_MASK, + }, +}; + +static struct regmap_irq_chip da9061_irq_chip = { + .name = "da9061-irq", + .irqs = da9061_irqs, + .num_irqs = DA9061_NUM_IRQ, + .num_regs = 3, + .status_base = DA9062AA_EVENT_A, + .mask_base = DA9062AA_IRQ_MASK_A, + .ack_base = DA9062AA_EVENT_A, +}; + static struct regmap_irq da9062_irqs[] = { /* EVENT A */ [DA9062_IRQ_ONKEY] = { @@ -102,6 +166,57 @@ static struct regmap_irq_chip da9062_irq_chip = { .ack_base = DA9062AA_EVENT_A, }; +static struct resource da9061_core_resources[] = { + DEFINE_RES_IRQ_NAMED(DA9061_IRQ_VDD_WARN, "VDD_WARN"), 
+}; + +static struct resource da9061_regulators_resources[] = { + DEFINE_RES_IRQ_NAMED(DA9061_IRQ_LDO_LIM, "LDO_LIM"), +}; + +static struct resource da9061_thermal_resources[] = { + DEFINE_RES_IRQ_NAMED(DA9061_IRQ_TEMP, "THERMAL"), +}; + +static struct resource da9061_wdt_resources[] = { + DEFINE_RES_IRQ_NAMED(DA9061_IRQ_WDG_WARN, "WD_WARN"), +}; + +static struct resource da9061_onkey_resources[] = { + DEFINE_RES_IRQ_NAMED(DA9061_IRQ_ONKEY, "ONKEY"), +}; + +static const struct mfd_cell da9061_devs[] = { + { + .name = "da9061-core", + .num_resources = ARRAY_SIZE(da9061_core_resources), + .resources = da9061_core_resources, + }, + { + .name = "da9062-regulators", + .num_resources = ARRAY_SIZE(da9061_regulators_resources), + .resources = da9061_regulators_resources, + }, + { + .name = "da9061-watchdog", + .num_resources = ARRAY_SIZE(da9061_wdt_resources), + .resources = da9061_wdt_resources, + .of_compatible = "dlg,da9061-watchdog", + }, + { + .name = "da9061-thermal", + .num_resources = ARRAY_SIZE(da9061_thermal_resources), + .resources = da9061_thermal_resources, + .of_compatible = "dlg,da9061-thermal", + }, + { + .name = "da9061-onkey", + .num_resources = ARRAY_SIZE(da9061_onkey_resources), + .resources = da9061_onkey_resources, + .of_compatible = "dlg,da9061-onkey", + }, +}; + static struct resource da9062_core_resources[] = { DEFINE_RES_NAMED(DA9062_IRQ_VDD_WARN, 1, "VDD_WARN", IORESOURCE_IRQ), }; @@ -200,7 +315,8 @@ static int da9062_clear_fault_log(struct da9062 *chip) static int da9062_get_device_type(struct da9062 *chip) { - int device_id, variant_id, variant_mrc; + int device_id, variant_id, variant_mrc, variant_vrc; + char *type; int ret; ret = regmap_read(chip->regmap, DA9062AA_DEVICE_ID, &device_id); @@ -219,9 +335,23 @@ static int da9062_get_device_type(struct da9062 *chip) return -EIO; } + variant_vrc = (variant_id & DA9062AA_VRC_MASK) >> DA9062AA_VRC_SHIFT; + + switch (variant_vrc) { + case DA9062_PMIC_VARIANT_VRC_DA9061: + type = "DA9061"; + break; + case DA9062_PMIC_VARIANT_VRC_DA9062: + type = "DA9062"; + break; + default: + type = "Unknown"; + break; + } + dev_info(chip->dev, - "Device detected (device-ID: 0x%02X, var-ID: 0x%02X)\n", - device_id, variant_id); + "Device detected (device-ID: 0x%02X, var-ID: 0x%02X, %s)\n", + device_id, variant_id, type); variant_mrc = (variant_id & DA9062AA_MRC_MASK) >> DA9062AA_MRC_SHIFT; @@ -234,6 +364,234 @@ static int da9062_get_device_type(struct da9062 *chip) return ret; } +static const struct regmap_range da9061_aa_readable_ranges[] = { + { + .range_min = DA9062AA_PAGE_CON, + .range_max = DA9062AA_STATUS_B, + }, { + .range_min = DA9062AA_STATUS_D, + .range_max = DA9062AA_EVENT_C, + }, { + .range_min = DA9062AA_IRQ_MASK_A, + .range_max = DA9062AA_IRQ_MASK_C, + }, { + .range_min = DA9062AA_CONTROL_A, + .range_max = DA9062AA_GPIO_4, + }, { + .range_min = DA9062AA_GPIO_WKUP_MODE, + .range_max = DA9062AA_GPIO_OUT3_4, + }, { + .range_min = DA9062AA_BUCK1_CONT, + .range_max = DA9062AA_BUCK4_CONT, + }, { + .range_min = DA9062AA_BUCK3_CONT, + .range_max = DA9062AA_BUCK3_CONT, + }, { + .range_min = DA9062AA_LDO1_CONT, + .range_max = DA9062AA_LDO4_CONT, + }, { + .range_min = DA9062AA_DVC_1, + .range_max = DA9062AA_DVC_1, + }, { + .range_min = DA9062AA_SEQ, + .range_max = DA9062AA_ID_4_3, + }, { + .range_min = DA9062AA_ID_12_11, + .range_max = DA9062AA_ID_16_15, + }, { + .range_min = DA9062AA_ID_22_21, + .range_max = DA9062AA_ID_32_31, + }, { + .range_min = DA9062AA_SEQ_A, + .range_max = DA9062AA_WAIT, + }, { + .range_min = DA9062AA_RESET, + 
.range_max = DA9062AA_BUCK_ILIM_C, + }, { + .range_min = DA9062AA_BUCK1_CFG, + .range_max = DA9062AA_BUCK3_CFG, + }, { + .range_min = DA9062AA_VBUCK1_A, + .range_max = DA9062AA_VBUCK4_A, + }, { + .range_min = DA9062AA_VBUCK3_A, + .range_max = DA9062AA_VBUCK3_A, + }, { + .range_min = DA9062AA_VLDO1_A, + .range_max = DA9062AA_VLDO4_A, + }, { + .range_min = DA9062AA_VBUCK1_B, + .range_max = DA9062AA_VBUCK4_B, + }, { + .range_min = DA9062AA_VBUCK3_B, + .range_max = DA9062AA_VBUCK3_B, + }, { + .range_min = DA9062AA_VLDO1_B, + .range_max = DA9062AA_VLDO4_B, + }, { + .range_min = DA9062AA_BBAT_CONT, + .range_max = DA9062AA_BBAT_CONT, + }, { + .range_min = DA9062AA_INTERFACE, + .range_max = DA9062AA_CONFIG_E, + }, { + .range_min = DA9062AA_CONFIG_G, + .range_max = DA9062AA_CONFIG_K, + }, { + .range_min = DA9062AA_CONFIG_M, + .range_max = DA9062AA_CONFIG_M, + }, { + .range_min = DA9062AA_GP_ID_0, + .range_max = DA9062AA_GP_ID_19, + }, { + .range_min = DA9062AA_DEVICE_ID, + .range_max = DA9062AA_CONFIG_ID, + }, +}; + +static const struct regmap_range da9061_aa_writeable_ranges[] = { + { + .range_min = DA9062AA_PAGE_CON, + .range_max = DA9062AA_PAGE_CON, + }, { + .range_min = DA9062AA_FAULT_LOG, + .range_max = DA9062AA_EVENT_C, + }, { + .range_min = DA9062AA_IRQ_MASK_A, + .range_max = DA9062AA_IRQ_MASK_C, + }, { + .range_min = DA9062AA_CONTROL_A, + .range_max = DA9062AA_GPIO_4, + }, { + .range_min = DA9062AA_GPIO_WKUP_MODE, + .range_max = DA9062AA_GPIO_OUT3_4, + }, { + .range_min = DA9062AA_BUCK1_CONT, + .range_max = DA9062AA_BUCK4_CONT, + }, { + .range_min = DA9062AA_BUCK3_CONT, + .range_max = DA9062AA_BUCK3_CONT, + }, { + .range_min = DA9062AA_LDO1_CONT, + .range_max = DA9062AA_LDO4_CONT, + }, { + .range_min = DA9062AA_DVC_1, + .range_max = DA9062AA_DVC_1, + }, { + .range_min = DA9062AA_SEQ, + .range_max = DA9062AA_ID_4_3, + }, { + .range_min = DA9062AA_ID_12_11, + .range_max = DA9062AA_ID_16_15, + }, { + .range_min = DA9062AA_ID_22_21, + .range_max = DA9062AA_ID_32_31, + }, { + .range_min = DA9062AA_SEQ_A, + .range_max = DA9062AA_WAIT, + }, { + .range_min = DA9062AA_RESET, + .range_max = DA9062AA_BUCK_ILIM_C, + }, { + .range_min = DA9062AA_BUCK1_CFG, + .range_max = DA9062AA_BUCK3_CFG, + }, { + .range_min = DA9062AA_VBUCK1_A, + .range_max = DA9062AA_VBUCK4_A, + }, { + .range_min = DA9062AA_VBUCK3_A, + .range_max = DA9062AA_VBUCK3_A, + }, { + .range_min = DA9062AA_VLDO1_A, + .range_max = DA9062AA_VLDO4_A, + }, { + .range_min = DA9062AA_VBUCK1_B, + .range_max = DA9062AA_VBUCK4_B, + }, { + .range_min = DA9062AA_VBUCK3_B, + .range_max = DA9062AA_VBUCK3_B, + }, { + .range_min = DA9062AA_VLDO1_B, + .range_max = DA9062AA_VLDO4_B, + }, { + .range_min = DA9062AA_BBAT_CONT, + .range_max = DA9062AA_BBAT_CONT, + }, { + .range_min = DA9062AA_GP_ID_0, + .range_max = DA9062AA_GP_ID_19, + }, +}; + +static const struct regmap_range da9061_aa_volatile_ranges[] = { + { + .range_min = DA9062AA_PAGE_CON, + .range_max = DA9062AA_STATUS_B, + }, { + .range_min = DA9062AA_STATUS_D, + .range_max = DA9062AA_EVENT_C, + }, { + .range_min = DA9062AA_CONTROL_A, + .range_max = DA9062AA_CONTROL_B, + }, { + .range_min = DA9062AA_CONTROL_E, + .range_max = DA9062AA_CONTROL_F, + }, { + .range_min = DA9062AA_BUCK1_CONT, + .range_max = DA9062AA_BUCK4_CONT, + }, { + .range_min = DA9062AA_BUCK3_CONT, + .range_max = DA9062AA_BUCK3_CONT, + }, { + .range_min = DA9062AA_LDO1_CONT, + .range_max = DA9062AA_LDO4_CONT, + }, { + .range_min = DA9062AA_DVC_1, + .range_max = DA9062AA_DVC_1, + }, { + .range_min = DA9062AA_SEQ, + .range_max = 
DA9062AA_SEQ, + }, +}; + +static const struct regmap_access_table da9061_aa_readable_table = { + .yes_ranges = da9061_aa_readable_ranges, + .n_yes_ranges = ARRAY_SIZE(da9061_aa_readable_ranges), +}; + +static const struct regmap_access_table da9061_aa_writeable_table = { + .yes_ranges = da9061_aa_writeable_ranges, + .n_yes_ranges = ARRAY_SIZE(da9061_aa_writeable_ranges), +}; + +static const struct regmap_access_table da9061_aa_volatile_table = { + .yes_ranges = da9061_aa_volatile_ranges, + .n_yes_ranges = ARRAY_SIZE(da9061_aa_volatile_ranges), +}; + +static const struct regmap_range_cfg da9061_range_cfg[] = { + { + .range_min = DA9062AA_PAGE_CON, + .range_max = DA9062AA_CONFIG_ID, + .selector_reg = DA9062AA_PAGE_CON, + .selector_mask = 1 << DA9062_I2C_PAGE_SEL_SHIFT, + .selector_shift = DA9062_I2C_PAGE_SEL_SHIFT, + .window_start = 0, + .window_len = 256, + } +}; + +static struct regmap_config da9061_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .ranges = da9061_range_cfg, + .num_ranges = ARRAY_SIZE(da9061_range_cfg), + .max_register = DA9062AA_CONFIG_ID, + .cache_type = REGCACHE_RBTREE, + .rd_table = &da9061_aa_readable_table, + .wr_table = &da9061_aa_writeable_table, + .volatile_table = &da9061_aa_volatile_table, +}; + static const struct regmap_range da9062_aa_readable_ranges[] = { { .range_min = DA9062AA_PAGE_CON, @@ -456,17 +814,39 @@ static struct regmap_config da9062_regmap_config = { .volatile_table = &da9062_aa_volatile_table, }; +static const struct of_device_id da9062_dt_ids[] = { + { .compatible = "dlg,da9061", .data = (void *)COMPAT_TYPE_DA9061, }, + { .compatible = "dlg,da9062", .data = (void *)COMPAT_TYPE_DA9062, }, + { } +}; +MODULE_DEVICE_TABLE(of, da9062_dt_ids); + static int da9062_i2c_probe(struct i2c_client *i2c, const struct i2c_device_id *id) { struct da9062 *chip; + const struct of_device_id *match; unsigned int irq_base; + const struct mfd_cell *cell; + const struct regmap_irq_chip *irq_chip; + const struct regmap_config *config; + int cell_num; int ret; chip = devm_kzalloc(&i2c->dev, sizeof(*chip), GFP_KERNEL); if (!chip) return -ENOMEM; + if (i2c->dev.of_node) { + match = of_match_node(da9062_dt_ids, i2c->dev.of_node); + if (!match) + return -EINVAL; + + chip->chip_type = (uintptr_t)match->data; + } else { + chip->chip_type = id->driver_data; + } + i2c_set_clientdata(i2c, chip); chip->dev = &i2c->dev; @@ -475,7 +855,25 @@ static int da9062_i2c_probe(struct i2c_client *i2c, return -EINVAL; } - chip->regmap = devm_regmap_init_i2c(i2c, &da9062_regmap_config); + switch (chip->chip_type) { + case COMPAT_TYPE_DA9061: + cell = da9061_devs; + cell_num = ARRAY_SIZE(da9061_devs); + irq_chip = &da9061_irq_chip; + config = &da9061_regmap_config; + break; + case COMPAT_TYPE_DA9062: + cell = da9062_devs; + cell_num = ARRAY_SIZE(da9062_devs); + irq_chip = &da9062_irq_chip; + config = &da9062_regmap_config; + break; + default: + dev_err(chip->dev, "Unrecognised chip type\n"); + return -ENODEV; + } + + chip->regmap = devm_regmap_init_i2c(i2c, config); if (IS_ERR(chip->regmap)) { ret = PTR_ERR(chip->regmap); dev_err(chip->dev, "Failed to allocate register map: %d\n", @@ -493,7 +891,7 @@ static int da9062_i2c_probe(struct i2c_client *i2c, ret = regmap_add_irq_chip(chip->regmap, i2c->irq, IRQF_TRIGGER_LOW | IRQF_ONESHOT | IRQF_SHARED, - -1, &da9062_irq_chip, + -1, irq_chip, &chip->regmap_irq); if (ret) { dev_err(chip->dev, "Failed to request IRQ %d: %d\n", @@ -503,8 +901,8 @@ static int da9062_i2c_probe(struct i2c_client *i2c, irq_base = 
regmap_irq_chip_get_base(chip->regmap_irq); - ret = mfd_add_devices(chip->dev, PLATFORM_DEVID_NONE, da9062_devs, - ARRAY_SIZE(da9062_devs), NULL, irq_base, + ret = mfd_add_devices(chip->dev, PLATFORM_DEVID_NONE, cell, + cell_num, NULL, irq_base, NULL); if (ret) { dev_err(chip->dev, "Cannot register child devices\n"); @@ -526,17 +924,12 @@ static int da9062_i2c_remove(struct i2c_client *i2c) } static const struct i2c_device_id da9062_i2c_id[] = { - { "da9062", 0 }, + { "da9061", COMPAT_TYPE_DA9061 }, + { "da9062", COMPAT_TYPE_DA9062 }, { }, }; MODULE_DEVICE_TABLE(i2c, da9062_i2c_id); -static const struct of_device_id da9062_dt_ids[] = { - { .compatible = "dlg,da9062", }, - { } -}; -MODULE_DEVICE_TABLE(of, da9062_dt_ids); - static struct i2c_driver da9062_i2c_driver = { .driver = { .name = "da9062", @@ -549,6 +942,6 @@ static struct i2c_driver da9062_i2c_driver = { module_i2c_driver(da9062_i2c_driver); -MODULE_DESCRIPTION("Core device driver for Dialog DA9062"); +MODULE_DESCRIPTION("Core device driver for Dialog DA9061 and DA9062"); MODULE_AUTHOR("Steve Twiss "); MODULE_LICENSE("GPL"); diff --git a/include/linux/mfd/da9062/core.h b/include/linux/mfd/da9062/core.h index 376ba84366a0..74d33a01ddae 100644 --- a/include/linux/mfd/da9062/core.h +++ b/include/linux/mfd/da9062/core.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2015 Dialog Semiconductor Ltd. + * Copyright (C) 2015-2017 Dialog Semiconductor * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -18,7 +18,31 @@ #include #include -/* Interrupts */ +enum da9062_compatible_types { + COMPAT_TYPE_DA9061 = 1, + COMPAT_TYPE_DA9062, +}; + +enum da9061_irqs { + /* IRQ A */ + DA9061_IRQ_ONKEY, + DA9061_IRQ_WDG_WARN, + DA9061_IRQ_SEQ_RDY, + /* IRQ B*/ + DA9061_IRQ_TEMP, + DA9061_IRQ_LDO_LIM, + DA9061_IRQ_DVC_RDY, + DA9061_IRQ_VDD_WARN, + /* IRQ C */ + DA9061_IRQ_GPI0, + DA9061_IRQ_GPI1, + DA9061_IRQ_GPI2, + DA9061_IRQ_GPI3, + DA9061_IRQ_GPI4, + + DA9061_NUM_IRQ, +}; + enum da9062_irqs { /* IRQ A */ DA9062_IRQ_ONKEY, @@ -45,6 +69,7 @@ struct da9062 { struct device *dev; struct regmap *regmap; struct regmap_irq_chip_data *regmap_irq; + enum da9062_compatible_types chip_type; }; #endif /* __MFD_DA9062_CORE_H__ */ diff --git a/include/linux/mfd/da9062/registers.h b/include/linux/mfd/da9062/registers.h index 97790d1b02c5..18d576aed902 100644 --- a/include/linux/mfd/da9062/registers.h +++ b/include/linux/mfd/da9062/registers.h @@ -1,6 +1,5 @@ /* - * registers.h - REGISTERS H for DA9062 - * Copyright (C) 2015 Dialog Semiconductor Ltd. + * Copyright (C) 2015-2017 Dialog Semiconductor * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -18,6 +17,8 @@ #define DA9062_PMIC_DEVICE_ID 0x62 #define DA9062_PMIC_VARIANT_MRC_AA 0x01 +#define DA9062_PMIC_VARIANT_VRC_DA9061 0x01 +#define DA9062_PMIC_VARIANT_VRC_DA9062 0x02 #define DA9062_I2C_PAGE_SEL_SHIFT 1 -- cgit v1.2.3 From addebf1588ab812b891651ef5fba194659f71ea5 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Thu, 23 Mar 2017 09:03:24 +0100 Subject: mfd: exynos-lpass: Remove pad retention control Pad retention should be controlled from pin control driver, so remove it from Exynos LPASS driver. After this change, no more access to PMU regmap is needed, so remove also the code for handling PMU regmap. 
Signed-off-by: Marek Szyprowski Acked-by: Krzysztof Kozlowski Acked-by: Sylwester Nawrocki Acked-by: Rob Herring Acked-for-MFD-by: Lee Jones Signed-off-by: Lee Jones --- .../devicetree/bindings/mfd/samsung,exynos5433-lpass.txt | 2 -- drivers/mfd/exynos-lpass.c | 16 ---------------- include/linux/mfd/syscon/exynos5-pmu.h | 3 --- 3 files changed, 21 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/mfd/samsung,exynos5433-lpass.txt b/Documentation/devicetree/bindings/mfd/samsung,exynos5433-lpass.txt index c110e118b79f..a8deaee82c44 100644 --- a/Documentation/devicetree/bindings/mfd/samsung,exynos5433-lpass.txt +++ b/Documentation/devicetree/bindings/mfd/samsung,exynos5433-lpass.txt @@ -5,7 +5,6 @@ Required properties: - compatible : "samsung,exynos5433-lpass" - reg : should contain the LPASS top SFR region location and size - - samsung,pmu-syscon : the phandle to the Power Management Unit node - #address-cells : should be 1 - #size-cells : should be 1 - ranges : must be present @@ -25,7 +24,6 @@ Example: audio-subsystem { compatible = "samsung,exynos5433-lpass"; reg = <0x11400000 0x100>, <0x11500000 0x08>; - samsung,pmu-syscon = <&pmu_system_controller>; #address-cells = <1>; #size-cells = <1>; ranges; diff --git a/drivers/mfd/exynos-lpass.c b/drivers/mfd/exynos-lpass.c index 8bebad92a385..39be39bbefc4 100644 --- a/drivers/mfd/exynos-lpass.c +++ b/drivers/mfd/exynos-lpass.c @@ -51,8 +51,6 @@ #define LPASS_INTR_SFR BIT(0) struct exynos_lpass { - /* pointer to the Power Management Unit regmap */ - struct regmap *pmu; /* pointer to the LPASS TOP regmap */ struct regmap *top; }; @@ -81,10 +79,6 @@ static void exynos_lpass_enable(struct exynos_lpass *lpass) regmap_write(lpass->top, SFR_LPASS_INTR_CPU_MASK, LPASS_INTR_SFR | LPASS_INTR_DMA | LPASS_INTR_I2S); - /* Activate related PADs from retention state */ - regmap_write(lpass->pmu, EXYNOS5433_PAD_RETENTION_AUD_OPTION, - EXYNOS_WAKEUP_FROM_LOWPWR); - exynos_lpass_core_sw_reset(lpass, LPASS_I2S_SW_RESET); exynos_lpass_core_sw_reset(lpass, LPASS_DMA_SW_RESET); exynos_lpass_core_sw_reset(lpass, LPASS_MEM_SW_RESET); @@ -95,9 +89,6 @@ static void exynos_lpass_disable(struct exynos_lpass *lpass) /* Mask any unmasked IP interrupt sources */ regmap_write(lpass->top, SFR_LPASS_INTR_CPU_MASK, 0); regmap_write(lpass->top, SFR_LPASS_INTR_CA5_MASK, 0); - - /* Deactivate related PADs from retention state */ - regmap_write(lpass->pmu, EXYNOS5433_PAD_RETENTION_AUD_OPTION, 0); } static const struct regmap_config exynos_lpass_reg_conf = { @@ -131,13 +122,6 @@ static int exynos_lpass_probe(struct platform_device *pdev) return PTR_ERR(lpass->top); } - lpass->pmu = syscon_regmap_lookup_by_phandle(dev->of_node, - "samsung,pmu-syscon"); - if (IS_ERR(lpass->pmu)) { - dev_err(dev, "Failed to lookup PMU regmap\n"); - return PTR_ERR(lpass->pmu); - } - platform_set_drvdata(pdev, lpass); exynos_lpass_enable(lpass); diff --git a/include/linux/mfd/syscon/exynos5-pmu.h b/include/linux/mfd/syscon/exynos5-pmu.h index c28ff21ca4d2..0622ae86f9db 100644 --- a/include/linux/mfd/syscon/exynos5-pmu.h +++ b/include/linux/mfd/syscon/exynos5-pmu.h @@ -46,7 +46,4 @@ #define EXYNOS5_MIPI_PHY_S_RESETN BIT(1) #define EXYNOS5_MIPI_PHY_M_RESETN BIT(2) -#define EXYNOS5433_PAD_RETENTION_AUD_OPTION (0x3028) -#define EXYNOS5433_PAD_INITIATE_WAKEUP_FROM_LOWPWR BIT(28) - #endif /* _LINUX_MFD_SYSCON_PMU_EXYNOS5_H_ */ -- cgit v1.2.3 From 80f18379a7c350c011d30332658aa15fe49a8fa5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Apr 2017 09:42:24 +0200 
Subject: fs: add a VALID_OPEN_FLAGS

Add a central define for all valid open flags, and use it in the
uniqueness check.

Signed-off-by: Christoph Hellwig
Signed-off-by: Al Viro
---
 fs/fcntl.c            | 14 ++++----------
 include/linux/fcntl.h |  6 ++++++
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index be8fbe289087..de1b16bb6a29 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -742,16 +742,10 @@ static int __init fcntl_init(void)
 	 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
 	 * is defined as O_NONBLOCK on some platforms and not on others.
 	 */
-	BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
-		O_RDONLY | O_WRONLY | O_RDWR |
-		O_CREAT | O_EXCL | O_NOCTTY |
-		O_TRUNC | O_APPEND | /* O_NONBLOCK | */
-		__O_SYNC | O_DSYNC | FASYNC |
-		O_DIRECT | O_LARGEFILE | O_DIRECTORY |
-		O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
-		__FMODE_EXEC | O_PATH | __O_TMPFILE |
-		__FMODE_NONOTIFY
-		));
+	BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
+		HWEIGHT32(
+			(VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
+			__FMODE_EXEC | __FMODE_NONOTIFY));

 	fasync_cache = kmem_cache_create("fasync_cache",
 		sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index 76ce329e656d..1b48d9c9a561 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -3,6 +3,12 @@

 #include

+/* list of all valid flags for the open/openat flags argument: */
+#define VALID_OPEN_FLAGS \
+	(O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | \
+	 O_APPEND | O_NDELAY | O_NONBLOCK | O_NDELAY | __O_SYNC | O_DSYNC | \
+	 FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \
+	 O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE)

 #ifndef force_o_largefile
 #define force_o_largefile()	(BITS_PER_LONG != 32)
-- cgit v1.2.3

From e8245c1b1a3bb8474f91c69ccd13637d3589bb2c Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Wed, 26 Apr 2017 15:34:06 +0200
Subject: iommu: Include device.h in iommu.h

We make use of 'struct device' in iommu.h, so include device.h to make
it available explicitly. Re-order the other headers while at it.

Signed-off-by: Joerg Roedel
---
 include/linux/iommu.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 6a6de187ddc0..3b4fe4b79d20 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -19,11 +19,13 @@
 #ifndef __LINUX_IOMMU_H
 #define __LINUX_IOMMU_H

+#include
+#include
+#include
 #include
 #include
 #include
-#include
-#include
+
 #include

 #define IOMMU_READ	(1 << 0)
-- cgit v1.2.3

From 207c6e36f122ebb1164d611c9f34f128313f47d5 Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Wed, 26 Apr 2017 15:39:28 +0200
Subject: iommu: Move report_iommu_fault() to iommu.c

The function is not in any fast path, so there is no need for it to be
static inline in a header file. This also removes the need to include
iommu trace-points in iommu.h.
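For context, report_iommu_fault() only does something beyond tracing
when a higher-level user has installed a handler on the domain with
iommu_set_fault_handler(). A minimal sketch of such a user (illustrative
only, not part of this patch; the log message and handler names are
made up):

#include <linux/iommu.h>

static int example_fault_handler(struct iommu_domain *domain,
				 struct device *dev, unsigned long iova,
				 int flags, void *token)
{
	dev_err(dev, "iommu fault at iova 0x%lx (flags 0x%x)\n", iova, flags);

	/* -ENOSYS keeps the IOMMU driver's default fault behaviour. */
	return -ENOSYS;
}

static void example_install_handler(struct iommu_domain *domain)
{
	iommu_set_fault_handler(domain, example_fault_handler, NULL);
}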
Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 42 ++++++++++++++++++++++++++++++++++++++++++ include/linux/iommu.h | 41 ++--------------------------------------- 2 files changed, 44 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 9170fd498f46..4cb5792cbc21 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1655,6 +1655,48 @@ void iommu_domain_window_disable(struct iommu_domain *domain, u32 wnd_nr) } EXPORT_SYMBOL_GPL(iommu_domain_window_disable); +/** + * report_iommu_fault() - report about an IOMMU fault to the IOMMU framework + * @domain: the iommu domain where the fault has happened + * @dev: the device where the fault has happened + * @iova: the faulting address + * @flags: mmu fault flags (e.g. IOMMU_FAULT_READ/IOMMU_FAULT_WRITE/...) + * + * This function should be called by the low-level IOMMU implementations + * whenever IOMMU faults happen, to allow high-level users, that are + * interested in such events, to know about them. + * + * This event may be useful for several possible use cases: + * - mere logging of the event + * - dynamic TLB/PTE loading + * - if restarting of the faulting device is required + * + * Returns 0 on success and an appropriate error code otherwise (if dynamic + * PTE/TLB loading will one day be supported, implementations will be able + * to tell whether it succeeded or not according to this return value). + * + * Specifically, -ENOSYS is returned if a fault handler isn't installed + * (though fault handlers can also return -ENOSYS, in case they want to + * elicit the default behavior of the IOMMU drivers). + */ +int report_iommu_fault(struct iommu_domain *domain, struct device *dev, + unsigned long iova, int flags) +{ + int ret = -ENOSYS; + + /* + * if upper layers showed interest and installed a fault handler, + * invoke it. + */ + if (domain->handler) + ret = domain->handler(domain, dev, iova, flags, + domain->handler_token); + + trace_io_page_fault(dev, iova, flags); + return ret; +} +EXPORT_SYMBOL_GPL(report_iommu_fault); + static int __init iommu_init(void) { iommu_group_kset = kset_create_and_add("iommu_groups", diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3b4fe4b79d20..abaa0ca848bc 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -330,46 +330,9 @@ extern int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t offset, u64 size, int prot); extern void iommu_domain_window_disable(struct iommu_domain *domain, u32 wnd_nr); -/** - * report_iommu_fault() - report about an IOMMU fault to the IOMMU framework - * @domain: the iommu domain where the fault has happened - * @dev: the device where the fault has happened - * @iova: the faulting address - * @flags: mmu fault flags (e.g. IOMMU_FAULT_READ/IOMMU_FAULT_WRITE/...) - * - * This function should be called by the low-level IOMMU implementations - * whenever IOMMU faults happen, to allow high-level users, that are - * interested in such events, to know about them. - * - * This event may be useful for several possible use cases: - * - mere logging of the event - * - dynamic TLB/PTE loading - * - if restarting of the faulting device is required - * - * Returns 0 on success and an appropriate error code otherwise (if dynamic - * PTE/TLB loading will one day be supported, implementations will be able - * to tell whether it succeeded or not according to this return value). 
- * - * Specifically, -ENOSYS is returned if a fault handler isn't installed - * (though fault handlers can also return -ENOSYS, in case they want to - * elicit the default behavior of the IOMMU drivers). - */ -static inline int report_iommu_fault(struct iommu_domain *domain, - struct device *dev, unsigned long iova, int flags) -{ - int ret = -ENOSYS; - /* - * if upper layers showed interest and installed a fault handler, - * invoke it. - */ - if (domain->handler) - ret = domain->handler(domain, dev, iova, flags, - domain->handler_token); - - trace_io_page_fault(dev, iova, flags); - return ret; -} +extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, + unsigned long iova, int flags); static inline size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg, -- cgit v1.2.3 From 1578353e05cd23b10a9e5e8d1626e5bd0849d873 Mon Sep 17 00:00:00 2001 From: Icenowy Zheng Date: Mon, 17 Apr 2017 19:57:40 +0800 Subject: mfd: axp20x: Support AXP803 variant AXP803 is a new PMIC chip produced by X-Powers, usually paired with A64 via RSB bus. The PMIC itself is like AXP288, but with RSB support and dedicated VBUS and ACIN. Add support for it in the axp20x mfd driver. Currently only power key function is supported. Signed-off-by: Icenowy Zheng Signed-off-by: Lee Jones --- drivers/mfd/axp20x-rsb.c | 1 + drivers/mfd/axp20x.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/axp20x.h | 40 ++++++++++++++++++++++- 3 files changed, 119 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mfd/axp20x-rsb.c b/drivers/mfd/axp20x-rsb.c index a732cb50bcff..fd5c7267b136 100644 --- a/drivers/mfd/axp20x-rsb.c +++ b/drivers/mfd/axp20x-rsb.c @@ -61,6 +61,7 @@ static int axp20x_rsb_remove(struct sunxi_rsb_device *rdev) static const struct of_device_id axp20x_rsb_of_match[] = { { .compatible = "x-powers,axp223", .data = (void *)AXP223_ID }, + { .compatible = "x-powers,axp803", .data = (void *)AXP803_ID }, { .compatible = "x-powers,axp806", .data = (void *)AXP806_ID }, { .compatible = "x-powers,axp809", .data = (void *)AXP809_ID }, { }, diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c index e6f55079876e..1dc6235778eb 100644 --- a/drivers/mfd/axp20x.c +++ b/drivers/mfd/axp20x.c @@ -41,6 +41,7 @@ static const char * const axp20x_model_names[] = { "AXP221", "AXP223", "AXP288", + "AXP803", "AXP806", "AXP809", }; @@ -117,6 +118,7 @@ static const struct regmap_access_table axp22x_volatile_table = { .n_yes_ranges = ARRAY_SIZE(axp22x_volatile_ranges), }; +/* AXP288 ranges are shared with the AXP803, as they cover the same range */ static const struct regmap_range axp288_writeable_ranges[] = { regmap_reg_range(AXP20X_DATACACHE(0), AXP20X_IRQ6_STATE), regmap_reg_range(AXP20X_DCDC_MODE, AXP288_FG_TUNE5), @@ -264,6 +266,20 @@ static struct resource axp288_fuel_gauge_resources[] = { }, }; +static struct resource axp803_pek_resources[] = { + { + .name = "PEK_DBR", + .start = AXP803_IRQ_PEK_RIS_EDGE, + .end = AXP803_IRQ_PEK_RIS_EDGE, + .flags = IORESOURCE_IRQ, + }, { + .name = "PEK_DBF", + .start = AXP803_IRQ_PEK_FAL_EDGE, + .end = AXP803_IRQ_PEK_FAL_EDGE, + .flags = IORESOURCE_IRQ, + }, +}; + static struct resource axp809_pek_resources[] = { { .name = "PEK_DBR", @@ -457,6 +473,43 @@ static const struct regmap_irq axp288_regmap_irqs[] = { INIT_REGMAP_IRQ(AXP288, BC_USB_CHNG, 5, 1), }; +static const struct regmap_irq axp803_regmap_irqs[] = { + INIT_REGMAP_IRQ(AXP803, ACIN_OVER_V, 0, 7), + INIT_REGMAP_IRQ(AXP803, ACIN_PLUGIN, 0, 6), 
+ INIT_REGMAP_IRQ(AXP803, ACIN_REMOVAL, 0, 5), + INIT_REGMAP_IRQ(AXP803, VBUS_OVER_V, 0, 4), + INIT_REGMAP_IRQ(AXP803, VBUS_PLUGIN, 0, 3), + INIT_REGMAP_IRQ(AXP803, VBUS_REMOVAL, 0, 2), + INIT_REGMAP_IRQ(AXP803, BATT_PLUGIN, 1, 7), + INIT_REGMAP_IRQ(AXP803, BATT_REMOVAL, 1, 6), + INIT_REGMAP_IRQ(AXP803, BATT_ENT_ACT_MODE, 1, 5), + INIT_REGMAP_IRQ(AXP803, BATT_EXIT_ACT_MODE, 1, 4), + INIT_REGMAP_IRQ(AXP803, CHARG, 1, 3), + INIT_REGMAP_IRQ(AXP803, CHARG_DONE, 1, 2), + INIT_REGMAP_IRQ(AXP803, BATT_CHG_TEMP_HIGH, 2, 7), + INIT_REGMAP_IRQ(AXP803, BATT_CHG_TEMP_HIGH_END, 2, 6), + INIT_REGMAP_IRQ(AXP803, BATT_CHG_TEMP_LOW, 2, 5), + INIT_REGMAP_IRQ(AXP803, BATT_CHG_TEMP_LOW_END, 2, 4), + INIT_REGMAP_IRQ(AXP803, BATT_ACT_TEMP_HIGH, 2, 3), + INIT_REGMAP_IRQ(AXP803, BATT_ACT_TEMP_HIGH_END, 2, 2), + INIT_REGMAP_IRQ(AXP803, BATT_ACT_TEMP_LOW, 2, 1), + INIT_REGMAP_IRQ(AXP803, BATT_ACT_TEMP_LOW_END, 2, 0), + INIT_REGMAP_IRQ(AXP803, DIE_TEMP_HIGH, 3, 7), + INIT_REGMAP_IRQ(AXP803, GPADC, 3, 2), + INIT_REGMAP_IRQ(AXP803, LOW_PWR_LVL1, 3, 1), + INIT_REGMAP_IRQ(AXP803, LOW_PWR_LVL2, 3, 0), + INIT_REGMAP_IRQ(AXP803, TIMER, 4, 7), + INIT_REGMAP_IRQ(AXP803, PEK_RIS_EDGE, 4, 6), + INIT_REGMAP_IRQ(AXP803, PEK_FAL_EDGE, 4, 5), + INIT_REGMAP_IRQ(AXP803, PEK_SHORT, 4, 4), + INIT_REGMAP_IRQ(AXP803, PEK_LONG, 4, 3), + INIT_REGMAP_IRQ(AXP803, PEK_OVER_OFF, 4, 2), + INIT_REGMAP_IRQ(AXP803, GPIO1_INPUT, 4, 1), + INIT_REGMAP_IRQ(AXP803, GPIO0_INPUT, 4, 0), + INIT_REGMAP_IRQ(AXP803, BC_USB_CHNG, 5, 1), + INIT_REGMAP_IRQ(AXP803, MV_CHNG, 5, 0), +}; + static const struct regmap_irq axp806_regmap_irqs[] = { INIT_REGMAP_IRQ(AXP806, DIE_TEMP_HIGH_LV1, 0, 0), INIT_REGMAP_IRQ(AXP806, DIE_TEMP_HIGH_LV2, 0, 1), @@ -557,6 +610,18 @@ static const struct regmap_irq_chip axp288_regmap_irq_chip = { }; +static const struct regmap_irq_chip axp803_regmap_irq_chip = { + .name = "axp803", + .status_base = AXP20X_IRQ1_STATE, + .ack_base = AXP20X_IRQ1_STATE, + .mask_base = AXP20X_IRQ1_EN, + .mask_invert = true, + .init_ack_masked = true, + .irqs = axp803_regmap_irqs, + .num_irqs = ARRAY_SIZE(axp803_regmap_irqs), + .num_regs = 6, +}; + static const struct regmap_irq_chip axp806_regmap_irq_chip = { .name = "axp806", .status_base = AXP20X_IRQ1_STATE, @@ -778,6 +843,14 @@ static struct mfd_cell axp288_cells[] = { }, }; +static struct mfd_cell axp803_cells[] = { + { + .name = "axp20x-pek", + .num_resources = ARRAY_SIZE(axp803_pek_resources), + .resources = axp803_pek_resources, + } +}; + static struct mfd_cell axp806_cells[] = { { .id = 2, @@ -864,6 +937,12 @@ int axp20x_match_device(struct axp20x_dev *axp20x) axp20x->regmap_irq_chip = &axp288_regmap_irq_chip; axp20x->irq_flags = IRQF_TRIGGER_LOW; break; + case AXP803_ID: + axp20x->nr_cells = ARRAY_SIZE(axp803_cells); + axp20x->cells = axp803_cells; + axp20x->regmap_cfg = &axp288_regmap_config; + axp20x->regmap_irq_chip = &axp803_regmap_irq_chip; + break; case AXP806_ID: axp20x->nr_cells = ARRAY_SIZE(axp806_cells); axp20x->cells = axp806_cells; diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h index dc8798cf2a24..cde56cfe8446 100644 --- a/include/linux/mfd/axp20x.h +++ b/include/linux/mfd/axp20x.h @@ -20,6 +20,7 @@ enum axp20x_variants { AXP221_ID, AXP223_ID, AXP288_ID, + AXP803_ID, AXP806_ID, AXP809_ID, NR_AXP20X_VARIANTS, @@ -234,7 +235,7 @@ enum axp20x_variants { #define AXP22X_TS_ADC_L 0x59 #define AXP22X_BATLOW_THRES1 0xe6 -/* AXP288 specific registers */ +/* AXP288/AXP803 specific registers */ #define AXP288_POWER_REASON 0x02 #define AXP288_BC_GLOBAL 0x2c #define 
AXP288_BC_VBUS_CNTL 0x2d @@ -475,6 +476,43 @@ enum axp288_irqs { AXP288_IRQ_BC_USB_CHNG, }; +enum axp803_irqs { + AXP803_IRQ_ACIN_OVER_V = 1, + AXP803_IRQ_ACIN_PLUGIN, + AXP803_IRQ_ACIN_REMOVAL, + AXP803_IRQ_VBUS_OVER_V, + AXP803_IRQ_VBUS_PLUGIN, + AXP803_IRQ_VBUS_REMOVAL, + AXP803_IRQ_BATT_PLUGIN, + AXP803_IRQ_BATT_REMOVAL, + AXP803_IRQ_BATT_ENT_ACT_MODE, + AXP803_IRQ_BATT_EXIT_ACT_MODE, + AXP803_IRQ_CHARG, + AXP803_IRQ_CHARG_DONE, + AXP803_IRQ_BATT_CHG_TEMP_HIGH, + AXP803_IRQ_BATT_CHG_TEMP_HIGH_END, + AXP803_IRQ_BATT_CHG_TEMP_LOW, + AXP803_IRQ_BATT_CHG_TEMP_LOW_END, + AXP803_IRQ_BATT_ACT_TEMP_HIGH, + AXP803_IRQ_BATT_ACT_TEMP_HIGH_END, + AXP803_IRQ_BATT_ACT_TEMP_LOW, + AXP803_IRQ_BATT_ACT_TEMP_LOW_END, + AXP803_IRQ_DIE_TEMP_HIGH, + AXP803_IRQ_GPADC, + AXP803_IRQ_LOW_PWR_LVL1, + AXP803_IRQ_LOW_PWR_LVL2, + AXP803_IRQ_TIMER, + AXP803_IRQ_PEK_RIS_EDGE, + AXP803_IRQ_PEK_FAL_EDGE, + AXP803_IRQ_PEK_SHORT, + AXP803_IRQ_PEK_LONG, + AXP803_IRQ_PEK_OVER_OFF, + AXP803_IRQ_GPIO1_INPUT, + AXP803_IRQ_GPIO0_INPUT, + AXP803_IRQ_BC_USB_CHNG, + AXP803_IRQ_MV_CHNG, +}; + enum axp806_irqs { AXP806_IRQ_DIE_TEMP_HIGH_LV1, AXP806_IRQ_DIE_TEMP_HIGH_LV2, -- cgit v1.2.3 From 5af50993850a48ba749b122173d789ea90976c72 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 5 Apr 2017 17:54:56 +1000 Subject: KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller This patch makes KVM capable of using the XIVE interrupt controller to provide the standard PAPR "XICS" style hypercalls. It is necessary for proper operations when the host uses XIVE natively. This has been lightly tested on an actual system, including PCI pass-through with a TG3 device. Signed-off-by: Benjamin Herrenschmidt [mpe: Cleanup pr_xxx(), unsplit pr_xxx() strings, etc., fix build failures by adding KVM_XIVE which depends on KVM_XICS and XIVE, and adding empty stubs for the kvm_xive_xxx() routines, fixup subject, integrate fixes from Paul for building PR=y HV=n] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_book3s_asm.h | 2 + arch/powerpc/include/asm/kvm_host.h | 28 +- arch/powerpc/include/asm/kvm_ppc.h | 74 ++ arch/powerpc/include/asm/xive.h | 9 +- arch/powerpc/kernel/asm-offsets.c | 10 + arch/powerpc/kvm/Kconfig | 5 + arch/powerpc/kvm/Makefile | 4 +- arch/powerpc/kvm/book3s.c | 75 +- arch/powerpc/kvm/book3s_hv.c | 51 +- arch/powerpc/kvm/book3s_hv_builtin.c | 103 ++ arch/powerpc/kvm/book3s_hv_rm_xics.c | 10 +- arch/powerpc/kvm/book3s_hv_rm_xive.c | 47 + arch/powerpc/kvm/book3s_hv_rmhandlers.S | 62 +- arch/powerpc/kvm/book3s_rtas.c | 21 +- arch/powerpc/kvm/book3s_xics.c | 35 +- arch/powerpc/kvm/book3s_xics.h | 7 + arch/powerpc/kvm/book3s_xive.c | 1893 +++++++++++++++++++++++++++++ arch/powerpc/kvm/book3s_xive.h | 256 ++++ arch/powerpc/kvm/book3s_xive_template.c | 503 ++++++++ arch/powerpc/kvm/irq.h | 1 + arch/powerpc/kvm/powerpc.c | 17 +- arch/powerpc/platforms/powernv/opal.c | 1 + arch/powerpc/sysdev/xive/common.c | 142 ++- arch/powerpc/sysdev/xive/native.c | 86 +- include/linux/kvm_host.h | 1 - virt/kvm/kvm_main.c | 4 - 26 files changed, 3358 insertions(+), 89 deletions(-) create mode 100644 arch/powerpc/kvm/book3s_hv_rm_xive.c create mode 100644 arch/powerpc/kvm/book3s_xive.c create mode 100644 arch/powerpc/kvm/book3s_xive.h create mode 100644 arch/powerpc/kvm/book3s_xive_template.c (limited to 'include/linux') diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index 0593d9479f74..b148496ffe36 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ 
b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -111,6 +111,8 @@ struct kvmppc_host_state { struct kvm_vcpu *kvm_vcpu; struct kvmppc_vcore *kvm_vcore; void __iomem *xics_phys; + void __iomem *xive_tima_phys; + void __iomem *xive_tima_virt; u32 saved_xirr; u64 dabr; u64 host_mmcr[7]; /* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 7bba8f415627..5a8ab4a758f1 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -205,6 +205,12 @@ struct kvmppc_spapr_tce_table { /* XICS components, defined in book3s_xics.c */ struct kvmppc_xics; struct kvmppc_icp; +extern struct kvm_device_ops kvm_xics_ops; + +/* XIVE components, defined in book3s_xive.c */ +struct kvmppc_xive; +struct kvmppc_xive_vcpu; +extern struct kvm_device_ops kvm_xive_ops; struct kvmppc_passthru_irqmap; @@ -293,6 +299,7 @@ struct kvm_arch { #endif #ifdef CONFIG_KVM_XICS struct kvmppc_xics *xics; + struct kvmppc_xive *xive; struct kvmppc_passthru_irqmap *pimap; #endif struct kvmppc_ops *kvm_ops; @@ -421,7 +428,7 @@ struct kvmppc_passthru_irqmap { #define KVMPPC_IRQ_DEFAULT 0 #define KVMPPC_IRQ_MPIC 1 -#define KVMPPC_IRQ_XICS 2 +#define KVMPPC_IRQ_XICS 2 /* Includes a XIVE option */ #define MMIO_HPTE_CACHE_SIZE 4 @@ -443,6 +450,21 @@ struct mmio_hpte_cache { struct openpic; +/* W0 and W1 of a XIVE thread management context */ +union xive_tma_w01 { + struct { + u8 nsr; + u8 cppr; + u8 ipb; + u8 lsmfb; + u8 ack; + u8 inc; + u8 age; + u8 pipr; + }; + __be64 w01; +}; + struct kvm_vcpu_arch { ulong host_stack; u32 host_pid; @@ -688,6 +710,10 @@ struct kvm_vcpu_arch { struct openpic *mpic; /* KVM_IRQ_MPIC */ #ifdef CONFIG_KVM_XICS struct kvmppc_icp *icp; /* XICS presentation controller */ + struct kvmppc_xive_vcpu *xive_vcpu; /* XIVE virtual CPU data */ + __be32 xive_cam_word; /* Cooked W2 in proper endian with valid bit */ + u32 xive_pushed; /* Is the VP pushed on the physical CPU ? 
*/ + union xive_tma_w01 xive_saved_state; /* W0..1 of XIVE thread state */ #endif #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index c3877992eff9..ed52b13d9ffb 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -225,6 +225,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq); extern int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp); extern int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu); extern void kvmppc_rtas_tokens_free(struct kvm *kvm); + extern int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority); extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, @@ -412,6 +413,14 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) paca[cpu].kvm_hstate.xics_phys = (void __iomem *)addr; } +static inline void kvmppc_set_xive_tima(int cpu, + unsigned long phys_addr, + void __iomem *virt_addr) +{ + paca[cpu].kvm_hstate.xive_tima_phys = (void __iomem *)phys_addr; + paca[cpu].kvm_hstate.xive_tima_virt = virt_addr; +} + static inline u32 kvmppc_get_xics_latch(void) { u32 xirr; @@ -442,6 +451,11 @@ static inline void __init kvm_cma_reserve(void) static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) {} +static inline void kvmppc_set_xive_tima(int cpu, + unsigned long phys_addr, + void __iomem *virt_addr) +{} + static inline u32 kvmppc_get_xics_latch(void) { return 0; @@ -492,6 +506,10 @@ extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, __be32 xirr, struct kvmppc_irq_map *irq_map, struct kvmppc_passthru_irqmap *pimap, bool *again); + +extern int kvmppc_xics_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, + int level, bool line_status); + extern int h_ipi_redirect; #else static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap( @@ -509,6 +527,60 @@ static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) { return 0; } #endif +#ifdef CONFIG_KVM_XIVE +/* + * Below the first "xive" is the "eXternal Interrupt Virtualization Engine" + * i.e. the new P9 interrupt controller, while the second "xive" is the legacy + * "eXternal Interrupt Vector Entry" which is the configuration of an + * interrupt on the "xics" interrupt controller on P8 and earlier. Those + * two functions consume or produce a legacy "XIVE" state from the + * new "XIVE" interrupt controller.
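+ * (e.g. kvmppc_xive_set_xive() takes the legacy server/priority pair the guest programs through XICS interfaces and translates it into targetting data for the new XIVE engine, while kvmppc_xive_get_xive() reads it back.)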
+ */ +extern int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, + u32 priority); +extern int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server, + u32 *priority); +extern int kvmppc_xive_int_on(struct kvm *kvm, u32 irq); +extern int kvmppc_xive_int_off(struct kvm *kvm, u32 irq); +extern void kvmppc_xive_init_module(void); +extern void kvmppc_xive_exit_module(void); + +extern int kvmppc_xive_connect_vcpu(struct kvm_device *dev, + struct kvm_vcpu *vcpu, u32 cpu); +extern void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu); +extern int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, + struct irq_desc *host_desc); +extern int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, + struct irq_desc *host_desc); +extern u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu); +extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval); + +extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, + int level, bool line_status); +#else +static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, + u32 priority) { return -1; } +static inline int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server, + u32 *priority) { return -1; } +static inline int kvmppc_xive_int_on(struct kvm *kvm, u32 irq) { return -1; } +static inline int kvmppc_xive_int_off(struct kvm *kvm, u32 irq) { return -1; } +static inline void kvmppc_xive_init_module(void) { } +static inline void kvmppc_xive_exit_module(void) { } + +static inline int kvmppc_xive_connect_vcpu(struct kvm_device *dev, + struct kvm_vcpu *vcpu, u32 cpu) { return -EBUSY; } +static inline void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) { } +static inline int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, + struct irq_desc *host_desc) { return -ENODEV; } +static inline int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, + struct irq_desc *host_desc) { return -ENODEV; } +static inline u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu) { return 0; } +static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { return -ENOENT; } + +static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, + int level, bool line_status) { return -ENODEV; } +#endif /* CONFIG_KVM_XIVE */ + /* * Prototypes for functions called only from assembler code. * Having prototypes reduces sparse errors. 
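For reference, the stub block above is what lets common book3s code call the XIVE variants unconditionally and choose the backend at runtime, as the book3s.c and book3s_rtas.c hunks later in this patch do. A minimal sketch of the dispatch pattern follows; set_guest_irq_cfg is a hypothetical caller, not part of the patch:

static int set_guest_irq_cfg(struct kvm *kvm, u32 irq, u32 server, u32 prio)
{
	/*
	 * With CONFIG_KVM_XIVE=n the kvmppc_xive_*() calls compile to the
	 * stubs above, which simply fail, so the XICS branch is the only
	 * functional one; otherwise xive_enabled() picks the backend at
	 * runtime.
	 */
	if (xive_enabled())
		return kvmppc_xive_set_xive(kvm, irq, server, prio);
	return kvmppc_xics_set_xive(kvm, irq, server, prio);
}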
@@ -546,6 +618,8 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, unsigned long slb_v, unsigned int status, bool data); unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu); +unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu); +unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server); int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, unsigned long mfrr); int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr); diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index 3cdbeaeac397..c8a822acf962 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -99,7 +99,6 @@ struct xive_q { #define XIVE_ESB_SET_PQ_01 0xd00 #define XIVE_ESB_SET_PQ_10 0xe00 #define XIVE_ESB_SET_PQ_11 0xf00 -#define XIVE_ESB_MASK XIVE_ESB_SET_PQ_01 #define XIVE_ESB_VAL_P 0x2 #define XIVE_ESB_VAL_Q 0x1 @@ -136,11 +135,11 @@ extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, __be32 *qpage, u32 order, bool can_escalate); extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio); -extern bool __xive_irq_trigger(struct xive_irq_data *xd); -extern bool __xive_irq_retrigger(struct xive_irq_data *xd); -extern void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd); - +extern void xive_native_sync_source(u32 hw_irq); extern bool is_xive_irq(struct irq_chip *chip); +extern int xive_native_enable_vp(u32 vp_id); +extern int xive_native_disable_vp(u32 vp_id); +extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id); #else diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 4367e7df51a1..1822187813dc 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -630,6 +630,8 @@ int main(void) HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore); HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys); + HSTATE_FIELD(HSTATE_XIVE_TIMA_PHYS, xive_tima_phys); + HSTATE_FIELD(HSTATE_XIVE_TIMA_VIRT, xive_tima_virt); HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr); HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi); HSTATE_FIELD(HSTATE_PTID, ptid); @@ -715,6 +717,14 @@ int main(void) OFFSET(VCPU_HOST_MAS6, kvm_vcpu, arch.host_mas6); #endif +#ifdef CONFIG_KVM_XICS + DEFINE(VCPU_XIVE_SAVED_STATE, offsetof(struct kvm_vcpu, + arch.xive_saved_state)); + DEFINE(VCPU_XIVE_CAM_WORD, offsetof(struct kvm_vcpu, + arch.xive_cam_word)); + DEFINE(VCPU_XIVE_PUSHED, offsetof(struct kvm_vcpu, arch.xive_pushed)); +#endif + #ifdef CONFIG_KVM_EXIT_TIMING OFFSET(VCPU_TIMING_EXIT_TBU, kvm_vcpu, arch.timing_exit.tv32.tbu); OFFSET(VCPU_TIMING_EXIT_TBL, kvm_vcpu, arch.timing_exit.tv32.tbl); diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 029be26b5a17..b9d66e53b773 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -196,6 +196,11 @@ config KVM_XICS Specification) interrupt controller architecture used on IBM POWER (pSeries) servers. 
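For reference, the XIVE_ESB_SET_PQ_* offsets in the xive.h hunk above name the ESB "special" MMIO loads: a load at such an offset atomically writes that value into the source's two PQ bits and returns the previous ones, encoded with XIVE_ESB_VAL_P (0x2) and XIVE_ESB_VAL_Q (0x1). A hedged sketch of the decode, mirroring what xive_lock_and_mask() does later in this patch (decode_old_pq is illustrative, not part of the patch):

static void decode_old_pq(u64 val, bool *old_p, bool *old_q)
{
	/* P: an event was already forwarded to a queue */
	*old_p = !!(val & XIVE_ESB_VAL_P);
	/* Q: a further event was latched while P was set */
	*old_q = !!(val & XIVE_ESB_VAL_Q);
}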
+config KVM_XIVE + bool + default y + depends on KVM_XICS && PPC_XIVE_NATIVE && KVM_BOOK3S_HV_POSSIBLE + source drivers/vhost/Kconfig endif # VIRTUALIZATION diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index b87ccde2137a..d91a2604c496 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -74,7 +74,7 @@ kvm-hv-y += \ book3s_64_mmu_radix.o kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \ - book3s_hv_rm_xics.o + book3s_hv_rm_xics.o book3s_hv_rm_xive.o ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ @@ -89,6 +89,8 @@ endif kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ book3s_xics.o +kvm-book3s_64-objs-$(CONFIG_KVM_XIVE) += book3s_xive.o + kvm-book3s_64-module-objs := \ $(common-objs-y) \ book3s.o \ diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index aedacefd961d..cb8009cd688d 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "book3s.h" #include "trace.h" @@ -578,11 +579,14 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, break; #ifdef CONFIG_KVM_XICS case KVM_REG_PPC_ICP_STATE: - if (!vcpu->arch.icp) { + if (!vcpu->arch.icp && !vcpu->arch.xive_vcpu) { r = -ENXIO; break; } - *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu)); + if (xive_enabled()) + *val = get_reg_val(id, kvmppc_xive_get_icp(vcpu)); + else + *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu)); break; #endif /* CONFIG_KVM_XICS */ case KVM_REG_PPC_FSCR: @@ -648,12 +652,14 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, #endif /* CONFIG_VSX */ #ifdef CONFIG_KVM_XICS case KVM_REG_PPC_ICP_STATE: - if (!vcpu->arch.icp) { + if (!vcpu->arch.icp && !vcpu->arch.xive_vcpu) { r = -ENXIO; break; } - r = kvmppc_xics_set_icp(vcpu, - set_reg_val(id, *val)); + if (xive_enabled()) + r = kvmppc_xive_set_icp(vcpu, set_reg_val(id, *val)); + else + r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val)); break; #endif /* CONFIG_KVM_XICS */ case KVM_REG_PPC_FSCR: @@ -924,6 +930,50 @@ int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hcall) return kvm->arch.kvm_ops->hcall_implemented(hcall); } +#ifdef CONFIG_KVM_XICS +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, + bool line_status) +{ + if (xive_enabled()) + return kvmppc_xive_set_irq(kvm, irq_source_id, irq, level, + line_status); + else + return kvmppc_xics_set_irq(kvm, irq_source_id, irq, level, + line_status); +} + +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *irq_entry, + struct kvm *kvm, int irq_source_id, + int level, bool line_status) +{ + return kvm_set_irq(kvm, irq_source_id, irq_entry->gsi, + level, line_status); +} +static int kvmppc_book3s_set_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + return kvm_set_irq(kvm, irq_source_id, e->gsi, level, line_status); +} + +int kvm_irq_map_gsi(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *entries, int gsi) +{ + entries->gsi = gsi; + entries->type = KVM_IRQ_ROUTING_IRQCHIP; + entries->set = kvmppc_book3s_set_irq; + entries->irqchip.irqchip = 0; + entries->irqchip.pin = gsi; + return 1; +} + +int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin) +{ + return pin; +} + +#endif /* CONFIG_KVM_XICS */ + static int kvmppc_book3s_init(void) { int r; @@ -934,12 +984,25 @@ static int kvmppc_book3s_init(void) #ifdef CONFIG_KVM_BOOK3S_32_HANDLER r = kvmppc_book3s_init_pr(); #endif - 
return r; +#ifdef CONFIG_KVM_XICS +#ifdef CONFIG_KVM_XIVE + if (xive_enabled()) { + kvmppc_xive_init_module(); + kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS); + } else +#endif + kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS); +#endif + return r; } static void kvmppc_book3s_exit(void) { +#ifdef CONFIG_KVM_XICS + if (xive_enabled()) + kvmppc_xive_exit_module(); +#endif #ifdef CONFIG_KVM_BOOK3S_32_HANDLER kvmppc_book3s_exit_pr(); #endif diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index fadb75abfe37..128efb42ec4e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -67,6 +67,7 @@ #include #include #include +#include #include "book3s.h" @@ -837,6 +838,10 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) case H_IPOLL: case H_XIRR_X: if (kvmppc_xics_enabled(vcpu)) { + if (xive_enabled()) { + ret = H_NOT_AVAILABLE; + return RESUME_GUEST; + } ret = kvmppc_xics_hcall(vcpu, req); break; } @@ -2947,8 +2952,12 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) r = kvmppc_book3s_hv_page_fault(run, vcpu, vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); - } else if (r == RESUME_PASSTHROUGH) - r = kvmppc_xics_rm_complete(vcpu, 0); + } else if (r == RESUME_PASSTHROUGH) { + if (WARN_ON(xive_enabled())) + r = H_SUCCESS; + else + r = kvmppc_xics_rm_complete(vcpu, 0); + } } while (is_kvmppc_resume_guest(r)); out: @@ -3400,10 +3409,20 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) /* * On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed) * Set HVICE bit to enable hypervisor virtualization interrupts. + * Set HEIC to prevent OS interrupts from going to the hypervisor (should + * be unnecessary but better safe than sorry in case we re-enable + * EE in HV mode with this LPCR still set) */ if (cpu_has_feature(CPU_FTR_ARCH_300)) { lpcr &= ~LPCR_VPM0; - lpcr |= LPCR_HVICE; + lpcr |= LPCR_HVICE | LPCR_HEIC; + + /* + * If xive is enabled, we route 0x500 interrupts directly + * to the guest. + */ + if (xive_enabled()) + lpcr |= LPCR_LPES; } /* @@ -3533,7 +3552,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) struct kvmppc_irq_map *irq_map; struct kvmppc_passthru_irqmap *pimap; struct irq_chip *chip; - int i; + int i, rc = 0; if (!kvm_irq_bypass) return 1; @@ -3558,10 +3577,10 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) /* * For now, we only support interrupts for which the EOI operation * is an OPAL call followed by a write to XIRR, since that's - * what our real-mode EOI code does.
+ * what our real-mode EOI code does, or a XIVE interrupt */ chip = irq_data_get_irq_chip(&desc->irq_data); - if (!chip || !is_pnv_opal_msi(chip)) { + if (!chip || !(is_pnv_opal_msi(chip) || is_xive_irq(chip))) { pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n", host_irq, guest_gsi); mutex_unlock(&kvm->lock); @@ -3603,7 +3622,12 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) if (i == pimap->n_mapped) pimap->n_mapped++; - kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); + if (xive_enabled()) + rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc); + else + kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); + if (rc) + irq_map->r_hwirq = 0; mutex_unlock(&kvm->lock); @@ -3614,7 +3638,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) { struct irq_desc *desc; struct kvmppc_passthru_irqmap *pimap; - int i; + int i, rc = 0; if (!kvm_irq_bypass) return 0; @@ -3641,9 +3665,12 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) return -ENODEV; } - kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq); + if (xive_enabled()) + rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc); + else + kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq); - /* invalidate the entry */ + /* invalidate the entry (what to do on error from the above ?) */ pimap->mapped[i].r_hwirq = 0; /* @@ -3652,7 +3679,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) */ mutex_unlock(&kvm->lock); - return 0; + return rc; } static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons, @@ -3930,7 +3957,7 @@ static int kvmppc_book3s_init_hv(void) * indirectly, via OPAL. */ #ifdef CONFIG_SMP - if (!get_paca()->kvm_hstate.xics_phys) { + if (!xive_enabled() && !get_paca()->kvm_hstate.xics_phys) { struct device_node *np; np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc"); diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index a752e29977e0..846b40cb3a62 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -32,6 +32,24 @@ #define KVM_CMA_CHUNK_ORDER 18 +#include "book3s_xics.h" +#include "book3s_xive.h" + +/* + * The XIVE module will populate these when it loads + */ +unsigned long (*__xive_vm_h_xirr)(struct kvm_vcpu *vcpu); +unsigned long (*__xive_vm_h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server); +int (*__xive_vm_h_ipi)(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr); +int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr); +int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr); +EXPORT_SYMBOL_GPL(__xive_vm_h_xirr); +EXPORT_SYMBOL_GPL(__xive_vm_h_ipoll); +EXPORT_SYMBOL_GPL(__xive_vm_h_ipi); +EXPORT_SYMBOL_GPL(__xive_vm_h_cppr); +EXPORT_SYMBOL_GPL(__xive_vm_h_eoi); + /* * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206) * should be power of 2. @@ -210,6 +228,7 @@ void kvmhv_rm_send_ipi(int cpu) __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); return; } + /* On POWER8 for IPIs to threads in the same core, use msgsnd.
*/ if (cpu_has_feature(CPU_FTR_ARCH_207S) && cpu_first_thread_sibling(cpu) == @@ -406,6 +425,9 @@ static long kvmppc_read_one_intr(bool *again) u8 host_ipi; int64_t rc; + if (xive_enabled()) + return 1; + /* see if a host IPI is pending */ host_ipi = local_paca->kvm_hstate.host_ipi; if (host_ipi) @@ -490,3 +512,84 @@ static long kvmppc_read_one_intr(bool *again) return kvmppc_check_passthru(xisr, xirr, again); } + +#ifdef CONFIG_KVM_XICS +static inline bool is_rm(void) +{ + return !(mfmsr() & MSR_DR); +} + +unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) +{ + if (xive_enabled()) { + if (is_rm()) + return xive_rm_h_xirr(vcpu); + if (unlikely(!__xive_vm_h_xirr)) + return H_NOT_AVAILABLE; + return __xive_vm_h_xirr(vcpu); + } else + return xics_rm_h_xirr(vcpu); +} + +unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu) +{ + vcpu->arch.gpr[5] = get_tb(); + if (xive_enabled()) { + if (is_rm()) + return xive_rm_h_xirr(vcpu); + if (unlikely(!__xive_vm_h_xirr)) + return H_NOT_AVAILABLE; + return __xive_vm_h_xirr(vcpu); + } else + return xics_rm_h_xirr(vcpu); +} + +unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) +{ + if (xive_enabled()) { + if (is_rm()) + return xive_rm_h_ipoll(vcpu, server); + if (unlikely(!__xive_vm_h_ipoll)) + return H_NOT_AVAILABLE; + return __xive_vm_h_ipoll(vcpu, server); + } else + return H_TOO_HARD; +} + +int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr) +{ + if (xive_enabled()) { + if (is_rm()) + return xive_rm_h_ipi(vcpu, server, mfrr); + if (unlikely(!__xive_vm_h_ipi)) + return H_NOT_AVAILABLE; + return __xive_vm_h_ipi(vcpu, server, mfrr); + } else + return xics_rm_h_ipi(vcpu, server, mfrr); +} + +int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) +{ + if (xive_enabled()) { + if (is_rm()) + return xive_rm_h_cppr(vcpu, cppr); + if (unlikely(!__xive_vm_h_cppr)) + return H_NOT_AVAILABLE; + return __xive_vm_h_cppr(vcpu, cppr); + } else + return xics_rm_h_cppr(vcpu, cppr); +} + +int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + if (xive_enabled()) { + if (is_rm()) + return xive_rm_h_eoi(vcpu, xirr); + if (unlikely(!__xive_vm_h_eoi)) + return H_NOT_AVAILABLE; + return __xive_vm_h_eoi(vcpu, xirr); + } else + return xics_rm_h_eoi(vcpu, xirr); +} +#endif /* CONFIG_KVM_XICS */ diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 3a1a463a039a..f8068801ac36 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -485,7 +485,7 @@ static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, } -unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) +unsigned long xics_rm_h_xirr(struct kvm_vcpu *vcpu) { union kvmppc_icp_state old_state, new_state; struct kvmppc_xics *xics = vcpu->kvm->arch.xics; @@ -523,8 +523,8 @@ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) return check_too_hard(xics, icp); } -int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, - unsigned long mfrr) +int xics_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr) { union kvmppc_icp_state old_state, new_state; struct kvmppc_xics *xics = vcpu->kvm->arch.xics; @@ -610,7 +610,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, return check_too_hard(xics, this_icp); } -int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) +int xics_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) { union kvmppc_icp_state old_state, new_state; struct kvmppc_xics 
*xics = vcpu->kvm->arch.xics; @@ -730,7 +730,7 @@ static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq) return check_too_hard(xics, icp); } -int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) { struct kvmppc_xics *xics = vcpu->kvm->arch.xics; struct kvmppc_icp *icp = vcpu->arch.icp; diff --git a/arch/powerpc/kvm/book3s_hv_rm_xive.c b/arch/powerpc/kvm/book3s_hv_rm_xive.c new file mode 100644 index 000000000000..abf5f01b6eb1 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_rm_xive.c @@ -0,0 +1,47 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "book3s_xive.h" + +/* XXX */ +#include +//#define DBG(fmt...) udbg_printf(fmt) +#define DBG(fmt...) do { } while(0) + +static inline void __iomem *get_tima_phys(void) +{ + return local_paca->kvm_hstate.xive_tima_phys; +} + +#undef XIVE_RUNTIME_CHECKS +#define X_PFX xive_rm_ +#define X_STATIC +#define X_STAT_PFX stat_rm_ +#define __x_tima get_tima_phys() +#define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_page)) +#define __x_trig_page(xd) ((void __iomem *)((xd)->trig_page)) +#define __x_readb __raw_rm_readb +#define __x_writeb __raw_rm_writeb +#define __x_readw __raw_rm_readw +#define __x_readq __raw_rm_readq +#define __x_writeq __raw_rm_writeq + +#include "book3s_xive_template.c" diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 7c6477d1840a..bdb3f76ceb6b 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -30,6 +30,7 @@ #include #include #include +#include #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM) @@ -970,6 +971,23 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) cmpwi r3, 512 /* 1 microsecond */ blt hdec_soon +#ifdef CONFIG_KVM_XICS + /* We are entering the guest on that thread, push VCPU to XIVE */ + ld r10, HSTATE_XIVE_TIMA_PHYS(r13) + cmpldi cr0, r10, 0 + beq no_xive + ld r11, VCPU_XIVE_SAVED_STATE(r4) + li r9, TM_QW1_OS + stdcix r11,r9,r10 + eieio + lwz r11, VCPU_XIVE_CAM_WORD(r4) + li r9, TM_QW1_OS + TM_WORD2 + stwcix r11,r9,r10 + li r9, 1 + stw r9, VCPU_XIVE_PUSHED(r4) +no_xive: +#endif /* CONFIG_KVM_XICS */ + deliver_guest_interrupt: ld r6, VCPU_CTR(r4) ld r7, VCPU_XER(r4) @@ -1307,6 +1325,42 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) blt deliver_guest_interrupt guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ +#ifdef CONFIG_KVM_XICS + /* We are exiting, pull the VP from the XIVE */ + lwz r0, VCPU_XIVE_PUSHED(r9) + cmpwi cr0, r0, 0 + beq 1f + li r7, TM_SPC_PULL_OS_CTX + li r6, TM_QW1_OS + mfmsr r0 + andi. r0, r0, MSR_IR /* in real mode?
*/ + beq 2f + ld r10, HSTATE_XIVE_TIMA_VIRT(r13) + cmpldi cr0, r10, 0 + beq 1f + /* First load to pull the context, we ignore the value */ + lwzx r11, r7, r10 + eieio + /* Second load to recover the context state (Words 0 and 1) */ + ldx r11, r6, r10 + b 3f +2: ld r10, HSTATE_XIVE_TIMA_PHYS(r13) + cmpldi cr0, r10, 0 + beq 1f + /* First load to pull the context, we ignore the value */ + lwzcix r11, r7, r10 + eieio + /* Second load to recover the context state (Words 0 and 1) */ + ldcix r11, r6, r10 +3: std r11, VCPU_XIVE_SAVED_STATE(r9) + /* Fixup some of the state for the next load */ + li r10, 0 + li r0, 0xff + stw r10, VCPU_XIVE_PUSHED(r9) + stb r10, (VCPU_XIVE_SAVED_STATE+3)(r9) + stb r0, (VCPU_XIVE_SAVED_STATE+4)(r9) +1: +#endif /* CONFIG_KVM_XICS */ /* Save more register state */ mfdar r6 mfdsisr r7 @@ -2011,7 +2065,7 @@ hcall_real_table: .long DOTSYM(kvmppc_rm_h_eoi) - hcall_real_table .long DOTSYM(kvmppc_rm_h_cppr) - hcall_real_table .long DOTSYM(kvmppc_rm_h_ipi) - hcall_real_table - .long 0 /* 0x70 - H_IPOLL */ + .long DOTSYM(kvmppc_rm_h_ipoll) - hcall_real_table .long DOTSYM(kvmppc_rm_h_xirr) - hcall_real_table #else .long 0 /* 0x64 - H_EOI */ @@ -2181,7 +2235,11 @@ hcall_real_table: .long 0 /* 0x2f0 */ .long 0 /* 0x2f4 */ .long 0 /* 0x2f8 */ - .long 0 /* 0x2fc */ +#ifdef CONFIG_KVM_XICS + .long DOTSYM(kvmppc_rm_h_xirr_x) - hcall_real_table +#else + .long 0 /* 0x2fc - H_XIRR_X*/ +#endif .long DOTSYM(kvmppc_h_random) - hcall_real_table .globl hcall_real_table_end hcall_real_table_end: diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c index 20528701835b..2d3b2b1cc272 100644 --- a/arch/powerpc/kvm/book3s_rtas.c +++ b/arch/powerpc/kvm/book3s_rtas.c @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef CONFIG_KVM_XICS static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) @@ -32,7 +33,10 @@ static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) server = be32_to_cpu(args->args[1]); priority = be32_to_cpu(args->args[2]); - rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority); + if (xive_enabled()) + rc = kvmppc_xive_set_xive(vcpu->kvm, irq, server, priority); + else + rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority); if (rc) rc = -3; out: @@ -52,7 +56,10 @@ static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) irq = be32_to_cpu(args->args[0]); server = priority = 0; - rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority); + if (xive_enabled()) + rc = kvmppc_xive_get_xive(vcpu->kvm, irq, &server, &priority); + else + rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority); if (rc) { rc = -3; goto out; @@ -76,7 +83,10 @@ static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args) irq = be32_to_cpu(args->args[0]); - rc = kvmppc_xics_int_off(vcpu->kvm, irq); + if (xive_enabled()) + rc = kvmppc_xive_int_off(vcpu->kvm, irq); + else + rc = kvmppc_xics_int_off(vcpu->kvm, irq); if (rc) rc = -3; out: @@ -95,7 +105,10 @@ static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args) irq = be32_to_cpu(args->args[0]); - rc = kvmppc_xics_int_on(vcpu->kvm, irq); + if (xive_enabled()) + rc = kvmppc_xive_int_on(vcpu->kvm, irq); + else + rc = kvmppc_xics_int_on(vcpu->kvm, irq); if (rc) rc = -3; out: diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index ef4fd528c193..e6829c415bc8 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -1307,8 +1307,8 @@ static int xics_set_source(struct 
kvmppc_xics *xics, long irq, u64 addr) return 0; } -int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, - bool line_status) +int kvmppc_xics_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, + bool line_status) { struct kvmppc_xics *xics = kvm->arch.xics; @@ -1317,14 +1317,6 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, return ics_deliver_irq(xics, irq, level); } -int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *irq_entry, - struct kvm *kvm, int irq_source_id, - int level, bool line_status) -{ - return kvm_set_irq(kvm, irq_source_id, irq_entry->gsi, - level, line_status); -} - static int xics_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { struct kvmppc_xics *xics = dev->private; @@ -1458,29 +1450,6 @@ void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; } -static int xics_set_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm, int irq_source_id, int level, - bool line_status) -{ - return kvm_set_irq(kvm, irq_source_id, e->gsi, level, line_status); -} - -int kvm_irq_map_gsi(struct kvm *kvm, - struct kvm_kernel_irq_routing_entry *entries, int gsi) -{ - entries->gsi = gsi; - entries->type = KVM_IRQ_ROUTING_IRQCHIP; - entries->set = xics_set_irq; - entries->irqchip.irqchip = 0; - entries->irqchip.pin = gsi; - return 1; -} - -int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin) -{ - return pin; -} - void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long irq, unsigned long host_irq) { diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index ec5474cf70c6..453c9e518c19 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -10,6 +10,7 @@ #ifndef _KVM_PPC_BOOK3S_XICS_H #define _KVM_PPC_BOOK3S_XICS_H +#ifdef CONFIG_KVM_XICS /* * We use a two-level tree to store interrupt source information. * There are up to 1024 ICS nodes, each of which can represent @@ -144,5 +145,11 @@ static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics, return ics; } +extern unsigned long xics_rm_h_xirr(struct kvm_vcpu *vcpu); +extern int xics_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr); +extern int xics_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr); +extern int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr); +#endif /* CONFIG_KVM_XICS */ #endif /* _KVM_PPC_BOOK3S_XICS_H */ diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c new file mode 100644 index 000000000000..7807ee17af4b --- /dev/null +++ b/arch/powerpc/kvm/book3s_xive.c @@ -0,0 +1,1893 @@ +/* + * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) "xive-kvm: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "book3s_xive.h" + + +/* + * Virtual mode variants of the hcalls for use on radix/radix + * with AIL. They require the VCPU's VP to be "pushed" + * + * We still instantiate them here because we use some of the + * generated utility functions as well in this file.
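+ * (book3s_xive_template.c below is compiled twice: once here with the virtual mode MMIO accessors (__raw_readb and friends), and once in book3s_hv_rm_xive.c with the real mode cache-inhibited accessors; the X_PFX macro gives each instance distinct xive_vm_/xive_rm_ symbol names.)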
+ */ +#define XIVE_RUNTIME_CHECKS +#define X_PFX xive_vm_ +#define X_STATIC static +#define X_STAT_PFX stat_vm_ +#define __x_tima xive_tima +#define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_mmio)) +#define __x_trig_page(xd) ((void __iomem *)((xd)->trig_mmio)) +#define __x_readb __raw_readb +#define __x_writeb __raw_writeb +#define __x_readw __raw_readw +#define __x_readq __raw_readq +#define __x_writeq __raw_writeq + +#include "book3s_xive_template.c" + +/* + * We leave a gap of a couple of interrupts in the queue to + * account for the IPI and additional safety guard. + */ +#define XIVE_Q_GAP 2 + +/* + * This is a simple trigger for a generic XIVE IRQ. This must + * only be called for interrupts that support a trigger page + */ +static bool xive_irq_trigger(struct xive_irq_data *xd) +{ + /* This should be only for MSIs */ + if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI)) + return false; + + /* Those interrupts should always have a trigger page */ + if (WARN_ON(!xd->trig_mmio)) + return false; + + out_be64(xd->trig_mmio, 0); + + return true; +} + +static irqreturn_t xive_esc_irq(int irq, void *data) +{ + struct kvm_vcpu *vcpu = data; + + /* We use the existing H_PROD mechanism to wake up the target */ + vcpu->arch.prodded = 1; + smp_mb(); + if (vcpu->arch.ceded) + kvmppc_fast_vcpu_kick(vcpu); + + return IRQ_HANDLED; +} + +static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct xive_q *q = &xc->queues[prio]; + char *name = NULL; + int rc; + + /* Already there ? */ + if (xc->esc_virq[prio]) + return 0; + + /* Hook up the escalation interrupt */ + xc->esc_virq[prio] = irq_create_mapping(NULL, q->esc_irq); + if (!xc->esc_virq[prio]) { + pr_err("Failed to map escalation interrupt for queue %d of VCPU %d\n", + prio, xc->server_num); + return -EIO; + } + + /* + * Future improvement: start with them disabled + * and handle DD2 and later scheme of merged escalation + * interrupts + */ + name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d", + vcpu->kvm->arch.lpid, xc->server_num, prio); + if (!name) { + pr_err("Failed to allocate escalation irq name for queue %d of VCPU %d\n", + prio, xc->server_num); + rc = -ENOMEM; + goto error; + } + rc = request_irq(xc->esc_virq[prio], xive_esc_irq, + IRQF_NO_THREAD, name, vcpu); + if (rc) { + pr_err("Failed to request escalation interrupt for queue %d of VCPU %d\n", + prio, xc->server_num); + goto error; + } + xc->esc_virq_names[prio] = name; + return 0; +error: + irq_dispose_mapping(xc->esc_virq[prio]); + xc->esc_virq[prio] = 0; + kfree(name); + return rc; +} + +static int xive_provision_queue(struct kvm_vcpu *vcpu, u8 prio) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct kvmppc_xive *xive = xc->xive; + struct xive_q *q = &xc->queues[prio]; + void *qpage; + int rc; + + if (WARN_ON(q->qpage)) + return 0; + + /* Allocate the queue and retrieve info on current node for now */ + qpage = (__be32 *)__get_free_pages(GFP_KERNEL, xive->q_page_order); + if (!qpage) { + pr_err("Failed to allocate queue %d for VCPU %d\n", + prio, xc->server_num); + return -ENOMEM; + } + memset(qpage, 0, 1 << xive->q_order); + + /* + * Reconfigure the queue. This will set q->qpage only once the + * queue is fully configured.
This is a requirement for prio 0 + * as we will stop doing EOIs for every IPI as soon as we observe + * qpage being non-NULL, and instead will only EOI when we receive + * corresponding queue 0 entries + */ + rc = xive_native_configure_queue(xc->vp_id, q, prio, qpage, + xive->q_order, true); + if (rc) + pr_err("Failed to configure queue %d for VCPU %d\n", + prio, xc->server_num); + return rc; +} + +/* Called with kvm_lock held */ +static int xive_check_provisioning(struct kvm *kvm, u8 prio) +{ + struct kvmppc_xive *xive = kvm->arch.xive; + struct kvm_vcpu *vcpu; + int i, rc; + + lockdep_assert_held(&kvm->lock); + + /* Already provisioned ? */ + if (xive->qmap & (1 << prio)) + return 0; + + pr_devel("Provisioning prio... %d\n", prio); + + /* Provision each VCPU and enable escalations */ + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!vcpu->arch.xive_vcpu) + continue; + rc = xive_provision_queue(vcpu, prio); + if (rc == 0) + xive_attach_escalation(vcpu, prio); + if (rc) + return rc; + } + + /* Order previous stores and mark it as provisioned */ + mb(); + xive->qmap |= (1 << prio); + return 0; +} + +static void xive_inc_q_pending(struct kvm *kvm, u32 server, u8 prio) +{ + struct kvm_vcpu *vcpu; + struct kvmppc_xive_vcpu *xc; + struct xive_q *q; + + /* Locate target server */ + vcpu = kvmppc_xive_find_server(kvm, server); + if (!vcpu) { + pr_warn("%s: Can't find server %d\n", __func__, server); + return; + } + xc = vcpu->arch.xive_vcpu; + if (WARN_ON(!xc)) + return; + + q = &xc->queues[prio]; + atomic_inc(&q->pending_count); +} + +static int xive_try_pick_queue(struct kvm_vcpu *vcpu, u8 prio) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct xive_q *q; + u32 max; + + if (WARN_ON(!xc)) + return -ENXIO; + if (!xc->valid) + return -ENXIO; + + q = &xc->queues[prio]; + if (WARN_ON(!q->qpage)) + return -ENXIO; + + /* Calculate max number of interrupts in that queue. */ + max = (q->msk + 1) - XIVE_Q_GAP; + return atomic_add_unless(&q->count, 1, max) ? 0 : -EBUSY; +} + +static int xive_select_target(struct kvm *kvm, u32 *server, u8 prio) +{ + struct kvm_vcpu *vcpu; + int i, rc; + + /* Locate target server */ + vcpu = kvmppc_xive_find_server(kvm, *server); + if (!vcpu) { + pr_devel("Can't find server %d\n", *server); + return -EINVAL; + } + + pr_devel("Finding irq target on 0x%x/%d...\n", *server, prio); + + /* Try pick it */ + rc = xive_try_pick_queue(vcpu, prio); + if (rc == 0) + return rc; + + pr_devel(" .. failed, looking up candidate...\n"); + + /* Failed, pick another VCPU */ + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!vcpu->arch.xive_vcpu) + continue; + rc = xive_try_pick_queue(vcpu, prio); + if (rc == 0) { + *server = vcpu->arch.xive_vcpu->server_num; + pr_devel(" found on 0x%x/%d\n", *server, prio); + return rc; + } + } + pr_devel(" no available target !\n"); + + /* No available target ! */ + return -EBUSY; +} + +static u8 xive_lock_and_mask(struct kvmppc_xive *xive, + struct kvmppc_xive_src_block *sb, + struct kvmppc_xive_irq_state *state) +{ + struct xive_irq_data *xd; + u32 hw_num; + u8 old_prio; + u64 val; + + /* + * Take the lock, set masked, try again if racing + * with H_EOI + */ + for (;;) { + arch_spin_lock(&sb->lock); + old_prio = state->guest_priority; + state->guest_priority = MASKED; + mb(); + if (!state->in_eoi) + break; + state->guest_priority = old_prio; + arch_spin_unlock(&sb->lock); + } + + /* No change ? 
Bail */ + if (old_prio == MASKED) + return old_prio; + + /* Get the right irq */ + kvmppc_xive_select_irq(state, &hw_num, &xd); + + /* + * If the interrupt is marked as needing masking via + * firmware, we do it here. Firmware masking however + * is "lossy", it won't return the old p and q bits + * and won't set the interrupt to a state where it will + * record queued ones. If this is an issue we should do + * lazy masking instead. + * + * For now, we work around this in unmask by forcing + * an interrupt whenever we unmask a non-LSI via FW + * (if ever). + */ + if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) { + xive_native_configure_irq(hw_num, + xive->vp_base + state->act_server, + MASKED, state->number); + /* set old_p so we can track if an H_EOI was done */ + state->old_p = true; + state->old_q = false; + } else { + /* Set PQ to 10, return old P and old Q and remember them */ + val = xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_10); + state->old_p = !!(val & 2); + state->old_q = !!(val & 1); + + /* + * Synchronize hardware to ensure the queues are updated + * when masking + */ + xive_native_sync_source(hw_num); + } + + return old_prio; +} + +static void xive_lock_for_unmask(struct kvmppc_xive_src_block *sb, + struct kvmppc_xive_irq_state *state) +{ + /* + * Take the lock, try again if racing with H_EOI + */ + for (;;) { + arch_spin_lock(&sb->lock); + if (!state->in_eoi) + break; + arch_spin_unlock(&sb->lock); + } +} + +static void xive_finish_unmask(struct kvmppc_xive *xive, + struct kvmppc_xive_src_block *sb, + struct kvmppc_xive_irq_state *state, + u8 prio) +{ + struct xive_irq_data *xd; + u32 hw_num; + + /* If we aren't changing a thing, move on */ + if (state->guest_priority != MASKED) + goto bail; + + /* Get the right irq */ + kvmppc_xive_select_irq(state, &hw_num, &xd); + + /* + * See comment in xive_lock_and_mask() concerning masking + * via firmware. + */ + if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) { + xive_native_configure_irq(hw_num, + xive->vp_base + state->act_server, + state->act_priority, state->number); + /* If an EOI is needed, do it here */ + if (!state->old_p) + xive_vm_source_eoi(hw_num, xd); + /* If this is not an LSI, force a trigger */ + if (!(xd->flags & OPAL_XIVE_IRQ_LSI)) + xive_irq_trigger(xd); + goto bail; + } + + /* Old Q set, set PQ to 11 */ + if (state->old_q) + xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_11); + + /* + * If not old P, then perform an "effective" EOI, + * on the source. This will handle the cases where + * FW EOI is needed. + */ + if (!state->old_p) + xive_vm_source_eoi(hw_num, xd); + + /* Synchronize ordering and mark unmasked */ + mb(); +bail: + state->guest_priority = prio; +} + +/* + * Target an interrupt to a given server/prio, this will fall back + * to another server if necessary and perform the HW targetting + * updates as needed + * + * NOTE: Must be called with the state lock held + */ +static int xive_target_interrupt(struct kvm *kvm, + struct kvmppc_xive_irq_state *state, + u32 server, u8 prio) +{ + struct kvmppc_xive *xive = kvm->arch.xive; + u32 hw_num; + int rc; + + /* + * This will return a tentative server and actual + * priority. The count for that new target will have + * already been incremented. + */ + rc = xive_select_target(kvm, &server, prio); + + /* + * We failed to find a target ? Not much we can do + * at least until we support the GIQ. + */ + if (rc) + return rc; + + /* + * Increment the old queue pending count if there + * was one so that the old queue count gets adjusted later + * when observed to be empty.
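+ * (For example, retargetting an interrupt from server A to server B at the same priority increments q->count on B's queue via xive_try_pick_queue() and q->pending_count on A's queue via xive_inc_q_pending(); A's count is only reconciled once its queue is observed empty.)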
+ */ + if (state->act_priority != MASKED) + xive_inc_q_pending(kvm, + state->act_server, + state->act_priority); + /* + * Update state and HW + */ + state->act_priority = prio; + state->act_server = server; + + /* Get the right irq */ + kvmppc_xive_select_irq(state, &hw_num, NULL); + + return xive_native_configure_irq(hw_num, + xive->vp_base + server, + prio, state->number); +} + +/* + * Targetting rules: In order to avoid losing track of + * pending interrupts across mask and unmask, which would + * allow queue overflows, we implement the following rules: + * + * - Unless it was never enabled (or we run out of capacity) + * an interrupt is always targetted at a valid server/queue + * pair even when "masked" by the guest. This pair tends to + * be the last one used but it can be changed under some + * circumstances. That allows us to separate targetting + * from masking; we only handle accounting during (re)targetting. + * This also allows us to let an interrupt drain into its target + * queue after masking, avoiding complex schemes to remove + * interrupts out of remote processor queues. + * + * - When masking, we set PQ to 10 and save the previous value + * of P and Q. + * + * - When unmasking, if saved Q was set, we set PQ to 11 + * otherwise we leave PQ to the HW state which will be either + * 10 if nothing happened or 11 if the interrupt fired while + * masked. Effectively we are OR'ing the previous Q into the + * HW Q. + * + * Then if saved P is clear, we do an effective EOI (Q->P->Trigger) + * which will unmask the interrupt and shoot a new one if Q was + * set. + * + * Otherwise (saved P is set) we leave PQ unchanged (so 10 or 11, + * effectively meaning an H_EOI from the guest is still expected + * for that interrupt). + * + * - If H_EOI occurs while masked, we clear the saved P. + * + * - When changing target, we account on the new target and + * increment a separate "pending" counter on the old one. + * This pending counter will be used to decrement the old + * target's count when its queue has been observed empty. + */ + +int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, + u32 priority) +{ + struct kvmppc_xive *xive = kvm->arch.xive; + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + u8 new_act_prio; + int rc = 0; + u16 idx; + + if (!xive) + return -ENODEV; + + pr_devel("set_xive ! irq 0x%x server 0x%x prio %d\n", + irq, server, priority); + + /* First, check provisioning of queues */ + if (priority != MASKED) + rc = xive_check_provisioning(xive->kvm, + xive_prio_from_guest(priority)); + if (rc) { + pr_devel(" provisioning failure %d !\n", rc); + return rc; + } + + sb = kvmppc_xive_find_source(xive, irq, &idx); + if (!sb) + return -EINVAL; + state = &sb->irq_state[idx]; + + /* + * We first handle masking/unmasking since the locking + * might need to be retried due to EOIs, we'll handle + * targetting changes later. These functions will return + * with the SB lock held. + * + * xive_lock_and_mask() will also set state->guest_priority + * but won't otherwise change other fields of the state. + * + * xive_lock_for_unmask will not actually unmask, this will + * be done later by xive_finish_unmask() once the targetting + * has been done, so we don't try to unmask an interrupt + * that hasn't yet been targetted. + */ + if (priority == MASKED) + xive_lock_and_mask(xive, sb, state); + else + xive_lock_for_unmask(sb, state); + + + /* + * Then we handle targetting.
+ * + * First calculate a new "actual priority" + */ + new_act_prio = state->act_priority; + if (priority != MASKED) + new_act_prio = xive_prio_from_guest(priority); + + pr_devel(" new_act_prio=%x act_server=%x act_prio=%x\n", + new_act_prio, state->act_server, state->act_priority); + + /* + * Then check if we actually need to change anything, + * + * The condition for re-targetting the interrupt is that + * we have a valid new priority (new_act_prio is not 0xff) + * and either the server or the priority changed. + * + * Note: If act_priority was ff and the new priority is + * also ff, we don't do anything and leave the interrupt + * untargetted. An attempt to do an int_on on an + * untargetted interrupt will fail. If that is a problem + * we could initialize interrupts with a valid default. + */ + + if (new_act_prio != MASKED && + (state->act_server != server || + state->act_priority != new_act_prio)) + rc = xive_target_interrupt(kvm, state, server, new_act_prio); + + /* + * Perform the final unmasking of the interrupt source + * if necessary + */ + if (priority != MASKED) + xive_finish_unmask(xive, sb, state, priority); + + /* + * Finally update saved_priority to match. Only int_on/off + * set this field to a different value. + */ + state->saved_priority = priority; + + arch_spin_unlock(&sb->lock); + return rc; +} + +int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server, + u32 *priority) +{ + struct kvmppc_xive *xive = kvm->arch.xive; + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + u16 idx; + + if (!xive) + return -ENODEV; + + sb = kvmppc_xive_find_source(xive, irq, &idx); + if (!sb) + return -EINVAL; + state = &sb->irq_state[idx]; + arch_spin_lock(&sb->lock); + *server = state->guest_server; + *priority = state->guest_priority; + arch_spin_unlock(&sb->lock); + + return 0; +} + +int kvmppc_xive_int_on(struct kvm *kvm, u32 irq) +{ + struct kvmppc_xive *xive = kvm->arch.xive; + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + u16 idx; + + if (!xive) + return -ENODEV; + + sb = kvmppc_xive_find_source(xive, irq, &idx); + if (!sb) + return -EINVAL; + state = &sb->irq_state[idx]; + + pr_devel("int_on(irq=0x%x)\n", irq); + + /* + * Check if interrupt was not targetted + */ + if (state->act_priority == MASKED) { + pr_devel("int_on on untargetted interrupt\n"); + return -EINVAL; + } + + /* If saved_priority is 0xff, do nothing */ + if (state->saved_priority == MASKED) + return 0; + + /* + * Lock and unmask it. + */ + xive_lock_for_unmask(sb, state); + xive_finish_unmask(xive, sb, state, state->saved_priority); + arch_spin_unlock(&sb->lock); + + return 0; +} + +int kvmppc_xive_int_off(struct kvm *kvm, u32 irq) +{ + struct kvmppc_xive *xive = kvm->arch.xive; + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + u16 idx; + + if (!xive) + return -ENODEV; + + sb = kvmppc_xive_find_source(xive, irq, &idx); + if (!sb) + return -EINVAL; + state = &sb->irq_state[idx]; + + pr_devel("int_off(irq=0x%x)\n", irq); + + /* + * Lock and mask + */ + state->saved_priority = xive_lock_and_mask(xive, sb, state); + arch_spin_unlock(&sb->lock); + + return 0; +} + +static bool xive_restore_pending_irq(struct kvmppc_xive *xive, u32 irq) +{ + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + u16 idx; + + sb = kvmppc_xive_find_source(xive, irq, &idx); + if (!sb) + return false; + state = &sb->irq_state[idx]; + if (!state->valid) + return false; + + /* + * Trigger the IPI.
This assumes we never restore a pass-through + * interrupt which should be safe enough + */ + xive_irq_trigger(&state->ipi_data); + + return true; +} + +u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + + if (!xc) + return 0; + + /* Return the per-cpu state for state saving/migration */ + return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT | + (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT; +} + +int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct kvmppc_xive *xive = vcpu->kvm->arch.xive; + u8 cppr, mfrr; + u32 xisr; + + if (!xc || !xive) + return -ENOENT; + + /* Grab individual state fields. We don't use pending_pri */ + cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT; + xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) & + KVM_REG_PPC_ICP_XISR_MASK; + mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT; + + pr_devel("set_icp vcpu %d cppr=0x%x mfrr=0x%x xisr=0x%x\n", + xc->server_num, cppr, mfrr, xisr); + + /* + * We can't update the state of a "pushed" VCPU, but that + * shouldn't happen. + */ + if (WARN_ON(vcpu->arch.xive_pushed)) + return -EIO; + + /* Update VCPU HW saved state */ + vcpu->arch.xive_saved_state.cppr = cppr; + xc->hw_cppr = xc->cppr = cppr; + + /* + * Update MFRR state. If it's not 0xff, we mark the VCPU as + * having a pending MFRR change, which will re-evaluate the + * target. The VCPU will thus potentially get a spurious + * interrupt but that's not a big deal. + */ + xc->mfrr = mfrr; + if (mfrr < cppr) + xive_irq_trigger(&xc->vp_ipi_data); + + /* + * Now saved XIRR is "interesting". It means there's something in + * the legacy "1 element" queue... for an IPI we simply ignore it, + * as the MFRR restore will handle that. For anything else we need + * to force a resend of the source. + * However the source may not have been set up yet. If that's the + * case, we keep that info and increment a counter in the xive to + * tell subsequent xive_set_source() to go look. + */ + if (xisr > XICS_IPI && !xive_restore_pending_irq(xive, xisr)) { + xc->delayed_irq = xisr; + xive->delayed_irqs++; + pr_devel(" xisr restore delayed\n"); + } + + return 0; +} + +int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, + struct irq_desc *host_desc) +{ + struct kvmppc_xive *xive = kvm->arch.xive; + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + struct irq_data *host_data = irq_desc_get_irq_data(host_desc); + unsigned int host_irq = irq_desc_get_irq(host_desc); + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data); + u16 idx; + u8 prio; + int rc; + + if (!xive) + return -ENODEV; + + pr_devel("set_mapped girq 0x%lx host HW irq 0x%x...\n", guest_irq, hw_irq); + + sb = kvmppc_xive_find_source(xive, guest_irq, &idx); + if (!sb) + return -EINVAL; + state = &sb->irq_state[idx]; + + /* + * Mark the passed-through interrupt as going to a VCPU, + * this will prevent further EOIs and similar operations + * from the XIVE code. It will also mask the interrupt + * to either PQ=10 or 11 state, the latter if the interrupt + * is pending. This will allow us to unmask or retrigger it + * after routing it to the guest with a simple EOI. + * + * The "state" argument is a "token", all it needs is to be + * non-NULL to switch to passed-through or NULL for the + * other way around. We may not yet have an actual VCPU + * target here and we don't really care.
+ */ + rc = irq_set_vcpu_affinity(host_irq, state); + if (rc) { + pr_err("Failed to set VCPU affinity for irq %d\n", host_irq); + return rc; + } + + /* + * Mask and read state of IPI. We need to know if its P bit + * is set as that means it's potentially already using a + * queue entry in the target + */ + prio = xive_lock_and_mask(xive, sb, state); + pr_devel(" old IPI prio %02x P:%d Q:%d\n", prio, + state->old_p, state->old_q); + + /* Turn the IPI hard off */ + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01); + + /* Grab info about irq */ + state->pt_number = hw_irq; + state->pt_data = irq_data_get_irq_handler_data(host_data); + + /* + * Configure the IRQ to match the existing configuration of + * the IPI if it was already targetted. Otherwise this will + * mask the interrupt in a lossy way (act_priority is 0xff) + * which is fine for a never started interrupt. + */ + xive_native_configure_irq(hw_irq, + xive->vp_base + state->act_server, + state->act_priority, state->number); + + /* + * We do an EOI to enable the interrupt (and retrigger if needed) + * if the guest has the interrupt unmasked and the P bit was *not* + * set in the IPI. If it was set, we know a slot may still be in + * use in the target queue thus we have to wait for a guest + * originated EOI + */ + if (prio != MASKED && !state->old_p) + xive_vm_source_eoi(hw_irq, state->pt_data); + + /* Clear old_p/old_q as they are no longer relevant */ + state->old_p = state->old_q = false; + + /* Restore guest prio (unlocks EOI) */ + mb(); + state->guest_priority = prio; + arch_spin_unlock(&sb->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(kvmppc_xive_set_mapped); + +int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, + struct irq_desc *host_desc) +{ + struct kvmppc_xive *xive = kvm->arch.xive; + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + unsigned int host_irq = irq_desc_get_irq(host_desc); + u16 idx; + u8 prio; + int rc; + + if (!xive) + return -ENODEV; + + pr_devel("clr_mapped girq 0x%lx...\n", guest_irq); + + sb = kvmppc_xive_find_source(xive, guest_irq, &idx); + if (!sb) + return -EINVAL; + state = &sb->irq_state[idx]; + + /* + * Mask and read state of IRQ. We need to know if its P bit + * is set as that means it's potentially already using a + * queue entry in the target + */ + prio = xive_lock_and_mask(xive, sb, state); + pr_devel(" old IRQ prio %02x P:%d Q:%d\n", prio, + state->old_p, state->old_q); + + /* + * If old_p is set, the interrupt is pending, we switch it to + * PQ=11. This will force a resend in the host so the interrupt + * isn't lost to whatever host driver may pick it up + */ + if (state->old_p) + xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_11); + + /* Release the passed-through interrupt to the host */ + rc = irq_set_vcpu_affinity(host_irq, NULL); + if (rc) { + pr_err("Failed to clr VCPU affinity for irq %d\n", host_irq); + return rc; + } + + /* Forget about the IRQ */ + state->pt_number = 0; + state->pt_data = NULL; + + /* Reconfigure the IPI */ + xive_native_configure_irq(state->ipi_number, + xive->vp_base + state->act_server, + state->act_priority, state->number); + + /* + * If old_p is set (we have a queue entry potentially + * occupied) or the interrupt is masked, we set the IPI + * to PQ=10 state. Otherwise we just re-enable it (PQ=00).
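+ * (Summary of the PQ values this scheme uses: 00 = enabled, 10 = masked with no queue entry outstanding, 11 = masked with a potential queue entry outstanding, 01 = IPI parked hard off while the source is passed through.)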
+ */ + if (prio == MASKED || state->old_p) + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_10); + else + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_00); + + /* Restore guest prio (unlocks EOI) */ + mb(); + state->guest_priority = prio; + arch_spin_unlock(&sb->lock); + + return 0; +} +EXPORT_SYMBOL_GPL(kvmppc_xive_clr_mapped); + +static void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct kvm *kvm = vcpu->kvm; + struct kvmppc_xive *xive = kvm->arch.xive; + int i, j; + + for (i = 0; i <= xive->max_sbid; i++) { + struct kvmppc_xive_src_block *sb = xive->src_blocks[i]; + + if (!sb) + continue; + for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) { + struct kvmppc_xive_irq_state *state = &sb->irq_state[j]; + + if (!state->valid) + continue; + if (state->act_priority == MASKED) + continue; + if (state->act_server != xc->server_num) + continue; + + /* Clean it up */ + arch_spin_lock(&sb->lock); + state->act_priority = MASKED; + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01); + xive_native_configure_irq(state->ipi_number, 0, MASKED, 0); + if (state->pt_number) { + xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01); + xive_native_configure_irq(state->pt_number, 0, MASKED, 0); + } + arch_spin_unlock(&sb->lock); + } + } +} + +void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct kvmppc_xive *xive = xc->xive; + int i; + + pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num); + + /* Ensure no interrupt is still routed to that VP */ + xc->valid = false; + kvmppc_xive_disable_vcpu_interrupts(vcpu); + + /* Mask the VP IPI */ + xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01); + + /* Disable the VP */ + xive_native_disable_vp(xc->vp_id); + + /* Free the queues & associated interrupts */ + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { + struct xive_q *q = &xc->queues[i]; + + /* Free the escalation irq */ + if (xc->esc_virq[i]) { + free_irq(xc->esc_virq[i], vcpu); + irq_dispose_mapping(xc->esc_virq[i]); + kfree(xc->esc_virq_names[i]); + } + /* Free the queue */ + xive_native_disable_queue(xc->vp_id, q, i); + if (q->qpage) { + free_pages((unsigned long)q->qpage, + xive->q_page_order); + q->qpage = NULL; + } + } + + /* Free the IPI */ + if (xc->vp_ipi) { + xive_cleanup_irq_data(&xc->vp_ipi_data); + xive_native_free_irq(xc->vp_ipi); + } + /* Free the VP */ + kfree(xc); +} + +int kvmppc_xive_connect_vcpu(struct kvm_device *dev, + struct kvm_vcpu *vcpu, u32 cpu) +{ + struct kvmppc_xive *xive = dev->private; + struct kvmppc_xive_vcpu *xc; + int i, r = -EBUSY; + + pr_devel("connect_vcpu(cpu=%d)\n", cpu); + + if (dev->ops != &kvm_xive_ops) { + pr_devel("Wrong ops !\n"); + return -EPERM; + } + if (xive->kvm != vcpu->kvm) + return -EPERM; + if (vcpu->arch.irq_type) + return -EBUSY; + if (kvmppc_xive_find_server(vcpu->kvm, cpu)) { + pr_devel("Duplicate !\n"); + return -EEXIST; + } + if (cpu >= KVM_MAX_VCPUS) { + pr_devel("Out of bounds !\n"); + return -EINVAL; + } + xc = kzalloc(sizeof(*xc), GFP_KERNEL); + if (!xc) + return -ENOMEM; + + /* We need to synchronize with queue provisioning */ + mutex_lock(&vcpu->kvm->lock); + vcpu->arch.xive_vcpu = xc; + xc->xive = xive; + xc->vcpu = vcpu; + xc->server_num = cpu; + xc->vp_id = xive->vp_base + cpu; + xc->mfrr = 0xff; + xc->valid = true; + + r = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id); + if (r) + goto bail; + + /* Configure VCPU fields for use by assembly push/pull */ + 
vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000); + vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO); + + /* Allocate IPI */ + xc->vp_ipi = xive_native_alloc_irq(); + if (!xc->vp_ipi) { + r = -EIO; + goto bail; + } + pr_devel(" IPI=0x%x\n", xc->vp_ipi); + + r = xive_native_populate_irq_data(xc->vp_ipi, &xc->vp_ipi_data); + if (r) + goto bail; + + /* + * Initialize queues. Initially we set them all for no queueing + * and we enable escalation for queue 0 only which we'll use for + * our mfrr change notifications. If the VCPU is hot-plugged, we + * do handle provisioning however. + */ + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { + struct xive_q *q = &xc->queues[i]; + + /* Is queue already enabled ? Provision it */ + if (xive->qmap & (1 << i)) { + r = xive_provision_queue(vcpu, i); + if (r == 0) + xive_attach_escalation(vcpu, i); + if (r) + goto bail; + } else { + r = xive_native_configure_queue(xc->vp_id, + q, i, NULL, 0, true); + if (r) { + pr_err("Failed to configure queue %d for VCPU %d\n", + i, cpu); + goto bail; + } + } + } + + /* If not done above, attach priority 0 escalation */ + r = xive_attach_escalation(vcpu, 0); + if (r) + goto bail; + + /* Enable the VP */ + r = xive_native_enable_vp(xc->vp_id); + if (r) + goto bail; + + /* Route the IPI */ + r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI); + if (!r) + xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_00); + +bail: + mutex_unlock(&vcpu->kvm->lock); + if (r) { + kvmppc_xive_cleanup_vcpu(vcpu); + return r; + } + + vcpu->arch.irq_type = KVMPPC_IRQ_XICS; + return 0; +} + +/* + * Scanning of queues before/after migration save + */ +static void xive_pre_save_set_queued(struct kvmppc_xive *xive, u32 irq) +{ + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + u16 idx; + + sb = kvmppc_xive_find_source(xive, irq, &idx); + if (!sb) + return; + + state = &sb->irq_state[idx]; + + /* Some sanity checking */ + if (!state->valid) { + pr_err("invalid irq 0x%x in cpu queue!\n", irq); + return; + } + + /* + * If the interrupt is in a queue it should have P set. + * We warn so that gets reported. A backtrace isn't useful + * so no need to use a WARN_ON. + */ + if (!state->saved_p) + pr_err("Interrupt 0x%x is marked in a queue but P not set !\n", irq); + + /* Set flag */ + state->in_queue = true; +} + +static void xive_pre_save_mask_irq(struct kvmppc_xive *xive, + struct kvmppc_xive_src_block *sb, + u32 irq) +{ + struct kvmppc_xive_irq_state *state = &sb->irq_state[irq]; + + if (!state->valid) + return; + + /* Mask and save state, this will also sync HW queues */ + state->saved_scan_prio = xive_lock_and_mask(xive, sb, state); + + /* Transfer P and Q */ + state->saved_p = state->old_p; + state->saved_q = state->old_q; + + /* Unlock */ + arch_spin_unlock(&sb->lock); +} + +static void xive_pre_save_unmask_irq(struct kvmppc_xive *xive, + struct kvmppc_xive_src_block *sb, + u32 irq) +{ + struct kvmppc_xive_irq_state *state = &sb->irq_state[irq]; + + if (!state->valid) + return; + + /* + * Lock / exclude EOI (not technically necessary if the + * guest isn't running concurrently. If this becomes a + * performance issue we can probably remove the lock. 
+ */
+ xive_lock_for_unmask(sb, state);
+
+ /* Restore mask/prio if it wasn't masked */
+ if (state->saved_scan_prio != MASKED)
+ xive_finish_unmask(xive, sb, state, state->saved_scan_prio);
+
+ /* Unlock */
+ arch_spin_unlock(&sb->lock);
+}
+
+static void xive_pre_save_queue(struct kvmppc_xive *xive, struct xive_q *q)
+{
+ u32 idx = q->idx;
+ u32 toggle = q->toggle;
+ u32 irq;
+
+ do {
+ irq = __xive_read_eq(q->qpage, q->msk, &idx, &toggle);
+ if (irq > XICS_IPI)
+ xive_pre_save_set_queued(xive, irq);
+ } while (irq);
+}
+
+static void xive_pre_save_scan(struct kvmppc_xive *xive)
+{
+ struct kvm_vcpu *vcpu = NULL;
+ int i, j;
+
+ /*
+ * See comment in xive_get_source() about how this
+ * works. Collect a stable state for all interrupts.
+ */
+ for (i = 0; i <= xive->max_sbid; i++) {
+ struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
+ if (!sb)
+ continue;
+ for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++)
+ xive_pre_save_mask_irq(xive, sb, j);
+ }
+
+ /* Then scan the queues and update the "in_queue" flag */
+ kvm_for_each_vcpu(i, vcpu, xive->kvm) {
+ struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+ if (!xc)
+ continue;
+ for (j = 0; j < KVMPPC_XIVE_Q_COUNT; j++) {
+ if (xc->queues[j].qpage)
+ xive_pre_save_queue(xive, &xc->queues[j]);
+ }
+ }
+
+ /* Finally restore interrupt states */
+ for (i = 0; i <= xive->max_sbid; i++) {
+ struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
+ if (!sb)
+ continue;
+ for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++)
+ xive_pre_save_unmask_irq(xive, sb, j);
+ }
+}
+
+static void xive_post_save_scan(struct kvmppc_xive *xive)
+{
+ u32 i, j;
+
+ /* Clear all the in_queue flags */
+ for (i = 0; i <= xive->max_sbid; i++) {
+ struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
+ if (!sb)
+ continue;
+ for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++)
+ sb->irq_state[j].in_queue = false;
+ }
+
+ /* Next get_source() will do a new scan */
+ xive->saved_src_count = 0;
+}
+
+/*
+ * This returns the source configuration and state to user space.
+ */
+static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr)
+{
+ struct kvmppc_xive_src_block *sb;
+ struct kvmppc_xive_irq_state *state;
+ u64 __user *ubufp = (u64 __user *) addr;
+ u64 val, prio;
+ u16 idx;
+
+ sb = kvmppc_xive_find_source(xive, irq, &idx);
+ if (!sb)
+ return -ENOENT;
+
+ state = &sb->irq_state[idx];
+
+ if (!state->valid)
+ return -ENOENT;
+
+ pr_devel("get_source(%ld)...\n", irq);
+
+ /*
+ * So to properly save the state into something that looks like a
+ * XICS migration stream we cannot treat interrupts individually.
+ *
+ * We need, instead, to mask them all (& save their previous PQ state)
+ * to get a stable state in the HW, then sync them to ensure that
+ * any interrupt that had already fired hits its queue, and finally
+ * scan all the queues to collect which interrupts are still present
+ * in the queues, so we can set the "pending" flag on them and
+ * they can be resent on restore.
+ *
+ * So we do it all when the "first" interrupt gets saved, all the
+ * state is collected at that point, the rest of xive_get_source()
+ * will merely collect and convert that state to the expected
+ * userspace bit mask.
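+ *
+ * Schematically, the first call in a save cycle thus does:
+ *
+ *   xive_pre_save_scan()
+ *     xive_pre_save_mask_irq()    - mask every source, save P/Q
+ *     xive_pre_save_queue()       - walk the queues, set in_queue
+ *     xive_pre_save_unmask_irq()  - restore the previous masking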
+ */
+ if (xive->saved_src_count == 0)
+ xive_pre_save_scan(xive);
+ xive->saved_src_count++;
+
+ /* Convert saved state into something compatible with xics */
+ val = state->guest_server;
+ prio = state->saved_scan_prio;
+
+ if (prio == MASKED) {
+ val |= KVM_XICS_MASKED;
+ prio = state->saved_priority;
+ }
+ val |= prio << KVM_XICS_PRIORITY_SHIFT;
+ if (state->lsi) {
+ val |= KVM_XICS_LEVEL_SENSITIVE;
+ if (state->saved_p)
+ val |= KVM_XICS_PENDING;
+ } else {
+ if (state->saved_p)
+ val |= KVM_XICS_PRESENTED;
+
+ if (state->saved_q)
+ val |= KVM_XICS_QUEUED;
+
+ /*
+ * We mark it pending (which will attempt a re-delivery)
+ * if we are in a queue *or* we were masked and had
+ * Q set which is equivalent to the XICS "masked pending"
+ * state
+ */
+ if (state->in_queue || (prio == MASKED && state->saved_q))
+ val |= KVM_XICS_PENDING;
+ }
+
+ /*
+ * If that was the last interrupt saved, reset the
+ * in_queue flags
+ */
+ if (xive->saved_src_count == xive->src_count)
+ xive_post_save_scan(xive);
+
+ /* Copy the result to userspace */
+ if (put_user(val, ubufp))
+ return -EFAULT;
+
+ return 0;
+}
+
+static struct kvmppc_xive_src_block *xive_create_src_block(struct kvmppc_xive *xive,
+ int irq)
+{
+ struct kvm *kvm = xive->kvm;
+ struct kvmppc_xive_src_block *sb;
+ int i, bid;
+
+ bid = irq >> KVMPPC_XICS_ICS_SHIFT;
+
+ mutex_lock(&kvm->lock);
+
+ /* block already exists - somebody else got here first */
+ if (xive->src_blocks[bid])
+ goto out;
+
+ /* Create the ICS */
+ sb = kzalloc(sizeof(*sb), GFP_KERNEL);
+ if (!sb)
+ goto out;
+
+ sb->id = bid;
+
+ for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+ sb->irq_state[i].number = (bid << KVMPPC_XICS_ICS_SHIFT) | i;
+ sb->irq_state[i].guest_priority = MASKED;
+ sb->irq_state[i].saved_priority = MASKED;
+ sb->irq_state[i].act_priority = MASKED;
+ }
+ smp_wmb();
+ xive->src_blocks[bid] = sb;
+
+ if (bid > xive->max_sbid)
+ xive->max_sbid = bid;
+
+out:
+ mutex_unlock(&kvm->lock);
+ return xive->src_blocks[bid];
+}
+
+static bool xive_check_delayed_irq(struct kvmppc_xive *xive, u32 irq)
+{
+ struct kvm *kvm = xive->kvm;
+ struct kvm_vcpu *vcpu = NULL;
+ int i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+
+ if (!xc)
+ continue;
+
+ if (xc->delayed_irq == irq) {
+ xc->delayed_irq = 0;
+ xive->delayed_irqs--;
+ return true;
+ }
+ }
+ return false;
+}
+
+static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
+{
+ struct kvmppc_xive_src_block *sb;
+ struct kvmppc_xive_irq_state *state;
+ u64 __user *ubufp = (u64 __user *) addr;
+ u16 idx;
+ u64 val;
+ u8 act_prio, guest_prio;
+ u32 server;
+ int rc = 0;
+
+ if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS)
+ return -ENOENT;
+
+ pr_devel("set_source(irq=0x%lx)\n", irq);
+
+ /* Find the source */
+ sb = kvmppc_xive_find_source(xive, irq, &idx);
+ if (!sb) {
+ pr_devel("No source, creating source block...\n");
+ sb = xive_create_src_block(xive, irq);
+ if (!sb) {
+ pr_devel("Failed to create block...\n");
+ return -ENOMEM;
+ }
+ }
+ state = &sb->irq_state[idx];
+
+ /* Read user passed data */
+ if (get_user(val, ubufp)) {
+ pr_devel("fault getting user info !\n");
+ return -EFAULT;
+ }
+
+ server = val & KVM_XICS_DESTINATION_MASK;
+ guest_prio = val >> KVM_XICS_PRIORITY_SHIFT;
+
+ pr_devel(" val=0x%016llx (server=0x%x, guest_prio=%d)\n",
+ val, server, guest_prio);
+ /*
+ * If the source doesn't already have an IPI, allocate
+ * one and get the corresponding data
+ */
+ if (!state->ipi_number) {
+ state->ipi_number =
xive_native_alloc_irq(); + if (state->ipi_number == 0) { + pr_devel("Failed to allocate IPI !\n"); + return -ENOMEM; + } + xive_native_populate_irq_data(state->ipi_number, &state->ipi_data); + pr_devel(" src_ipi=0x%x\n", state->ipi_number); + } + + /* + * We use lock_and_mask() to set us in the right masked + * state. We will override that state from the saved state + * further down, but this will handle the cases of interrupts + * that need FW masking. We set the initial guest_priority to + * 0 before calling it to ensure it actually performs the masking. + */ + state->guest_priority = 0; + xive_lock_and_mask(xive, sb, state); + + /* + * Now, we select a target if we have one. If we don't we + * leave the interrupt untargetted. It means that an interrupt + * can become "untargetted" accross migration if it was masked + * by set_xive() but there is little we can do about it. + */ + + /* First convert prio and mark interrupt as untargetted */ + act_prio = xive_prio_from_guest(guest_prio); + state->act_priority = MASKED; + state->guest_server = server; + + /* + * We need to drop the lock due to the mutex below. Hopefully + * nothing is touching that interrupt yet since it hasn't been + * advertized to a running guest yet + */ + arch_spin_unlock(&sb->lock); + + /* If we have a priority target the interrupt */ + if (act_prio != MASKED) { + /* First, check provisioning of queues */ + mutex_lock(&xive->kvm->lock); + rc = xive_check_provisioning(xive->kvm, act_prio); + mutex_unlock(&xive->kvm->lock); + + /* Target interrupt */ + if (rc == 0) + rc = xive_target_interrupt(xive->kvm, state, + server, act_prio); + /* + * If provisioning or targetting failed, leave it + * alone and masked. It will remain disabled until + * the guest re-targets it. + */ + } + + /* + * Find out if this was a delayed irq stashed in an ICP, + * in which case, treat it as pending + */ + if (xive->delayed_irqs && xive_check_delayed_irq(xive, irq)) { + val |= KVM_XICS_PENDING; + pr_devel(" Found delayed ! forcing PENDING !\n"); + } + + /* Cleanup the SW state */ + state->old_p = false; + state->old_q = false; + state->lsi = false; + state->asserted = false; + + /* Restore LSI state */ + if (val & KVM_XICS_LEVEL_SENSITIVE) { + state->lsi = true; + if (val & KVM_XICS_PENDING) + state->asserted = true; + pr_devel(" LSI ! Asserted=%d\n", state->asserted); + } + + /* + * Restore P and Q. If the interrupt was pending, we + * force both P and Q, which will trigger a resend. + * + * That means that a guest that had both an interrupt + * pending (queued) and Q set will restore with only + * one instance of that interrupt instead of 2, but that + * is perfectly fine as coalescing interrupts that haven't + * been presented yet is always allowed. + */ + if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING) + state->old_p = true; + if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING) + state->old_q = true; + + pr_devel(" P=%d, Q=%d\n", state->old_p, state->old_q); + + /* + * If the interrupt was unmasked, update guest priority and + * perform the appropriate state transition and do a + * re-trigger if necessary. 
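+ *
+ * In short (see the code below): masked -> keep the source masked
+ * and stash the prio in saved_priority; unmasked -> let
+ * xive_finish_unmask() re-enable the source at guest_prio.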
+ */ + if (val & KVM_XICS_MASKED) { + pr_devel(" masked, saving prio\n"); + state->guest_priority = MASKED; + state->saved_priority = guest_prio; + } else { + pr_devel(" unmasked, restoring to prio %d\n", guest_prio); + xive_finish_unmask(xive, sb, state, guest_prio); + state->saved_priority = guest_prio; + } + + /* Increment the number of valid sources and mark this one valid */ + if (!state->valid) + xive->src_count++; + state->valid = true; + + return 0; +} + +int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, + bool line_status) +{ + struct kvmppc_xive *xive = kvm->arch.xive; + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + u16 idx; + + if (!xive) + return -ENODEV; + + sb = kvmppc_xive_find_source(xive, irq, &idx); + if (!sb) + return -EINVAL; + + /* Perform locklessly .... (we need to do some RCUisms here...) */ + state = &sb->irq_state[idx]; + if (!state->valid) + return -EINVAL; + + /* We don't allow a trigger on a passed-through interrupt */ + if (state->pt_number) + return -EINVAL; + + if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL) + state->asserted = 1; + else if (level == 0 || level == KVM_INTERRUPT_UNSET) { + state->asserted = 0; + return 0; + } + + /* Trigger the IPI */ + xive_irq_trigger(&state->ipi_data); + + return 0; +} + +static int xive_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + struct kvmppc_xive *xive = dev->private; + + /* We honor the existing XICS ioctl */ + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + return xive_set_source(xive, attr->attr, attr->addr); + } + return -ENXIO; +} + +static int xive_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + struct kvmppc_xive *xive = dev->private; + + /* We honor the existing XICS ioctl */ + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + return xive_get_source(xive, attr->attr, attr->addr); + } + return -ENXIO; +} + +static int xive_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + /* We honor the same limits as XICS, at least for now */ + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + if (attr->attr >= KVMPPC_XICS_FIRST_IRQ && + attr->attr < KVMPPC_XICS_NR_IRQS) + return 0; + break; + } + return -ENXIO; +} + +static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd) +{ + xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01); + xive_native_configure_irq(hw_num, 0, MASKED, 0); + xive_cleanup_irq_data(xd); +} + +static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb) +{ + int i; + + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { + struct kvmppc_xive_irq_state *state = &sb->irq_state[i]; + + if (!state->valid) + continue; + + kvmppc_xive_cleanup_irq(state->ipi_number, &state->ipi_data); + xive_native_free_irq(state->ipi_number); + + /* Pass-through, cleanup too */ + if (state->pt_number) + kvmppc_xive_cleanup_irq(state->pt_number, state->pt_data); + + state->valid = false; + } +} + +static void kvmppc_xive_free(struct kvm_device *dev) +{ + struct kvmppc_xive *xive = dev->private; + struct kvm *kvm = xive->kvm; + int i; + + debugfs_remove(xive->dentry); + + if (kvm) + kvm->arch.xive = NULL; + + /* Mask and free interrupts */ + for (i = 0; i <= xive->max_sbid; i++) { + if (xive->src_blocks[i]) + kvmppc_xive_free_sources(xive->src_blocks[i]); + kfree(xive->src_blocks[i]); + xive->src_blocks[i] = NULL; + } + + if (xive->vp_base != XIVE_INVALID_VP) + xive_native_free_vp_block(xive->vp_base); + + + kfree(xive); + kfree(dev); +} + +static int 
kvmppc_xive_create(struct kvm_device *dev, u32 type) +{ + struct kvmppc_xive *xive; + struct kvm *kvm = dev->kvm; + int ret = 0; + + pr_devel("Creating xive for partition\n"); + + xive = kzalloc(sizeof(*xive), GFP_KERNEL); + if (!xive) + return -ENOMEM; + + dev->private = xive; + xive->dev = dev; + xive->kvm = kvm; + + /* Already there ? */ + if (kvm->arch.xive) + ret = -EEXIST; + else + kvm->arch.xive = xive; + + /* We use the default queue size set by the host */ + xive->q_order = xive_native_default_eq_shift(); + if (xive->q_order < PAGE_SHIFT) + xive->q_page_order = 0; + else + xive->q_page_order = xive->q_order - PAGE_SHIFT; + + /* Allocate a bunch of VPs */ + xive->vp_base = xive_native_alloc_vp_block(KVM_MAX_VCPUS); + pr_devel("VP_Base=%x\n", xive->vp_base); + + if (xive->vp_base == XIVE_INVALID_VP) + ret = -ENOMEM; + + if (ret) { + kfree(xive); + return ret; + } + + return 0; +} + + +static int xive_debug_show(struct seq_file *m, void *private) +{ + struct kvmppc_xive *xive = m->private; + struct kvm *kvm = xive->kvm; + struct kvm_vcpu *vcpu; + u64 t_rm_h_xirr = 0; + u64 t_rm_h_ipoll = 0; + u64 t_rm_h_cppr = 0; + u64 t_rm_h_eoi = 0; + u64 t_rm_h_ipi = 0; + u64 t_vm_h_xirr = 0; + u64 t_vm_h_ipoll = 0; + u64 t_vm_h_cppr = 0; + u64 t_vm_h_eoi = 0; + u64 t_vm_h_ipi = 0; + unsigned int i; + + if (!kvm) + return 0; + + seq_printf(m, "=========\nVCPU state\n=========\n"); + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + + if (!xc) + continue; + + seq_printf(m, "cpu server %#x CPPR:%#x HWCPPR:%#x" + " MFRR:%#x PEND:%#x h_xirr: R=%lld V=%lld\n", + xc->server_num, xc->cppr, xc->hw_cppr, + xc->mfrr, xc->pending, + xc->stat_rm_h_xirr, xc->stat_vm_h_xirr); + + t_rm_h_xirr += xc->stat_rm_h_xirr; + t_rm_h_ipoll += xc->stat_rm_h_ipoll; + t_rm_h_cppr += xc->stat_rm_h_cppr; + t_rm_h_eoi += xc->stat_rm_h_eoi; + t_rm_h_ipi += xc->stat_rm_h_ipi; + t_vm_h_xirr += xc->stat_vm_h_xirr; + t_vm_h_ipoll += xc->stat_vm_h_ipoll; + t_vm_h_cppr += xc->stat_vm_h_cppr; + t_vm_h_eoi += xc->stat_vm_h_eoi; + t_vm_h_ipi += xc->stat_vm_h_ipi; + } + + seq_printf(m, "Hcalls totals\n"); + seq_printf(m, " H_XIRR R=%10lld V=%10lld\n", t_rm_h_xirr, t_vm_h_xirr); + seq_printf(m, " H_IPOLL R=%10lld V=%10lld\n", t_rm_h_ipoll, t_vm_h_ipoll); + seq_printf(m, " H_CPPR R=%10lld V=%10lld\n", t_rm_h_cppr, t_vm_h_cppr); + seq_printf(m, " H_EOI R=%10lld V=%10lld\n", t_rm_h_eoi, t_vm_h_eoi); + seq_printf(m, " H_IPI R=%10lld V=%10lld\n", t_rm_h_ipi, t_vm_h_ipi); + + return 0; +} + +static int xive_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, xive_debug_show, inode->i_private); +} + +static const struct file_operations xive_debug_fops = { + .open = xive_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void xive_debugfs_init(struct kvmppc_xive *xive) +{ + char *name; + + name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive); + if (!name) { + pr_err("%s: no memory for name\n", __func__); + return; + } + + xive->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root, + xive, &xive_debug_fops); + + pr_debug("%s: created %s\n", __func__, name); + kfree(name); +} + +static void kvmppc_xive_init(struct kvm_device *dev) +{ + struct kvmppc_xive *xive = (struct kvmppc_xive *)dev->private; + + /* Register some debug interfaces */ + xive_debugfs_init(xive); +} + +struct kvm_device_ops kvm_xive_ops = { + .name = "kvm-xive", + .create = kvmppc_xive_create, + .init = kvmppc_xive_init, + .destroy = kvmppc_xive_free, + 
.set_attr = xive_set_attr, + .get_attr = xive_get_attr, + .has_attr = xive_has_attr, +}; + +void kvmppc_xive_init_module(void) +{ + __xive_vm_h_xirr = xive_vm_h_xirr; + __xive_vm_h_ipoll = xive_vm_h_ipoll; + __xive_vm_h_ipi = xive_vm_h_ipi; + __xive_vm_h_cppr = xive_vm_h_cppr; + __xive_vm_h_eoi = xive_vm_h_eoi; +} + +void kvmppc_xive_exit_module(void) +{ + __xive_vm_h_xirr = NULL; + __xive_vm_h_ipoll = NULL; + __xive_vm_h_ipi = NULL; + __xive_vm_h_cppr = NULL; + __xive_vm_h_eoi = NULL; +} diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h new file mode 100644 index 000000000000..5938f7644dc1 --- /dev/null +++ b/arch/powerpc/kvm/book3s_xive.h @@ -0,0 +1,256 @@ +/* + * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#ifndef _KVM_PPC_BOOK3S_XIVE_H +#define _KVM_PPC_BOOK3S_XIVE_H + +#ifdef CONFIG_KVM_XICS +#include "book3s_xics.h" + +/* + * State for one guest irq source. + * + * For each guest source we allocate a HW interrupt in the XIVE + * which we use for all SW triggers. It will be unused for + * pass-through but it's easier to keep around as the same + * guest interrupt can alternatively be emulated or pass-through + * if a physical device is hot unplugged and replaced with an + * emulated one. + * + * This state structure is very similar to the XICS one with + * additional XIVE specific tracking. + */ +struct kvmppc_xive_irq_state { + bool valid; /* Interrupt entry is valid */ + + u32 number; /* Guest IRQ number */ + u32 ipi_number; /* XIVE IPI HW number */ + struct xive_irq_data ipi_data; /* XIVE IPI associated data */ + u32 pt_number; /* XIVE Pass-through number if any */ + struct xive_irq_data *pt_data; /* XIVE Pass-through associated data */ + + /* Targetting as set by guest */ + u32 guest_server; /* Current guest selected target */ + u8 guest_priority; /* Guest set priority */ + u8 saved_priority; /* Saved priority when masking */ + + /* Actual targetting */ + u32 act_server; /* Actual server */ + u8 act_priority; /* Actual priority */ + + /* Various state bits */ + bool in_eoi; /* Synchronize with H_EOI */ + bool old_p; /* P bit state when masking */ + bool old_q; /* Q bit state when masking */ + bool lsi; /* level-sensitive interrupt */ + bool asserted; /* Only for emulated LSI: current state */ + + /* Saved for migration state */ + bool in_queue; + bool saved_p; + bool saved_q; + u8 saved_scan_prio; +}; + +/* Select the "right" interrupt (IPI vs. passthrough) */ +static inline void kvmppc_xive_select_irq(struct kvmppc_xive_irq_state *state, + u32 *out_hw_irq, + struct xive_irq_data **out_xd) +{ + if (state->pt_number) { + if (out_hw_irq) + *out_hw_irq = state->pt_number; + if (out_xd) + *out_xd = state->pt_data; + } else { + if (out_hw_irq) + *out_hw_irq = state->ipi_number; + if (out_xd) + *out_xd = &state->ipi_data; + } +} + +/* + * This corresponds to an "ICS" in XICS terminology, we use it + * as a mean to break up source information into multiple structures. 
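+ *
+ * A guest irq number decomposes as follows (see
+ * kvmppc_xive_find_source() below):
+ *
+ *   bid = irq >> KVMPPC_XICS_ICS_SHIFT;   - source block id
+ *   src = irq & KVMPPC_XICS_SRC_MASK;     - index into irq_state[]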
+ */ +struct kvmppc_xive_src_block { + arch_spinlock_t lock; + u16 id; + struct kvmppc_xive_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS]; +}; + + +struct kvmppc_xive { + struct kvm *kvm; + struct kvm_device *dev; + struct dentry *dentry; + + /* VP block associated with the VM */ + u32 vp_base; + + /* Blocks of sources */ + struct kvmppc_xive_src_block *src_blocks[KVMPPC_XICS_MAX_ICS_ID + 1]; + u32 max_sbid; + + /* + * For state save, we lazily scan the queues on the first interrupt + * being migrated. We don't have a clean way to reset that flags + * so we keep track of the number of valid sources and how many of + * them were migrated so we can reset when all of them have been + * processed. + */ + u32 src_count; + u32 saved_src_count; + + /* + * Some irqs are delayed on restore until the source is created, + * keep track here of how many of them + */ + u32 delayed_irqs; + + /* Which queues (priorities) are in use by the guest */ + u8 qmap; + + /* Queue orders */ + u32 q_order; + u32 q_page_order; + +}; + +#define KVMPPC_XIVE_Q_COUNT 8 + +struct kvmppc_xive_vcpu { + struct kvmppc_xive *xive; + struct kvm_vcpu *vcpu; + bool valid; + + /* Server number. This is the HW CPU ID from a guest perspective */ + u32 server_num; + + /* + * HW VP corresponding to this VCPU. This is the base of the VP + * block plus the server number. + */ + u32 vp_id; + u32 vp_chip_id; + u32 vp_cam; + + /* IPI used for sending ... IPIs */ + u32 vp_ipi; + struct xive_irq_data vp_ipi_data; + + /* Local emulation state */ + uint8_t cppr; /* guest CPPR */ + uint8_t hw_cppr;/* Hardware CPPR */ + uint8_t mfrr; + uint8_t pending; + + /* Each VP has 8 queues though we only provision some */ + struct xive_q queues[KVMPPC_XIVE_Q_COUNT]; + u32 esc_virq[KVMPPC_XIVE_Q_COUNT]; + char *esc_virq_names[KVMPPC_XIVE_Q_COUNT]; + + /* Stash a delayed irq on restore from migration (see set_icp) */ + u32 delayed_irq; + + /* Stats */ + u64 stat_rm_h_xirr; + u64 stat_rm_h_ipoll; + u64 stat_rm_h_cppr; + u64 stat_rm_h_eoi; + u64 stat_rm_h_ipi; + u64 stat_vm_h_xirr; + u64 stat_vm_h_ipoll; + u64 stat_vm_h_cppr; + u64 stat_vm_h_eoi; + u64 stat_vm_h_ipi; +}; + +static inline struct kvm_vcpu *kvmppc_xive_find_server(struct kvm *kvm, u32 nr) +{ + struct kvm_vcpu *vcpu = NULL; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->arch.xive_vcpu && nr == vcpu->arch.xive_vcpu->server_num) + return vcpu; + } + return NULL; +} + +static inline struct kvmppc_xive_src_block *kvmppc_xive_find_source(struct kvmppc_xive *xive, + u32 irq, u16 *source) +{ + u32 bid = irq >> KVMPPC_XICS_ICS_SHIFT; + u16 src = irq & KVMPPC_XICS_SRC_MASK; + + if (source) + *source = src; + if (bid > KVMPPC_XICS_MAX_ICS_ID) + return NULL; + return xive->src_blocks[bid]; +} + +/* + * Mapping between guest priorities and host priorities + * is as follow. + * + * Guest request for 0...6 are honored. Guest request for anything + * higher results in a priority of 7 being applied. 
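+ *
+ * Roughly:
+ *
+ *   guest prio 0..6  -> host prio 0..6
+ *   guest prio 7+    -> host prio 7
+ *   0xff (masked)    -> passed through unchanged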
+ *
+ * However, when XIRR is returned via H_XIRR, 7 is translated to 0xb
+ * in order to match AIX expectations.
+ *
+ * A similar mapping is done for CPPR values.
+ */
+static inline u8 xive_prio_from_guest(u8 prio)
+{
+ if (prio == 0xff || prio < 8)
+ return prio;
+ return 7;
+}
+
+static inline u8 xive_prio_to_guest(u8 prio)
+{
+ if (prio == 0xff || prio < 7)
+ return prio;
+ return 0xb;
+}
+
+static inline u32 __xive_read_eq(__be32 *qpage, u32 msk, u32 *idx, u32 *toggle)
+{
+ u32 cur;
+
+ if (!qpage)
+ return 0;
+ cur = be32_to_cpup(qpage + *idx);
+ if ((cur >> 31) == *toggle)
+ return 0;
+ *idx = (*idx + 1) & msk;
+ if (*idx == 0)
+ (*toggle) ^= 1;
+ return cur & 0x7fffffff;
+}
+
+extern unsigned long xive_rm_h_xirr(struct kvm_vcpu *vcpu);
+extern unsigned long xive_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server);
+extern int xive_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+ unsigned long mfrr);
+extern int xive_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
+extern int xive_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
+
+extern unsigned long (*__xive_vm_h_xirr)(struct kvm_vcpu *vcpu);
+extern unsigned long (*__xive_vm_h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server);
+extern int (*__xive_vm_h_ipi)(struct kvm_vcpu *vcpu, unsigned long server,
+ unsigned long mfrr);
+extern int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr);
+extern int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr);
+
+#endif /* CONFIG_KVM_XICS */
+#endif /* _KVM_PPC_BOOK3S_XIVE_H */ diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c new file mode 100644 index 000000000000..023a31133c37 --- /dev/null +++ b/arch/powerpc/kvm/book3s_xive_template.c @@ -0,0 +1,503 @@ +/*
+ * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+/* File to be included by other .c files */
+
+#define XGLUE(a,b) a##b
+#define GLUE(a,b) XGLUE(a,b)
+
+static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc)
+{
+ u8 cppr;
+ u16 ack;
+
+ /* XXX DD1 bug workaround: Check PIPR vs. CPPR first ! */
+
+ /* Perform the acknowledge OS to register cycle. */
+ ack = be16_to_cpu(__x_readw(__x_tima + TM_SPC_ACK_OS_REG));
+
+ /* Synchronize subsequent queue accesses */
+ mb();
+
+ /* XXX Check grouping level */
+
+ /* Anything ? */
+ if (!((ack >> 8) & TM_QW1_NSR_EO))
+ return;
+
+ /* Grab CPPR of the most favored pending interrupt */
+ cppr = ack & 0xff;
+ if (cppr < 8)
+ xc->pending |= 1 << cppr;
+
+#ifdef XIVE_RUNTIME_CHECKS
+ /* Check consistency */
+ if (cppr >= xc->hw_cppr)
+ pr_warn("KVM-XIVE: CPU %d odd ack CPPR, got %d at %d\n",
+ smp_processor_id(), cppr, xc->hw_cppr);
+#endif
+
+ /*
+ * Update our image of the HW CPPR. We don't yet modify
+ * xc->cppr, this will be done as we scan for interrupts
+ * in the queues.
+ */
+ xc->hw_cppr = cppr;
+}
+
+static u8 GLUE(X_PFX,esb_load)(struct xive_irq_data *xd, u32 offset)
+{
+ u64 val;
+
+ if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
+ offset |= offset << 4;
+
+ val = __x_readq(__x_eoi_page(xd) + offset);
+#ifdef __LITTLE_ENDIAN__
+ val >>= 64-8;
+#endif
+ return (u8)val;
+}
+
+
+static void GLUE(X_PFX,source_eoi)(u32 hw_irq, struct xive_irq_data *xd)
+{
+ /* If the XIVE supports the new "store EOI" facility, use it */
+ if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
+ __x_writeq(0, __x_eoi_page(xd));
+ else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) {
+ opal_int_eoi(hw_irq);
+ } else {
+ uint64_t eoi_val;
+
+ /*
+ * Otherwise for EOI, we use the special MMIO that does
+ * a clear of both P and Q and returns the old Q,
+ * except for LSIs where we use the "EOI cycle" special
+ * load.
+ *
+ * This allows us to then do a re-trigger if Q was set
+ * rather than synthesizing an interrupt in software
+ *
+ * For LSIs, using the HW EOI cycle works around a problem
+ * on P9 DD1 PHBs where the other ESB accesses don't work
+ * properly.
+ */
+ if (xd->flags & XIVE_IRQ_FLAG_LSI)
+ __x_readq(__x_eoi_page(xd));
+ else {
+ eoi_val = GLUE(X_PFX,esb_load)(xd, XIVE_ESB_SET_PQ_00);
+
+ /* Re-trigger if needed */
+ if ((eoi_val & 1) && __x_trig_page(xd))
+ __x_writeq(0, __x_trig_page(xd));
+ }
+ }
+}
+
+enum {
+ scan_fetch,
+ scan_poll,
+ scan_eoi,
+};
+
+static u32 GLUE(X_PFX,scan_interrupts)(struct kvmppc_xive_vcpu *xc,
+ u8 pending, int scan_type)
+{
+ u32 hirq = 0;
+ u8 prio = 0xff;
+
+ /* Find highest pending priority */
+ while ((xc->mfrr != 0xff || pending != 0) && hirq == 0) {
+ struct xive_q *q;
+ u32 idx, toggle;
+ __be32 *qpage;
+
+ /*
+ * If pending is 0 this will return 0xff which is what
+ * we want
+ */
+ prio = ffs(pending) - 1;
+
+ /*
+ * If the most favored prio we found pending is less
+ * favored than (or equal to) a pending IPI, we return
+ * the IPI instead.
+ *
+ * Note: If pending was 0 and mfrr is 0xff, we will
+ * not spuriously take an IPI because mfrr cannot
+ * then be smaller than cppr.
+ */
+ if (prio >= xc->mfrr && xc->mfrr < xc->cppr) {
+ prio = xc->mfrr;
+ hirq = XICS_IPI;
+ break;
+ }
+
+ /* Don't scan past the guest cppr */
+ if (prio >= xc->cppr || prio > 7)
+ break;
+
+ /* Grab queue and pointers */
+ q = &xc->queues[prio];
+ idx = q->idx;
+ toggle = q->toggle;
+
+ /*
+ * Snapshot the queue page. The test further down for EOI
+ * must use the same "copy" that was used by __xive_read_eq
+ * since qpage can be set concurrently and we don't want
+ * to miss an EOI.
+ */
+ qpage = READ_ONCE(q->qpage);
+
+skip_ipi:
+ /*
+ * Try to fetch from the queue. Will return 0 for a
+ * non-queueing priority (ie, qpage = 0).
+ */
+ hirq = __xive_read_eq(qpage, q->msk, &idx, &toggle);
+
+ /*
+ * If this was a signal for an MFRR change done by
+ * H_IPI we skip it. Additionally, if we were fetching
+ * we EOI it now, thus re-enabling reception of a new
+ * such signal.
+ *
+ * We also need to do that if prio is 0 and we had no
+ * page for the queue. In this case, we have non-queued
+ * IPI that needs to be EOId.
+ *
+ * This is safe because if we have another pending MFRR
+ * change that wasn't observed above, the Q bit will have
+ * been set and another occurrence of the IPI will trigger.
+ */ + if (hirq == XICS_IPI || (prio == 0 && !qpage)) { + if (scan_type == scan_fetch) + GLUE(X_PFX,source_eoi)(xc->vp_ipi, + &xc->vp_ipi_data); + /* Loop back on same queue with updated idx/toggle */ +#ifdef XIVE_RUNTIME_CHECKS + WARN_ON(hirq && hirq != XICS_IPI); +#endif + if (hirq) + goto skip_ipi; + } + + /* If fetching, update queue pointers */ + if (scan_type == scan_fetch) { + q->idx = idx; + q->toggle = toggle; + } + + /* Something found, stop searching */ + if (hirq) + break; + + /* Clear the pending bit on the now empty queue */ + pending &= ~(1 << prio); + + /* + * Check if the queue count needs adjusting due to + * interrupts being moved away. + */ + if (atomic_read(&q->pending_count)) { + int p = atomic_xchg(&q->pending_count, 0); + if (p) { +#ifdef XIVE_RUNTIME_CHECKS + WARN_ON(p > atomic_read(&q->count)); +#endif + atomic_sub(p, &q->count); + } + } + } + + /* If we are just taking a "peek", do nothing else */ + if (scan_type == scan_poll) + return hirq; + + /* Update the pending bits */ + xc->pending = pending; + + /* + * If this is an EOI that's it, no CPPR adjustment done here, + * all we needed was cleanup the stale pending bits and check + * if there's anything left. + */ + if (scan_type == scan_eoi) + return hirq; + + /* + * If we found an interrupt, adjust what the guest CPPR should + * be as if we had just fetched that interrupt from HW. + */ + if (hirq) + xc->cppr = prio; + /* + * If it was an IPI the HW CPPR might have been lowered too much + * as the HW interrupt we use for IPIs is routed to priority 0. + * + * We re-sync it here. + */ + if (xc->cppr != xc->hw_cppr) { + xc->hw_cppr = xc->cppr; + __x_writeb(xc->cppr, __x_tima + TM_QW1_OS + TM_CPPR); + } + + return hirq; +} + +X_STATIC unsigned long GLUE(X_PFX,h_xirr)(struct kvm_vcpu *vcpu) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + u8 old_cppr; + u32 hirq; + + pr_devel("H_XIRR\n"); + + xc->GLUE(X_STAT_PFX,h_xirr)++; + + /* First collect pending bits from HW */ + GLUE(X_PFX,ack_pending)(xc); + + /* + * Cleanup the old-style bits if needed (they may have been + * set by pull or an escalation interrupts). + */ + if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions)) + clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, + &vcpu->arch.pending_exceptions); + + pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n", + xc->pending, xc->hw_cppr, xc->cppr); + + /* Grab previous CPPR and reverse map it */ + old_cppr = xive_prio_to_guest(xc->cppr); + + /* Scan for actual interrupts */ + hirq = GLUE(X_PFX,scan_interrupts)(xc, xc->pending, scan_fetch); + + pr_devel(" got hirq=0x%x hw_cppr=%d cppr=%d\n", + hirq, xc->hw_cppr, xc->cppr); + +#ifdef XIVE_RUNTIME_CHECKS + /* That should never hit */ + if (hirq & 0xff000000) + pr_warn("XIVE: Weird guest interrupt number 0x%08x\n", hirq); +#endif + + /* + * XXX We could check if the interrupt is masked here and + * filter it. 
If we chose to do so, we would need to do: + * + * if (masked) { + * lock(); + * if (masked) { + * old_Q = true; + * hirq = 0; + * } + * unlock(); + * } + */ + + /* Return interrupt and old CPPR in GPR4 */ + vcpu->arch.gpr[4] = hirq | (old_cppr << 24); + + return H_SUCCESS; +} + +X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + u8 pending = xc->pending; + u32 hirq; + u8 pipr; + + pr_devel("H_IPOLL(server=%ld)\n", server); + + xc->GLUE(X_STAT_PFX,h_ipoll)++; + + /* Grab the target VCPU if not the current one */ + if (xc->server_num != server) { + vcpu = kvmppc_xive_find_server(vcpu->kvm, server); + if (!vcpu) + return H_PARAMETER; + xc = vcpu->arch.xive_vcpu; + + /* Scan all priorities */ + pending = 0xff; + } else { + /* Grab pending interrupt if any */ + pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR); + if (pipr < 8) + pending |= 1 << pipr; + } + + hirq = GLUE(X_PFX,scan_interrupts)(xc, pending, scan_poll); + + /* Return interrupt and old CPPR in GPR4 */ + vcpu->arch.gpr[4] = hirq | (xc->cppr << 24); + + return H_SUCCESS; +} + +static void GLUE(X_PFX,push_pending_to_hw)(struct kvmppc_xive_vcpu *xc) +{ + u8 pending, prio; + + pending = xc->pending; + if (xc->mfrr != 0xff) { + if (xc->mfrr < 8) + pending |= 1 << xc->mfrr; + else + pending |= 0x80; + } + if (!pending) + return; + prio = ffs(pending) - 1; + + __x_writeb(prio, __x_tima + TM_SPC_SET_OS_PENDING); +} + +X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + u8 old_cppr; + + pr_devel("H_CPPR(cppr=%ld)\n", cppr); + + xc->GLUE(X_STAT_PFX,h_cppr)++; + + /* Map CPPR */ + cppr = xive_prio_from_guest(cppr); + + /* Remember old and update SW state */ + old_cppr = xc->cppr; + xc->cppr = cppr; + + /* + * We are masking less, we need to look for pending things + * to deliver and set VP pending bits accordingly to trigger + * a new interrupt otherwise we might miss MFRR changes for + * which we have optimized out sending an IPI signal. + */ + if (cppr > old_cppr) + GLUE(X_PFX,push_pending_to_hw)(xc); + + /* Apply new CPPR */ + xc->hw_cppr = cppr; + __x_writeb(cppr, __x_tima + TM_QW1_OS + TM_CPPR); + + return H_SUCCESS; +} + +X_STATIC int GLUE(X_PFX,h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xive *xive = vcpu->kvm->arch.xive; + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct xive_irq_data *xd; + u8 new_cppr = xirr >> 24; + u32 irq = xirr & 0x00ffffff, hw_num; + u16 src; + int rc = 0; + + pr_devel("H_EOI(xirr=%08lx)\n", xirr); + + xc->GLUE(X_STAT_PFX,h_eoi)++; + + xc->cppr = xive_prio_from_guest(new_cppr); + + /* + * IPIs are synthetized from MFRR and thus don't need + * any special EOI handling. The underlying interrupt + * used to signal MFRR changes is EOId when fetched from + * the queue. 
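+ *
+ * (For reference, the XIRR layout decoded above is
+ * xirr = (new_cppr << 24) | irq.)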
+ */ + if (irq == XICS_IPI || irq == 0) + goto bail; + + /* Find interrupt source */ + sb = kvmppc_xive_find_source(xive, irq, &src); + if (!sb) { + pr_devel(" source not found !\n"); + rc = H_PARAMETER; + goto bail; + } + state = &sb->irq_state[src]; + kvmppc_xive_select_irq(state, &hw_num, &xd); + + state->in_eoi = true; + mb(); + +again: + if (state->guest_priority == MASKED) { + arch_spin_lock(&sb->lock); + if (state->guest_priority != MASKED) { + arch_spin_unlock(&sb->lock); + goto again; + } + pr_devel(" EOI on saved P...\n"); + + /* Clear old_p, that will cause unmask to perform an EOI */ + state->old_p = false; + + arch_spin_unlock(&sb->lock); + } else { + pr_devel(" EOI on source...\n"); + + /* Perform EOI on the source */ + GLUE(X_PFX,source_eoi)(hw_num, xd); + + /* If it's an emulated LSI, check level and resend */ + if (state->lsi && state->asserted) + __x_writeq(0, __x_trig_page(xd)); + + } + + mb(); + state->in_eoi = false; +bail: + + /* Re-evaluate pending IRQs and update HW */ + GLUE(X_PFX,scan_interrupts)(xc, xc->pending, scan_eoi); + GLUE(X_PFX,push_pending_to_hw)(xc); + pr_devel(" after scan pending=%02x\n", xc->pending); + + /* Apply new CPPR */ + xc->hw_cppr = xc->cppr; + __x_writeb(xc->cppr, __x_tima + TM_QW1_OS + TM_CPPR); + + return rc; +} + +X_STATIC int GLUE(X_PFX,h_ipi)(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + + pr_devel("H_IPI(server=%08lx,mfrr=%ld)\n", server, mfrr); + + xc->GLUE(X_STAT_PFX,h_ipi)++; + + /* Find target */ + vcpu = kvmppc_xive_find_server(vcpu->kvm, server); + if (!vcpu) + return H_PARAMETER; + xc = vcpu->arch.xive_vcpu; + + /* Locklessly write over MFRR */ + xc->mfrr = mfrr; + + /* Shoot the IPI if most favored than target cppr */ + if (mfrr < xc->cppr) + __x_writeq(0, __x_trig_page(&xc->vp_ipi_data)); + + return H_SUCCESS; +} diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h index 5a9a10b90762..3f1be85a83bc 100644 --- a/arch/powerpc/kvm/irq.h +++ b/arch/powerpc/kvm/irq.h @@ -12,6 +12,7 @@ static inline int irqchip_in_kernel(struct kvm *kvm) #endif #ifdef CONFIG_KVM_XICS ret = ret || (kvm->arch.xics != NULL); + ret = ret || (kvm->arch.xive != NULL); #endif smp_rmb(); return ret; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 95c91a9de351..de79bd721ec7 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -37,6 +37,8 @@ #include #include #include +#include + #include "timing.h" #include "irq.h" #include "../mm/mmu_decl.h" @@ -699,7 +701,10 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu); break; case KVMPPC_IRQ_XICS: - kvmppc_xics_free_icp(vcpu); + if (xive_enabled()) + kvmppc_xive_cleanup_vcpu(vcpu); + else + kvmppc_xics_free_icp(vcpu); break; } @@ -1219,8 +1224,12 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = -EPERM; dev = kvm_device_from_filp(f.file); - if (dev) - r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]); + if (dev) { + if (xive_enabled()) + r = kvmppc_xive_connect_vcpu(dev, vcpu, cap->args[1]); + else + r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]); + } fdput(f); break; @@ -1244,7 +1253,7 @@ bool kvm_arch_intc_initialized(struct kvm *kvm) return true; #endif #ifdef CONFIG_KVM_XICS - if (kvm->arch.xics) + if (kvm->arch.xics || kvm->arch.xive) return true; #endif return false; diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index e0f856bfbfe8..d71cd773d870 100644 
--- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -890,3 +890,4 @@ EXPORT_SYMBOL_GPL(opal_leds_set_ind); EXPORT_SYMBOL_GPL(opal_write_oppanel_async); /* Export this for KVM */ EXPORT_SYMBOL_GPL(opal_int_set_mfrr); +EXPORT_SYMBOL_GPL(opal_int_eoi); diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index d9cd7f705f21..496036c93531 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -46,13 +46,15 @@ #endif bool __xive_enabled; +EXPORT_SYMBOL_GPL(__xive_enabled); bool xive_cmdline_disabled; /* We use only one priority for now */ static u8 xive_irq_priority; -/* TIMA */ +/* TIMA exported to KVM */ void __iomem *xive_tima; +EXPORT_SYMBOL_GPL(xive_tima); u32 xive_tima_offset; /* Backend ops */ @@ -345,8 +347,11 @@ static void xive_irq_eoi(struct irq_data *d) DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n", d->irq, irqd_to_hwirq(d), xc->pending_prio); - /* EOI the source if it hasn't been disabled */ - if (!irqd_irq_disabled(d)) + /* + * EOI the source if it hasn't been disabled and hasn't + * been passed-through to a KVM guest + */ + if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d)) xive_do_source_eoi(irqd_to_hwirq(d), xd); /* @@ -689,9 +694,14 @@ static int xive_irq_set_affinity(struct irq_data *d, old_target = xd->target; - rc = xive_ops->configure_irq(hw_irq, - get_hard_smp_processor_id(target), - xive_irq_priority, d->irq); + /* + * Only configure the irq if it's not currently passed-through to + * a KVM guest + */ + if (!irqd_is_forwarded_to_vcpu(d)) + rc = xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(target), + xive_irq_priority, d->irq); if (rc < 0) { pr_err("Error %d reconfiguring irq %d\n", rc, d->irq); return rc; @@ -771,6 +781,123 @@ static int xive_irq_retrigger(struct irq_data *d) return 1; } +static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + int rc; + u8 pq; + + /* + * We only support this on interrupts that do not require + * firmware calls for masking and unmasking + */ + if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) + return -EIO; + + /* + * This is called by KVM with state non-NULL for enabling + * pass-through or NULL for disabling it + */ + if (state) { + irqd_set_forwarded_to_vcpu(d); + + /* Set it to PQ=10 state to prevent further sends */ + pq = xive_poke_esb(xd, XIVE_ESB_SET_PQ_10); + + /* No target ? nothing to do */ + if (xd->target == XIVE_INVALID_TARGET) { + /* + * An untargetted interrupt should have been + * also masked at the source + */ + WARN_ON(pq & 2); + + return 0; + } + + /* + * If P was set, adjust state to PQ=11 to indicate + * that a resend is needed for the interrupt to reach + * the guest. Also remember the value of P. + * + * This also tells us that it's in flight to a host queue + * or has already been fetched but hasn't been EOIed yet + * by the host. This it's potentially using up a host + * queue slot. This is important to know because as long + * as this is the case, we must not hard-unmask it when + * "returning" that interrupt to the host. + * + * This saved_p is cleared by the host EOI, when we know + * for sure the queue slot is no longer in use. + */ + if (pq & 2) { + pq = xive_poke_esb(xd, XIVE_ESB_SET_PQ_11); + xd->saved_p = true; + + /* + * Sync the XIVE source HW to ensure the interrupt + * has gone through the EAS before we change its + * target to the guest. 
That should guarantee us + * that we *will* eventually get an EOI for it on + * the host. Otherwise there would be a small window + * for P to be seen here but the interrupt going + * to the guest queue. + */ + if (xive_ops->sync_source) + xive_ops->sync_source(hw_irq); + } else + xd->saved_p = false; + } else { + irqd_clr_forwarded_to_vcpu(d); + + /* No host target ? hard mask and return */ + if (xd->target == XIVE_INVALID_TARGET) { + xive_do_source_set_mask(xd, true); + return 0; + } + + /* + * Sync the XIVE source HW to ensure the interrupt + * has gone through the EAS before we change its + * target to the host. + */ + if (xive_ops->sync_source) + xive_ops->sync_source(hw_irq); + + /* + * By convention we are called with the interrupt in + * a PQ=10 or PQ=11 state, ie, it won't fire and will + * have latched in Q whether there's a pending HW + * interrupt or not. + * + * First reconfigure the target. + */ + rc = xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(xd->target), + xive_irq_priority, d->irq); + if (rc) + return rc; + + /* + * Then if saved_p is not set, effectively re-enable the + * interrupt with an EOI. If it is set, we know there is + * still a message in a host queue somewhere that will be + * EOId eventually. + * + * Note: We don't check irqd_irq_disabled(). Effectively, + * we *will* let the irq get through even if masked if the + * HW is still firing it in order to deal with the whole + * saved_p business properly. If the interrupt triggers + * while masked, the generic code will re-mask it anyway. + */ + if (!xd->saved_p) + xive_do_source_eoi(hw_irq, xd); + + } + return 0; +} + static struct irq_chip xive_irq_chip = { .name = "XIVE-IRQ", .irq_startup = xive_irq_startup, @@ -781,12 +908,14 @@ static struct irq_chip xive_irq_chip = { .irq_set_affinity = xive_irq_set_affinity, .irq_set_type = xive_irq_set_type, .irq_retrigger = xive_irq_retrigger, + .irq_set_vcpu_affinity = xive_irq_set_vcpu_affinity, }; bool is_xive_irq(struct irq_chip *chip) { return chip == &xive_irq_chip; } +EXPORT_SYMBOL_GPL(is_xive_irq); void xive_cleanup_irq_data(struct xive_irq_data *xd) { @@ -801,6 +930,7 @@ void xive_cleanup_irq_data(struct xive_irq_data *xd) xd->trig_mmio = NULL; } } +EXPORT_SYMBOL_GPL(xive_cleanup_irq_data); static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw) { diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 5fae59186cb2..6feac0a758e1 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "xive-internal.h" @@ -95,6 +96,7 @@ int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) } return 0; } +EXPORT_SYMBOL_GPL(xive_native_populate_irq_data); int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq) { @@ -108,6 +110,8 @@ int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq) } return rc == 0 ? 
0 : -ENXIO; } +EXPORT_SYMBOL_GPL(xive_native_configure_irq); + /* This can be called multiple time to change a queue configuration */ int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, @@ -172,6 +176,7 @@ int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, fail: return rc; } +EXPORT_SYMBOL_GPL(xive_native_configure_queue); static void __xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio) { @@ -191,6 +196,7 @@ void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio) { __xive_native_disable_queue(vp_id, q, prio); } +EXPORT_SYMBOL_GPL(xive_native_disable_queue); static int xive_native_setup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio) { @@ -261,6 +267,7 @@ static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc) } return 0; } +#endif /* CONFIG_SMP */ u32 xive_native_alloc_irq(void) { @@ -276,6 +283,7 @@ u32 xive_native_alloc_irq(void) return 0; return rc; } +EXPORT_SYMBOL_GPL(xive_native_alloc_irq); void xive_native_free_irq(u32 irq) { @@ -286,7 +294,9 @@ void xive_native_free_irq(u32 irq) msleep(1); } } +EXPORT_SYMBOL_GPL(xive_native_free_irq); +#ifdef CONFIG_SMP static void xive_native_put_ipi(unsigned int cpu, struct xive_cpu *xc) { s64 rc; @@ -382,7 +392,7 @@ static void xive_native_setup_cpu(unsigned int cpu, struct xive_cpu *xc) return; /* Enable the pool VP */ - vp = xive_pool_vps + get_hard_smp_processor_id(cpu); + vp = xive_pool_vps + cpu; pr_debug("CPU %d setting up pool VP 0x%x\n", cpu, vp); for (;;) { rc = opal_xive_set_vp_info(vp, OPAL_XIVE_VP_ENABLED, 0); @@ -427,7 +437,7 @@ static void xive_native_teardown_cpu(unsigned int cpu, struct xive_cpu *xc) in_be64(xive_tima + TM_SPC_PULL_POOL_CTX); /* Disable it */ - vp = xive_pool_vps + get_hard_smp_processor_id(cpu); + vp = xive_pool_vps + cpu; for (;;) { rc = opal_xive_set_vp_info(vp, 0, 0); if (rc != OPAL_BUSY) @@ -436,10 +446,11 @@ static void xive_native_teardown_cpu(unsigned int cpu, struct xive_cpu *xc) } } -static void xive_native_sync_source(u32 hw_irq) +void xive_native_sync_source(u32 hw_irq) { opal_xive_sync(XIVE_SYNC_EAS, hw_irq); } +EXPORT_SYMBOL_GPL(xive_native_sync_source); static const struct xive_ops xive_native_ops = { .populate_irq_data = xive_native_populate_irq_data, @@ -500,10 +511,24 @@ static bool xive_parse_provisioning(struct device_node *np) return true; } +static void xive_native_setup_pools(void) +{ + /* Allocate a pool big enough */ + pr_debug("XIVE: Allocating VP block for pool size %d\n", nr_cpu_ids); + + xive_pool_vps = xive_native_alloc_vp_block(nr_cpu_ids); + if (WARN_ON(xive_pool_vps == XIVE_INVALID_VP)) + pr_err("XIVE: Failed to allocate pool VP, KVM might not function\n"); + + pr_debug("XIVE: Pool VPs allocated at 0x%x for %d max CPUs\n", + xive_pool_vps, nr_cpu_ids); +} + u32 xive_native_default_eq_shift(void) { return xive_queue_shift; } +EXPORT_SYMBOL_GPL(xive_native_default_eq_shift); bool xive_native_init(void) { @@ -513,7 +538,7 @@ bool xive_native_init(void) struct property *prop; u8 max_prio = 7; const __be32 *p; - u32 val; + u32 val, cpu; s64 rc; if (xive_cmdline_disabled) @@ -549,7 +574,11 @@ bool xive_native_init(void) break; } - /* Grab size of provisioning pages */ + /* Configure Thread Management areas for KVM */ + for_each_possible_cpu(cpu) + kvmppc_set_xive_tima(cpu, r.start, tima); + + /* Grab size of provisionning pages */ xive_parse_provisioning(np); /* Switch the XIVE to exploitation mode */ @@ -559,6 +588,9 @@ bool xive_native_init(void) return false; } + /* Setup some dummy HV pool VPs */ + 
xive_native_setup_pools(); + /* Initialize XIVE core with our backend */ if (!xive_core_init(&xive_native_ops, tima, TM_QW3_HV_PHYS, max_prio)) { @@ -637,3 +669,47 @@ void xive_native_free_vp_block(u32 vp_base) pr_warn("OPAL error %lld freeing VP block\n", rc); } EXPORT_SYMBOL_GPL(xive_native_free_vp_block); + +int xive_native_enable_vp(u32 vp_id) +{ + s64 rc; + + for (;;) { + rc = opal_xive_set_vp_info(vp_id, OPAL_XIVE_VP_ENABLED, 0); + if (rc != OPAL_BUSY) + break; + msleep(1); + } + return rc ? -EIO : 0; +} +EXPORT_SYMBOL_GPL(xive_native_enable_vp); + +int xive_native_disable_vp(u32 vp_id) +{ + s64 rc; + + for (;;) { + rc = opal_xive_set_vp_info(vp_id, 0, 0); + if (rc != OPAL_BUSY) + break; + msleep(1); + } + return rc ? -EIO : 0; +} +EXPORT_SYMBOL_GPL(xive_native_disable_vp); + +int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id) +{ + __be64 vp_cam_be; + __be32 vp_chip_id_be; + s64 rc; + + rc = opal_xive_get_vp_info(vp_id, NULL, &vp_cam_be, NULL, &vp_chip_id_be); + if (rc) + return -EIO; + *out_cam_id = be64_to_cpu(vp_cam_be) & 0xffffffffu; + *out_chip_id = be32_to_cpu(vp_chip_id_be); + + return 0; +} +EXPORT_SYMBOL_GPL(xive_native_get_vp_info); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c14ad9809da..d1a6e554ee68 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1165,7 +1165,6 @@ int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type); void kvm_unregister_device_ops(u32 type); extern struct kvm_device_ops kvm_mpic_ops; -extern struct kvm_device_ops kvm_xics_ops; extern struct kvm_device_ops kvm_arm_vgic_v2_ops; extern struct kvm_device_ops kvm_arm_vgic_v3_ops; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a17d78759727..1b0da5771f71 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2839,10 +2839,6 @@ static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, #endif - -#ifdef CONFIG_KVM_XICS - [KVM_DEV_TYPE_XICS] = &kvm_xics_ops, -#endif }; int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) -- cgit v1.2.3 From 72875d8a4d92f6f37e051be522b2252fd49bd50e Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 26 Apr 2017 22:32:19 +0200 Subject: KVM: add kvm_{test,clear}_request to replace {test,clear}_bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Users were expected to use kvm_check_request() for testing and clearing, but request have expanded their use since then and some users want to only test or do a faster clear. Make sure that requests are not directly accessed with bit operations. Reviewed-by: Christian Borntraeger Signed-off-by: Radim Krčmář Reviewed-by: Andrew Jones Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/mips/kvm/emulate.c | 2 +- arch/powerpc/kvm/book3s_pr.c | 2 +- arch/powerpc/kvm/book3s_pr_papr.c | 2 +- arch/powerpc/kvm/booke.c | 4 ++-- arch/powerpc/kvm/powerpc.c | 2 +- arch/s390/kvm/kvm-s390.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 8 ++++---- include/linux/kvm_host.h | 14 ++++++++++++-- 9 files changed, 24 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index 34e78a3ee9d7..4144bfaef137 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -982,7 +982,7 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu) * check if any I/O interrupts are pending. 
*/ if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; } } diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index f026b062c0ed..69a09444d46e 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -349,7 +349,7 @@ static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr) if (msr & MSR_POW) { if (!vcpu->arch.pending_exceptions) { kvm_vcpu_block(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); vcpu->stat.halt_wakeup++; /* Unset POW bit after we woke up */ diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index f102616febc7..bcbeeb62dd13 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -344,7 +344,7 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) case H_CEDE: kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE); kvm_vcpu_block(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); vcpu->stat.halt_wakeup++; return EMULATE_DONE; case H_LOGICAL_CI_LOAD: diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 3c296c2eacf8..3eaac3809977 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -584,7 +584,7 @@ static void arm_next_watchdog(struct kvm_vcpu *vcpu) * userspace, so clear the KVM_REQ_WATCHDOG request. */ if ((vcpu->arch.tsr & (TSR_ENW | TSR_WIS)) != (TSR_ENW | TSR_WIS)) - clear_bit(KVM_REQ_WATCHDOG, &vcpu->requests); + kvm_clear_request(KVM_REQ_WATCHDOG, vcpu); spin_lock_irqsave(&vcpu->arch.wdt_lock, flags); nr_jiffies = watchdog_next_timeout(vcpu); @@ -695,7 +695,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) if (vcpu->arch.shared->msr & MSR_WE) { local_irq_enable(); kvm_vcpu_block(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); hard_irq_disable(); kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index cf725c580fc5..1ee22a910074 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -233,7 +233,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) case EV_HCALL_TOKEN(EV_IDLE): r = EV_SUCCESS; kvm_vcpu_block(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); break; default: r = EV_UNIMPLEMENTED; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 7eb1275cc265..4bafb0a0c8b5 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2496,7 +2496,7 @@ retry: } /* nothing to do, just clear the request */ - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); return 0; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a4ef63718101..b003b8dfb20c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6299,7 +6299,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (intr_window_requested && vmx_interrupt_allowed(vcpu)) return handle_interrupt_window(&vmx->vcpu); - if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) + if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f68c5b2ba627..2de54c20fa9e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1753,7 +1753,7 @@ static void 
kvm_gen_update_masterclock(struct kvm *kvm) /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) - clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); + kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); spin_unlock(&ka->pvclock_gtod_sync_lock); #endif @@ -7041,7 +7041,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (r <= 0) break; - clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu); if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); @@ -7169,7 +7169,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); kvm_apic_accept_events(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); r = -EAGAIN; goto out; } @@ -8382,7 +8382,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (atomic_read(&vcpu->arch.nmi_queued)) return true; - if (test_bit(KVM_REQ_SMI, &vcpu->requests)) + if (kvm_test_request(KVM_REQ_SMI, vcpu)) return true; if (kvm_arch_interrupt_allowed(vcpu) && diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 397b7b5b1933..374fa92c7657 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1079,10 +1079,20 @@ static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) set_bit(req, &vcpu->requests); } +static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu) +{ + return test_bit(req, &vcpu->requests); +} + +static inline void kvm_clear_request(int req, struct kvm_vcpu *vcpu) +{ + clear_bit(req, &vcpu->requests); +} + static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) { - if (test_bit(req, &vcpu->requests)) { - clear_bit(req, &vcpu->requests); + if (kvm_test_request(req, vcpu)) { + kvm_clear_request(req, vcpu); /* * Ensure the rest of the request is visible to kvm_check_request's -- cgit v1.2.3 From 930f7fd6da77ed9476a538345513460fd304aaf5 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 26 Apr 2017 22:32:22 +0200 Subject: KVM: mark requests that do not need a wakeup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some operations must ensure that the guest is not running with stale data, but if the guest is halted, then the update can wait until another event happens. kvm_make_all_cpus_request() currently doesn't wake up halted VCPUs, so we can mark all requests used with it. The first 8 bits were arbitrarily reserved for request numbers. Most uses of requests have the request type as a constant, so a compiler will optimize the '&'. An alternative would be to have an inline function that would return whether the request needs a wake-up or not, but I like this one better even though it might produce worse assembly.
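For illustration, the encoding this describes can be sketched as follows; KVM_REQ_EXAMPLE and sketch_make_request() are hypothetical names used only for this sketch, while GENMASK(), BIT() and set_bit() are the kernel helpers the patch itself uses:

#define KVM_REQUEST_MASK	GENMASK(7, 0)	/* low 8 bits carry the request number */
#define KVM_REQUEST_NO_WAKEUP	BIT(8)		/* metadata flag, stripped before bit ops */

/* A hypothetical request that does not need to wake a halted VCPU. */
#define KVM_REQ_EXAMPLE		(5 | KVM_REQUEST_NO_WAKEUP)

static inline void sketch_make_request(int req, struct kvm_vcpu *vcpu)
{
	/* req is normally a constant, so the '&' folds away at compile time. */
	set_bit(req & KVM_REQUEST_MASK, &vcpu->requests);
}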
Signed-off-by: Radim Krčmář Reviewed-by: Andrew Jones Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/arm/include/asm/kvm_host.h | 2 +- arch/arm64/include/asm/kvm_host.h | 2 +- arch/x86/include/asm/kvm_host.h | 6 +++--- include/linux/kvm_host.h | 12 +++++++----- 4 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index de67ce647501..49358f20d36f 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -44,7 +44,7 @@ #define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS #endif -#define KVM_REQ_VCPU_EXIT 8 +#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_NO_WAKEUP) u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode); int __attribute_const__ kvm_target_cpu(void); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 522e4f60976e..1c9458a7ec92 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -41,7 +41,7 @@ #define KVM_VCPU_MAX_FEATURES 4 -#define KVM_REQ_VCPU_EXIT 8 +#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_NO_WAKEUP) int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f5c942edbc86..19219826bed6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -61,10 +61,10 @@ #define KVM_REQ_PMI 19 #define KVM_REQ_SMI 20 #define KVM_REQ_MASTERCLOCK_UPDATE 21 -#define KVM_REQ_MCLOCK_INPROGRESS 22 -#define KVM_REQ_SCAN_IOAPIC 23 +#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_SCAN_IOAPIC (23 | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_GLOBAL_CLOCK_UPDATE 24 -#define KVM_REQ_APIC_PAGE_RELOAD 25 +#define KVM_REQ_APIC_PAGE_RELOAD (25 | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_CRASH 26 #define KVM_REQ_IOAPIC_EOI_EXIT 27 #define KVM_REQ_HV_RESET 28 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 374fa92c7657..a805ddcb7eb0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -115,12 +115,14 @@ static inline bool is_error_page(struct page *page) return IS_ERR(page); } +#define KVM_REQUEST_MASK GENMASK(7,0) +#define KVM_REQUEST_NO_WAKEUP BIT(8) /* * Architecture-independent vcpu->requests bit members * Bits 4-7 are reserved for more arch-independent bits. */ -#define KVM_REQ_TLB_FLUSH 0 -#define KVM_REQ_MMU_RELOAD 1 +#define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_PENDING_TIMER 2 #define KVM_REQ_UNHALT 3 @@ -1076,17 +1078,17 @@ static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) * caller. Paired with the smp_mb__after_atomic in kvm_check_request. 
*/ smp_wmb(); - set_bit(req, &vcpu->requests); + set_bit(req & KVM_REQUEST_MASK, &vcpu->requests); } static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu) { - return test_bit(req, &vcpu->requests); + return test_bit(req & KVM_REQUEST_MASK, &vcpu->requests); } static inline void kvm_clear_request(int req, struct kvm_vcpu *vcpu) { - clear_bit(req, &vcpu->requests); + clear_bit(req & KVM_REQUEST_MASK, &vcpu->requests); } static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) -- cgit v1.2.3 From cde9af6e79046e12cd08d161139b1d5e57e9fbac Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Wed, 26 Apr 2017 22:32:24 +0200 Subject: KVM: add explicit barrier to kvm_vcpu_kick kvm_vcpu_kick() must issue a general memory barrier prior to reading vcpu->mode in order to ensure correctness of the mutual-exclusion memory barrier pattern used with vcpu->requests. While the cmpxchg called from kvm_vcpu_kick(): kvm_vcpu_kick kvm_arch_vcpu_should_kick kvm_vcpu_exiting_guest_mode cmpxchg implies general memory barriers before and after the operation, that implication is only valid when cmpxchg succeeds. We need an explicit barrier for when it fails, otherwise a VCPU thread on its entry path that reads zero for vcpu->requests does not exclude the possibility the requesting thread sees !IN_GUEST_MODE when it reads vcpu->mode. kvm_make_all_cpus_request already had a barrier, so we remove it, as now it would be redundant. Signed-off-by: Andrew Jones Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 6 ++++++ virt/kvm/kvm_main.c | 3 --- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0936c3e2e51c..69fcee26f4da 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6853,7 +6853,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* * 1) We should set ->mode before checking ->requests. Please see - * the comment in kvm_make_all_cpus_request. + * the comment in kvm_vcpu_exiting_guest_mode(). * * 2) For APICv, we should set ->mode before checking PIR.ON. This * pairs with the memory barrier implicit in pi_test_and_set_on diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a805ddcb7eb0..84c5396564f7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -270,6 +270,12 @@ struct kvm_vcpu { static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) { + /* + * The memory barrier ensures a previous write to vcpu->requests cannot + * be reordered with the read of vcpu->mode. It pairs with the general + * memory barrier following the write of vcpu->mode in VCPU RUN. + */ + smp_mb__before_atomic(); return cmpxchg(&vcpu->mode, IN_GUEST_MODE, EXITING_GUEST_MODE); } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3772f7dcc72d..1efb07643035 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -183,9 +183,6 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) kvm_make_request(req, vcpu); cpu = vcpu->cpu; - /* Set ->requests bit before we read ->mode. */ - smp_mb__after_atomic(); - if (!(req & KVM_REQUEST_NO_WAKEUP)) kvm_vcpu_wake_up(vcpu); -- cgit v1.2.3 From 178f02ffafafc59d4d4b135242e5cc1515743680 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 26 Apr 2017 22:32:26 +0200 Subject: KVM: return if kvm_vcpu_wake_up() did wake up the VCPU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No need to kick a VCPU that we have just woken up. 
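Consolidated from the hunks that follow, the post-patch kick path looks roughly like this (a sketch, not the verbatim function; the final IPI via smp_send_reschedule() is assumed from context rather than shown in this patch):

void sketch_vcpu_kick(struct kvm_vcpu *vcpu)
{
	int cpu = vcpu->cpu;
	int me;

	/* A sleeping VCPU is woken here and will notice the request itself. */
	if (kvm_vcpu_wake_up(vcpu))
		return;

	/* Only a VCPU currently running in guest mode still needs an IPI. */
	me = get_cpu();
	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
		if (kvm_arch_vcpu_should_kick(vcpu))
			smp_send_reschedule(cpu);
	put_cpu();
}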
Signed-off-by: Radim Krčmář Reviewed-by: Andrew Jones Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 84c5396564f7..f4a2c00092f8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -690,7 +690,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); -void kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); +bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1efb07643035..632f7b3e198c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -183,8 +183,8 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) kvm_make_request(req, vcpu); cpu = vcpu->cpu; - if (!(req & KVM_REQUEST_NO_WAKEUP)) - kvm_vcpu_wake_up(vcpu); + if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) + continue; if (cpus != NULL && cpu != -1 && cpu != me && kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE) @@ -2195,7 +2195,7 @@ out: } EXPORT_SYMBOL_GPL(kvm_vcpu_block); -void kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) +bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) { struct swait_queue_head *wqp; @@ -2203,8 +2203,10 @@ void kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) if (swait_active(wqp)) { swake_up(wqp); ++vcpu->stat.halt_wakeup; + return true; } + return false; } EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); @@ -2216,7 +2218,9 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) int me; int cpu = vcpu->cpu; - kvm_vcpu_wake_up(vcpu); + if (kvm_vcpu_wake_up(vcpu)) + return; + me = get_cpu(); if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) if (kvm_arch_vcpu_should_kick(vcpu)) -- cgit v1.2.3 From 7a97cec26b94c909f4cbad2dc3186af3e457a522 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 Apr 2017 14:33:43 +0200 Subject: KVM: mark requests that need synchronization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_make_all_cpus_request() provides a synchronization that waits until all kicked VCPUs have acknowledged the kick. This is important for KVM_REQ_MMU_RELOAD as it prevents freeing while lockless paging is underway. This patch adds the synchronization property into all requests that are currently being used with kvm_make_all_cpus_request() in order to preserve the current behavior and only introduce a new framework. Removing it from requests where it is not necessary is left for future patches.
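As a hedged usage sketch, a request that both skips the wakeup and waits for acknowledgement would be declared and issued as below; KVM_REQ_EXAMPLE_SYNC and sketch_flush_all() are invented names, while the flag bits and kvm_make_all_cpus_request() come from the patches in this series:

#define KVM_REQ_EXAMPLE_SYNC	(9 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)

static void sketch_flush_all(struct kvm *kvm)
{
	/*
	 * The WAIT bit is forwarded as the 'wait' argument of
	 * smp_call_function_many(), so this call blocks until every
	 * targeted CPU has run ack_flush() for the IPI.
	 */
	kvm_make_all_cpus_request(kvm, KVM_REQ_EXAMPLE_SYNC);
}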
Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/arm/include/asm/kvm_host.h | 2 +- arch/arm64/include/asm/kvm_host.h | 2 +- arch/x86/include/asm/kvm_host.h | 6 +++--- include/linux/kvm_host.h | 9 +++++---- virt/kvm/kvm_main.c | 25 ++++++++++++++++++++++--- 5 files changed, 32 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 49358f20d36f..3cd04d164c64 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -44,7 +44,7 @@ #define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS #endif -#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode); int __attribute_const__ kvm_target_cpu(void); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 1c9458a7ec92..d239ae166c4e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -41,7 +41,7 @@ #define KVM_VCPU_MAX_FEATURES 4 -#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 19219826bed6..84c8489531bb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -61,10 +61,10 @@ #define KVM_REQ_PMI 19 #define KVM_REQ_SMI 20 #define KVM_REQ_MASTERCLOCK_UPDATE 21 -#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_SCAN_IOAPIC (23 | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_SCAN_IOAPIC (23 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_GLOBAL_CLOCK_UPDATE 24 -#define KVM_REQ_APIC_PAGE_RELOAD (25 | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_APIC_PAGE_RELOAD (25 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_CRASH 26 #define KVM_REQ_IOAPIC_EOI_EXIT 27 #define KVM_REQ_HV_RESET 28 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f4a2c00092f8..a5bfffa8c8d4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -117,14 +117,15 @@ static inline bool is_error_page(struct page *page) #define KVM_REQUEST_MASK GENMASK(7,0) #define KVM_REQUEST_NO_WAKEUP BIT(8) +#define KVM_REQUEST_WAIT BIT(9) /* * Architecture-independent vcpu->requests bit members * Bits 4-7 are reserved for more arch-independent bits. 
*/ -#define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_PENDING_TIMER 2 -#define KVM_REQ_UNHALT 3 +#define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_PENDING_TIMER 2 +#define KVM_REQ_UNHALT 3 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 632f7b3e198c..035bc51f656f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -165,6 +165,24 @@ void vcpu_put(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(vcpu_put); +/* TODO: merge with kvm_arch_vcpu_should_kick */ +static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) +{ + int mode = kvm_vcpu_exiting_guest_mode(vcpu); + + /* + * We need to wait for the VCPU to reenable interrupts and get out of + * READING_SHADOW_PAGE_TABLES mode. + */ + if (req & KVM_REQUEST_WAIT) + return mode != OUTSIDE_GUEST_MODE; + + /* + * Need to kick a running VCPU, but otherwise there is nothing to do. + */ + return mode == IN_GUEST_MODE; +} + static void ack_flush(void *_completed) { } @@ -174,6 +192,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) { int i, cpu, me; cpumask_var_t cpus; bool called = true; + bool wait = req & KVM_REQUEST_WAIT; struct kvm_vcpu *vcpu; zalloc_cpumask_var(&cpus, GFP_ATOMIC); @@ -187,13 +206,13 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) continue; if (cpus != NULL && cpu != -1 && cpu != me && - kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE) + kvm_request_needs_ipi(vcpu, req)) cpumask_set_cpu(cpu, cpus); } if (unlikely(cpus == NULL)) - smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); + smp_call_function_many(cpu_online_mask, ack_flush, NULL, wait); else if (!cpumask_empty(cpus)) - smp_call_function_many(cpus, ack_flush, NULL, 1); + smp_call_function_many(cpus, ack_flush, NULL, wait); else called = false; put_cpu(); -- cgit v1.2.3 From 6d684e54690caef45cf14051ddeb7c71beeb681b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 27 Apr 2017 13:44:51 +0800 Subject: rhashtable: Cap total number of entries to 2^31 When max_size is not set, or if it is set to a sufficiently large value, the nelems counter can overflow. This would cause havoc with the automatic shrinking as it would then attempt to fit a huge number of entries into a tiny hash table. This patch fixes this by adding max_elems to struct rhashtable to cap the number of elements. This is set to 2^31 as nelems is not a precise count. This is sufficiently smaller than UINT_MAX that it should be safe. When max_size is set, max_elems will be lowered to at most twice max_size, as is the status quo. Signed-off-by: Herbert Xu Signed-off-by: David S.
Miller --- include/linux/rhashtable.h | 5 +++-- lib/rhashtable.c | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index ae93b65d13d7..45f89369c4c8 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -155,6 +155,7 @@ struct rhashtable_params { * @nelems: Number of elements in table * @key_len: Key length for hashfn * @p: Configuration parameters + * @max_elems: Maximum number of elements in table * @rhlist: True if this is an rhltable * @run_work: Deferred worker to expand/shrink asynchronously * @mutex: Mutex to protect current/future table swapping @@ -165,6 +166,7 @@ struct rhashtable { atomic_t nelems; unsigned int key_len; struct rhashtable_params p; + unsigned int max_elems; bool rhlist; struct work_struct run_work; struct mutex mutex; @@ -327,8 +329,7 @@ static inline bool rht_grow_above_100(const struct rhashtable *ht, static inline bool rht_grow_above_max(const struct rhashtable *ht, const struct bucket_table *tbl) { - return ht->p.max_size && - (atomic_read(&ht->nelems) / 2u) >= ht->p.max_size; + return atomic_read(&ht->nelems) >= ht->max_elems; } /* The bucket lock is selected based on the hash and protects mutations diff --git a/lib/rhashtable.c b/lib/rhashtable.c index f3b82e0d417b..751630bbe409 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -961,6 +961,11 @@ int rhashtable_init(struct rhashtable *ht, if (params->max_size) ht->p.max_size = rounddown_pow_of_two(params->max_size); + /* Cap total entries at 2^31 to avoid nelems overflow. */ + ht->max_elems = 1u << 31; + if (ht->p.max_size < ht->max_elems / 2) + ht->max_elems = ht->p.max_size * 2; + ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE); if (params->nelem_hint) -- cgit v1.2.3 From 8ecbc40ada116f2f7d6b61cd646802c87b7c5c7d Mon Sep 17 00:00:00 2001 From: Zhang Shengju Date: Wed, 26 Apr 2017 11:05:12 +0800 Subject: net: update comment for netif_dormant() function This patch updates the comment for netif_dormant() function to reflect the intended usage. Signed-off-by: Zhang Shengju Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8c5c8cdc7b97..6847714a5ae3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3408,10 +3408,10 @@ static inline void netif_dormant_off(struct net_device *dev) } /** - * netif_dormant - test if carrier present + * netif_dormant - test if device is dormant * @dev: network device * - * Check if carrier is present on device + * Check if device is dormant. */ static inline bool netif_dormant(const struct net_device *dev) { -- cgit v1.2.3 From 99f906e9ad7b6e79ffeda30f45906a8448b9d6a2 Mon Sep 17 00:00:00 2001 From: Mike Manning Date: Wed, 26 Apr 2017 14:48:09 +0100 Subject: bridge: add per-port broadcast flood flag Support for l2 multicast flood control was added in commit b6cb5ac8331b ("net: bridge: add per-port multicast flood flag"). It allows broadcast as it was introduced specifically for unknown multicast flood control. But as broadcast is a special case of multicast, this may also need to be disabled. For this purpose, introduce a flag to disable the flooding of received l2 broadcasts. This approach is backwards compatible and provides flexibility in filtering for the desired packet types. 
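A hedged usage sketch: the per-port sysfs attribute added below can be toggled from userspace roughly as follows (the port name eth0 is illustrative; the attribute name broadcast_flood and its 0/1 semantics come from the patch):

#include <stdio.h>

int main(void)
{
	/* 0 = do not flood received broadcasts out of this bridge port */
	FILE *f = fopen("/sys/class/net/eth0/brport/broadcast_flood", "w");

	if (!f)
		return 1;
	fputs("0", f);
	fclose(f);
	return 0;
}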
Cc: Nikolay Aleksandrov Signed-off-by: Mike Manning Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 1 + include/uapi/linux/if_link.h | 1 + net/bridge/br_forward.c | 24 +++++++++++++++++------- net/bridge/br_if.c | 2 +- net/bridge/br_netlink.c | 3 +++ net/bridge/br_sysfs_if.c | 2 ++ 6 files changed, 25 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index c5847dc75a93..0c16866a7aac 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -48,6 +48,7 @@ struct br_ip_list { #define BR_MCAST_FLOOD BIT(11) #define BR_MULTICAST_TO_UNICAST BIT(12) #define BR_VLAN_TUNNEL BIT(13) +#define BR_BCAST_FLOOD BIT(14) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 633aa0276d32..8e56ac70e0d1 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -323,6 +323,7 @@ enum { IFLA_BRPORT_MCAST_FLOOD, IFLA_BRPORT_MCAST_TO_UCAST, IFLA_BRPORT_VLAN_TUNNEL, + IFLA_BRPORT_BCAST_FLOOD, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 902af6ba481c..48fb17417fac 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -183,13 +183,23 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, struct net_bridge_port *p; list_for_each_entry_rcu(p, &br->port_list, list) { - /* Do not flood unicast traffic to ports that turn it off */ - if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD)) - continue; - /* Do not flood if mc off, except for traffic we originate */ - if (pkt_type == BR_PKT_MULTICAST && - !(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev) - continue; + /* Do not flood unicast traffic to ports that turn it off, nor + * other traffic if flood off, except for traffic we originate + */ + switch (pkt_type) { + case BR_PKT_UNICAST: + if (!(p->flags & BR_FLOOD)) + continue; + break; + case BR_PKT_MULTICAST: + if (!(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev) + continue; + break; + case BR_PKT_BROADCAST: + if (!(p->flags & BR_BCAST_FLOOD) && skb->dev != br->dev) + continue; + break; + } /* Do not flood to ports that enable proxy ARP */ if (p->flags & BR_PROXYARP) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f3544d96155c..7f8d05cf9065 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -361,7 +361,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br, p->path_cost = port_cost(dev); p->priority = 0x8000 >> BR_PORT_BITS; p->port_no = index; - p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD; + p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD; br_init_port(p); br_set_state(p, BR_STATE_DISABLED); br_stp_port_timer_init(p); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 650986473577..a572db710d4e 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -189,6 +189,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, !!(p->flags & BR_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_MCAST_FLOOD, !!(p->flags & BR_MCAST_FLOOD)) || + nla_put_u8(skb, IFLA_BRPORT_BCAST_FLOOD, + !!(p->flags & BR_BCAST_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI, !!(p->flags & BR_PROXYARP_WIFI)) || @@ -683,6 +685,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, 
BR_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_TO_UCAST, BR_MULTICAST_TO_UNICAST); + br_set_port_flag(p, tb, IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI); diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 79aee759aba5..5d5d413a6cf8 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -173,6 +173,7 @@ BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD); BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP); BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI); BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD); +BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) @@ -221,6 +222,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_proxyarp, &brport_attr_proxyarp_wifi, &brport_attr_multicast_flood, + &brport_attr_broadcast_flood, NULL }; -- cgit v1.2.3 From 7e0d574f2683a2346c978613a72ff07afc89b17a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 27 Apr 2017 10:11:23 -0700 Subject: dm: introduce enum dm_queue_mode to cleanup related code Introduce an enumeration type for the queue mode. This patch does not change any functionality but makes the DM code easier to read. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Signed-off-by: Mike Snitzer --- drivers/md/dm-core.h | 2 +- drivers/md/dm-ioctl.c | 2 +- drivers/md/dm-mpath.c | 5 ++++- drivers/md/dm-table.c | 14 +++++++------- drivers/md/dm.c | 11 +++++++---- drivers/md/dm.h | 8 ++++---- include/linux/device-mapper.h | 14 ++++++++------ 7 files changed, 32 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 136fda3ff9e5..b92f74d9a982 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -47,7 +47,7 @@ struct mapped_device { struct request_queue *queue; int numa_node_id; - unsigned type; + enum dm_queue_mode type; /* Protect queue and type against concurrent access. 
*/ struct mutex type_lock; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index ddda8107aa7e..2d5d7064acbf 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1260,7 +1260,7 @@ static int populate_table(struct dm_table *table, return dm_table_complete(table); } -static bool is_valid_type(unsigned cur, unsigned new) +static bool is_valid_type(enum dm_queue_mode cur, enum dm_queue_mode new) { if (cur == new || (cur == DM_TYPE_BIO_BASED && new == DM_TYPE_DAX_BIO_BASED)) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 730ec0be1afa..8336332bd61f 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -90,7 +90,7 @@ struct multipath { atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ atomic_t pg_init_count; /* Number of times pg_init called */ - unsigned queue_mode; + enum dm_queue_mode queue_mode; struct mutex work_mutex; struct work_struct trigger_event; @@ -1700,6 +1700,9 @@ static void multipath_status(struct dm_target *ti, status_type_t type, case DM_TYPE_MQ_REQUEST_BASED: DMEMIT("queue_mode mq "); break; + default: + WARN_ON_ONCE(true); + break; } } } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 515136f29115..a02a04829156 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -30,7 +30,7 @@ struct dm_table { struct mapped_device *md; - unsigned type; + enum dm_queue_mode type; /* btree table */ unsigned int depth; @@ -825,19 +825,19 @@ void dm_consume_args(struct dm_arg_set *as, unsigned num_args) } EXPORT_SYMBOL(dm_consume_args); -static bool __table_type_bio_based(unsigned table_type) +static bool __table_type_bio_based(enum dm_queue_mode table_type) { return (table_type == DM_TYPE_BIO_BASED || table_type == DM_TYPE_DAX_BIO_BASED); } -static bool __table_type_request_based(unsigned table_type) +static bool __table_type_request_based(enum dm_queue_mode table_type) { return (table_type == DM_TYPE_REQUEST_BASED || table_type == DM_TYPE_MQ_REQUEST_BASED); } -void dm_table_set_type(struct dm_table *t, unsigned type) +void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type) { t->type = type; } @@ -879,7 +879,7 @@ static int dm_table_determine_type(struct dm_table *t) struct dm_target *tgt; struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); - unsigned live_md_type = dm_get_md_type(t->md); + enum dm_queue_mode live_md_type = dm_get_md_type(t->md); if (t->type != DM_TYPE_NONE) { /* target already set the table's type */ @@ -988,7 +988,7 @@ verify_rq_based: return 0; } -unsigned dm_table_get_type(struct dm_table *t) +enum dm_queue_mode dm_table_get_type(struct dm_table *t) { return t->type; } @@ -1039,7 +1039,7 @@ bool dm_table_all_blk_mq_devices(struct dm_table *t) static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) { - unsigned type = dm_table_get_type(t); + enum dm_queue_mode type = dm_table_get_type(t); unsigned per_io_data_size = 0; struct dm_target *tgt; unsigned i; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 9940c9a42665..45660246e8f5 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1807,13 +1807,13 @@ void dm_unlock_md_type(struct mapped_device *md) mutex_unlock(&md->type_lock); } -void dm_set_md_type(struct mapped_device *md, unsigned type) +void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type) { BUG_ON(!mutex_is_locked(&md->type_lock)); md->type = type; } -unsigned dm_get_md_type(struct mapped_device *md) +enum dm_queue_mode dm_get_md_type(struct mapped_device *md) { return 
md->type; } @@ -1840,7 +1840,7 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits); int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) { int r; - unsigned type = dm_get_md_type(md); + enum dm_queue_mode type = dm_get_md_type(md); switch (type) { case DM_TYPE_REQUEST_BASED: @@ -1871,6 +1871,9 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) if (type == DM_TYPE_DAX_BIO_BASED) queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue); break; + case DM_TYPE_NONE: + WARN_ON_ONCE(true); + break; } return 0; @@ -2556,7 +2559,7 @@ int dm_noflush_suspending(struct dm_target *ti) } EXPORT_SYMBOL_GPL(dm_noflush_suspending); -struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, +struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, unsigned integrity, unsigned per_io_data_size) { struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index f298b01f7ab3..38c84c0a35d4 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -64,7 +64,7 @@ void dm_table_presuspend_undo_targets(struct dm_table *t); void dm_table_postsuspend_targets(struct dm_table *t); int dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); -unsigned dm_table_get_type(struct dm_table *t); +enum dm_queue_mode dm_table_get_type(struct dm_table *t); struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); struct dm_target *dm_table_get_immutable_target(struct dm_table *t); struct dm_target *dm_table_get_wildcard_target(struct dm_table *t); @@ -76,8 +76,8 @@ struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t); void dm_lock_md_type(struct mapped_device *md); void dm_unlock_md_type(struct mapped_device *md); -void dm_set_md_type(struct mapped_device *md, unsigned type); -unsigned dm_get_md_type(struct mapped_device *md); +void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type); +enum dm_queue_mode dm_get_md_type(struct mapped_device *md); struct target_type *dm_get_immutable_target_type(struct mapped_device *md); int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); @@ -204,7 +204,7 @@ void dm_kcopyd_exit(void); /* * Mempool operations */ -struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, +struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, unsigned integrity, unsigned per_bio_data_size); void dm_free_md_mempools(struct dm_md_mempools *pools); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 98f981026e4e..1ce4036224eb 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -22,11 +22,13 @@ struct bio_vec; /* * Type of table, mapped_device's mempool and request_queue */ -#define DM_TYPE_NONE 0 -#define DM_TYPE_BIO_BASED 1 -#define DM_TYPE_REQUEST_BASED 2 -#define DM_TYPE_MQ_REQUEST_BASED 3 -#define DM_TYPE_DAX_BIO_BASED 4 +enum dm_queue_mode { + DM_TYPE_NONE = 0, + DM_TYPE_BIO_BASED = 1, + DM_TYPE_REQUEST_BASED = 2, + DM_TYPE_MQ_REQUEST_BASED = 3, + DM_TYPE_DAX_BIO_BASED = 4, +}; typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; @@ -476,7 +478,7 @@ void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callback * Useful for "hybrid" target (supports both bio-based * and request-based). 
*/ -void dm_table_set_type(struct dm_table *t, unsigned type); +void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type); /* * Finally call this to make the table ready for use. -- cgit v1.2.3 From ed6473ddc704a2005b9900ca08e236ebb2d8540a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 26 Apr 2017 11:55:27 -0400 Subject: NFSv4: Fix callback server shutdown We want to use kthread_stop() in order to ensure the threads are shut down before we tear down the nfs_callback_info in nfs_callback_down. Tested-and-reviewed-by: Kinglong Mee Reported-by: Kinglong Mee Fixes: bb6aeba736ba9 ("NFSv4.x: Switch to using svc_set_num_threads()...") Signed-off-by: Trond Myklebust Signed-off-by: J. Bruce Fields --- fs/nfs/callback.c | 24 ++++++++++++++++-------- include/linux/sunrpc/svc.h | 1 + net/sunrpc/svc.c | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index c5e27ebd8da8..73a1f928226c 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -76,7 +76,10 @@ nfs4_callback_svc(void *vrqstp) set_freezable(); - while (!kthread_should_stop()) { + while (!kthread_freezable_should_stop(NULL)) { + + if (signal_pending(current)) + flush_signals(current); /* * Listen for a request on the socket */ @@ -85,6 +88,8 @@ nfs4_callback_svc(void *vrqstp) continue; svc_process(rqstp); } + svc_exit_thread(rqstp); + module_put_and_exit(0); return 0; } @@ -103,9 +108,10 @@ nfs41_callback_svc(void *vrqstp) set_freezable(); - while (!kthread_should_stop()) { - if (try_to_freeze()) - continue; + while (!kthread_freezable_should_stop(NULL)) { + + if (signal_pending(current)) + flush_signals(current); prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); @@ -121,11 +127,13 @@ nfs41_callback_svc(void *vrqstp) error); } else { spin_unlock_bh(&serv->sv_cb_lock); - schedule(); + if (!kthread_should_stop()) + schedule(); finish_wait(&serv->sv_cb_waitq, &wq); } - flush_signals(current); } + svc_exit_thread(rqstp); + module_put_and_exit(0); return 0; } @@ -221,14 +229,14 @@ err_bind: static struct svc_serv_ops nfs40_cb_sv_ops = { .svo_function = nfs4_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads, + .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; #if defined(CONFIG_NFS_V4_1) static struct svc_serv_ops nfs41_cb_sv_ops = { .svo_function = nfs41_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads, + .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 6ef19cf658b4..94631026f79c 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -473,6 +473,7 @@ void svc_pool_map_put(void); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, struct svc_serv_ops *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); +int svc_set_num_threads_sync(struct svc_serv *, struct svc_pool *, int); int svc_pool_stats_open(struct svc_serv *serv, struct file *file); void svc_destroy(struct svc_serv *); void svc_shutdown_net(struct svc_serv *, struct net *); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 98dc33ae738b..bc0f5a0ecbdc 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -795,6 +795,44 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) } EXPORT_SYMBOL_GPL(svc_set_num_threads); +/* 
destroy old threads */ +static int +svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +{ + struct task_struct *task; + unsigned int state = serv->sv_nrthreads-1; + + /* destroy old threads */ + do { + task = choose_victim(serv, pool, &state); + if (task == NULL) + break; + kthread_stop(task); + nrservs++; + } while (nrservs < 0); + return 0; +} + +int +svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +{ + if (pool == NULL) { + /* The -1 assumes caller has done a svc_get() */ + nrservs -= (serv->sv_nrthreads-1); + } else { + spin_lock_bh(&pool->sp_lock); + nrservs -= pool->sp_nrthreads; + spin_unlock_bh(&pool->sp_lock); + } + + if (nrservs > 0) + return svc_start_kthreads(serv, pool, nrservs); + if (nrservs < 0) + return svc_stop_kthreads(serv, pool, nrservs); + return 0; +} +EXPORT_SYMBOL_GPL(svc_set_num_threads_sync); + /* * Called from a server thread as it's exiting. Caller must hold the "service * mutex" for the service. -- cgit v1.2.3 From 7b4ccb3c466f62bbf2f4dd5d6a143d945a6f3051 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 25 Apr 2017 19:36:25 +0200 Subject: soc: renesas: Provide dummy rcar_rst_read_mode_pins() for compile-testing If the R-Car RST driver is not included, compile-testing R-Car clock drivers fails with a link error: undefined reference to `rcar_rst_read_mode_pins' To fix this, provide a dummy version. Use the exact same test logic as in drivers/soc/renesas/Makefile, as there is no Kconfig symbol (yet) to control compilation of the R-Car RST driver. Fixes: 527c02f66d263d2e ("soc: renesas: Add R-Car RST driver") Signed-off-by: Geert Uytterhoeven Signed-off-by: Simon Horman --- include/linux/soc/renesas/rcar-rst.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/renesas/rcar-rst.h b/include/linux/soc/renesas/rcar-rst.h index a18e0783946b..787e7ad53d45 100644 --- a/include/linux/soc/renesas/rcar-rst.h +++ b/include/linux/soc/renesas/rcar-rst.h @@ -1,6 +1,11 @@ #ifndef __LINUX_SOC_RENESAS_RCAR_RST_H__ #define __LINUX_SOC_RENESAS_RCAR_RST_H__ +#if defined(CONFIG_ARCH_RCAR_GEN1) || defined(CONFIG_ARCH_RCAR_GEN2) || \ + defined(CONFIG_ARCH_R8A7795) || defined(CONFIG_ARCH_R8A7796) int rcar_rst_read_mode_pins(u32 *mode); +#else +static inline int rcar_rst_read_mode_pins(u32 *mode) { return -ENODEV; } +#endif #endif /* __LINUX_SOC_RENESAS_RCAR_RST_H__ */ -- cgit v1.2.3 From e38a017bf080d47376db340e94b9c2ffc47eb9b4 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Wed, 26 Apr 2017 10:58:47 +0300 Subject: mac80211: Add support for BSS max idle period element Parse the BSS max idle period element and set the BSS configuration accordingly so the driver can use this information to configure the max idle period and to use protected management frames for keep alive when required. 
The BSS max idle period element is defined in IEEE802.11-2016, section 9.4.2.79 Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 28 +++++++++++++++++++++++++++- include/net/mac80211.h | 14 +++++++++++++- net/mac80211/ieee80211_i.h | 1 + net/mac80211/mlme.c | 14 +++++++++++++- net/mac80211/util.c | 9 +++++++++ 5 files changed, 63 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 639e77abf064..69033353d0d1 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -7,7 +7,7 @@ * Copyright (c) 2005, Devicescape Software, Inc. * Copyright (c) 2006, Michael Wu * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH - * Copyright (c) 2016 Intel Deutschland GmbH + * Copyright (c) 2016 - 2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -2316,6 +2316,32 @@ struct ieee80211_timeout_interval_ie { __le32 value; } __packed; +/** + * enum ieee80211_idle_options - BSS idle options + * @WLAN_IDLE_OPTIONS_PROTECTED_KEEP_ALIVE: the station should send an RSN + * protected frame to the AP to reset the idle timer at the AP for + * the station. + */ +enum ieee80211_idle_options { + WLAN_IDLE_OPTIONS_PROTECTED_KEEP_ALIVE = BIT(0), +}; + +/** + * struct ieee80211_bss_max_idle_period_ie + * + * This structure refers to "BSS Max idle period element" + * + * @max_idle_period: indicates the time period during which a station can + * refrain from transmitting frames to its associated AP without being + * disassociated. In units of 1000 TUs. + * @idle_options: indicates the options associated with the BSS idle capability + * as specified in &enum ieee80211_idle_options. + */ +struct ieee80211_bss_max_idle_period_ie { + __le16 max_idle_period; + u8 idle_options; +} __packed; + /* BACK action code */ enum ieee80211_back_actioncode { WLAN_ACTION_ADDBA_REQ = 0, diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 807ee6cd903f..4d05a9443344 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2015 - 2016 Intel Deutschland GmbH + * Copyright (C) 2015 - 2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -299,6 +299,8 @@ struct ieee80211_vif_chanctx_switch { * context had been assigned. * @BSS_CHANGED_OCB: OCB join status changed * @BSS_CHANGED_MU_GROUPS: VHT MU-MIMO group id or user position changed + * @BSS_CHANGED_KEEP_ALIVE: keep alive options (idle period or protected + * keep alive) changed. */ enum ieee80211_bss_change { BSS_CHANGED_ASSOC = 1<<0, @@ -325,6 +327,7 @@ enum ieee80211_bss_change { BSS_CHANGED_BANDWIDTH = 1<<21, BSS_CHANGED_OCB = 1<<22, BSS_CHANGED_MU_GROUPS = 1<<23, + BSS_CHANGED_KEEP_ALIVE = 1<<24, /* when adding here, make sure to change ieee80211_reconfig */ }; @@ -533,6 +536,13 @@ struct ieee80211_mu_group_data { * @allow_p2p_go_ps: indication for AP or P2P GO interface, whether it's allowed * to use P2P PS mechanism or not. AP/P2P GO is not allowed to use P2P PS * if it has associated clients without P2P PS support. 
+ * @max_idle_period: the time period during which the station can refrain from + * transmitting frames to its associated AP without being disassociated. + * In units of 1000 TUs. Zero value indicates that the AP did not include + * a (valid) BSS Max Idle Period Element. + * @protected_keep_alive: if set, indicates that the station should send an RSN + * protected frame to the AP to reset the idle timer at the AP for the + * station. */ struct ieee80211_bss_conf { const u8 *bssid; @@ -573,6 +583,8 @@ struct ieee80211_bss_conf { enum nl80211_tx_power_setting txpower_type; struct ieee80211_p2p_noa_attr p2p_noa_attr; bool allow_p2p_go_ps; + u16 max_idle_period; + bool protected_keep_alive; }; /** diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 2a5730573aa3..f8f6c148f554 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1483,6 +1483,7 @@ struct ieee802_11_elems { const u8 *opmode_notif; const struct ieee80211_sec_chan_offs_ie *sec_chan_offs; const struct ieee80211_mesh_chansw_params_ie *mesh_chansw_params_ie; + const struct ieee80211_bss_max_idle_period_ie *max_idle_period_ie; /* length of them, respectively */ u8 ext_capab_len; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 45d80fe61c5f..89dff563b1ec 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -6,7 +6,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2015 - 2016 Intel Deutschland GmbH + * Copyright (C) 2015 - 2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -3098,6 +3098,18 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } changed |= BSS_CHANGED_QOS; + if (elems.max_idle_period_ie) { + bss_conf->max_idle_period = + le16_to_cpu(elems.max_idle_period_ie->max_idle_period); + bss_conf->protected_keep_alive = + !!(elems.max_idle_period_ie->idle_options & + WLAN_IDLE_OPTIONS_PROTECTED_KEEP_ALIVE); + changed |= BSS_CHANGED_KEEP_ALIVE; + } else { + bss_conf->max_idle_period = 0; + bss_conf->protected_keep_alive = false; + } + /* set AID and assoc capability, * ieee80211_set_associated() will tell the driver */ bss_conf->aid = aid; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 4a5414481b78..bfc28053639b 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -828,6 +828,7 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, case WLAN_EID_EXT_CAPABILITY: case WLAN_EID_CHAN_SWITCH_TIMING: case WLAN_EID_LINK_ID: + case WLAN_EID_BSS_MAX_IDLE_PERIOD: /* * not listing WLAN_EID_CHANNEL_SWITCH_WRAPPER -- it seems possible * that if the content gets bigger it might be needed more than once @@ -1089,6 +1090,10 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, else elem_parse_failed = true; break; + case WLAN_EID_BSS_MAX_IDLE_PERIOD: + if (elen >= sizeof(*elems->max_idle_period_ie)) + elems->max_idle_period_ie = (void *)pos; + break; default: break; } @@ -1983,6 +1988,10 @@ int ieee80211_reconfig(struct ieee80211_local *local) if (sdata->u.mgd.have_beacon) changed |= BSS_CHANGED_BEACON_INFO; + if (sdata->vif.bss_conf.max_idle_period || + sdata->vif.bss_conf.protected_keep_alive) + changed |= BSS_CHANGED_KEEP_ALIVE; + sdata_lock(sdata); ieee80211_bss_info_change_notify(sdata, changed); sdata_unlock(sdata); -- cgit v1.2.3 From 9f993737906b30d7b2454a38637d1f70ffd60f2f Mon Sep 
17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2017 09:54:54 -0600 Subject: blk-mq: unify hctx delayed_run_work and run_work They serve the exact same purpose. Get rid of the non-delayed work variant, and just run it without delay for the normal case. Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-mq.c | 27 ++++++--------------------- include/linux/blk-mq.h | 3 +-- 3 files changed, 8 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 6bd4d1754d29..37939672d4df 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -269,7 +269,7 @@ void blk_sync_queue(struct request_queue *q) int i; queue_for_each_hw_ctx(q, hctx, i) { - cancel_work_sync(&hctx->run_work); + cancel_delayed_work_sync(&hctx->run_work); cancel_delayed_work_sync(&hctx->delay_work); } } else { diff --git a/block/blk-mq.c b/block/blk-mq.c index e6aad49c1686..5c68fce87ffc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1166,13 +1166,9 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, put_cpu(); } - if (msecs == 0) - kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), - &hctx->run_work); - else - kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), - &hctx->delayed_run_work, - msecs_to_jiffies(msecs)); + kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), - &hctx->run_work, + msecs_to_jiffies(msecs)); } void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) @@ -1224,7 +1220,7 @@ EXPORT_SYMBOL(blk_mq_queue_stopped); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { - cancel_work(&hctx->run_work); + cancel_delayed_work_sync(&hctx->run_work); cancel_delayed_work(&hctx->delay_work); set_bit(BLK_MQ_S_STOPPED, &hctx->state); } @@ -1282,17 +1278,7 @@ static void blk_mq_run_work_fn(struct work_struct *work) { struct blk_mq_hw_ctx *hctx; - hctx = container_of(work, struct blk_mq_hw_ctx, run_work); - - __blk_mq_run_hw_queue(hctx); -} - -static void blk_mq_delayed_run_work_fn(struct work_struct *work) -{ - struct blk_mq_hw_ctx *hctx; - - hctx = container_of(work, struct blk_mq_hw_ctx, delayed_run_work.work); - + hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); __blk_mq_run_hw_queue(hctx); } @@ -1898,8 +1884,7 @@ static int blk_mq_init_hctx(struct request_queue *q, if (node == NUMA_NO_NODE) node = hctx->numa_node = set->numa_node; - INIT_WORK(&hctx->run_work, blk_mq_run_work_fn); - INIT_DELAYED_WORK(&hctx->delayed_run_work, blk_mq_delayed_run_work_fn); + INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 32bd8eb5ba67..c7cc90328426 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -15,7 +15,7 @@ struct blk_mq_hw_ctx { unsigned long state; /* BLK_MQ_S_* flags */ } ____cacheline_aligned_in_smp; - struct work_struct run_work; + struct delayed_work run_work; cpumask_var_t cpumask; int next_cpu; int next_cpu_batch; @@ -51,7 +51,6 @@ struct blk_mq_hw_ctx { atomic_t nr_active; - struct delayed_work delayed_run_work; struct delayed_work delay_work; struct hlist_node cpuhp_dead; -- cgit v1.2.3 From 818cd1cbaa7b00bbc35452a76bebc681a65f1912 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2017 09:54:55 -0600 Subject: block: add kblockd_mod_delayed_work_on() This modifies
(or adds, if not currently pending) an existing delayed work item. Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 7 +++++++ include/linux/blkdev.h | 1 + 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 37939672d4df..64b6e58532bf 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3076,6 +3076,13 @@ int kblockd_schedule_work_on(int cpu, struct work_struct *work) } EXPORT_SYMBOL(kblockd_schedule_work_on); +int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, + unsigned long delay) +{ + return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); +} +EXPORT_SYMBOL(kblockd_mod_delayed_work_on); + int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6c247861cb66..d098c66b3ab0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1685,6 +1685,7 @@ int kblockd_schedule_work(struct work_struct *work); int kblockd_schedule_work_on(int cpu, struct work_struct *work); int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay); int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); +int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); #ifdef CONFIG_BLK_CGROUP /* -- cgit v1.2.3 From 21c6e939a9f6bb06fe616a87defec0f92a7c3df0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2017 09:54:56 -0600 Subject: blk-mq: unify hctx delay_work and run_work The only difference between ->run_work and ->delay_work is that the latter is used to defer running a queue. This is done by marking the queue stopped, and scheduling ->delay_work to run sometime in the future. While the queue is stopped, direct runs or runs through ->run_work will not run the queue. If we combine the handlers, then we need to handle two things: 1) If a delayed/stopped run is scheduled, then we should not run the queue before that has been completed. 2) If a queue is delayed/stopped, the handler needs to restart the queue. Normally a run of a queue with the stopped bit set would be a no-op. Case 1 is handled by modifying a currently pending queue run to use the deadline set by the caller of blk_mq_delay_queue(). Subsequent attempts to queue a queue run will find the work item already pending, and direct runs will see a stopped queue as before. Case 2 is handled by adding a new bit, BLK_MQ_S_START_ON_RUN, that tells the work handler that it should clear a stopped queue and run the handler.
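Mapping the two cases above onto the combined handler, the logic (consolidated from the hunk below) is roughly:

static void sketch_run_work_fn(struct work_struct *work)
{
	struct blk_mq_hw_ctx *hctx =
		container_of(work, struct blk_mq_hw_ctx, run_work.work);

	if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) {
		/* Case 1: stopped with no pending delayed restart: a no-op. */
		if (!test_bit(BLK_MQ_S_START_ON_RUN, &hctx->state))
			return;

		/* Case 2: blk_mq_delay_queue() asked us to restart the queue. */
		clear_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
	}

	__blk_mq_run_hw_queue(hctx);
}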
Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +--- block/blk-mq.c | 34 ++++++++++++++++++++++------------ include/linux/blk-mq.h | 3 +-- 3 files changed, 24 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 64b6e58532bf..24886b69690f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -268,10 +268,8 @@ void blk_sync_queue(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; - queue_for_each_hw_ctx(q, hctx, i) { + queue_for_each_hw_ctx(q, hctx, i) cancel_delayed_work_sync(&hctx->run_work); - cancel_delayed_work_sync(&hctx->delay_work); - } } else { cancel_delayed_work_sync(&q->delay_work); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 5c68fce87ffc..a0bdf63aebfe 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1221,7 +1221,6 @@ EXPORT_SYMBOL(blk_mq_queue_stopped); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { cancel_delayed_work_sync(&hctx->run_work); - cancel_delayed_work(&hctx->delay_work); set_bit(BLK_MQ_S_STOPPED, &hctx->state); } EXPORT_SYMBOL(blk_mq_stop_hw_queue); @@ -1279,27 +1278,39 @@ static void blk_mq_run_work_fn(struct work_struct *work) struct blk_mq_hw_ctx *hctx; hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); - __blk_mq_run_hw_queue(hctx); -} -static void blk_mq_delay_work_fn(struct work_struct *work) -{ - struct blk_mq_hw_ctx *hctx; + /* + * If we are stopped, don't run the queue. The exception is if + * BLK_MQ_S_START_ON_RUN is set. For that case, we auto-clear + * the STOPPED bit and run it. + */ + if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) { + if (!test_bit(BLK_MQ_S_START_ON_RUN, &hctx->state)) + return; - hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work); + clear_bit(BLK_MQ_S_START_ON_RUN, &hctx->state); + clear_bit(BLK_MQ_S_STOPPED, &hctx->state); + } - if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state)) - __blk_mq_run_hw_queue(hctx); + __blk_mq_run_hw_queue(hctx); } + void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) { if (unlikely(!blk_mq_hw_queue_mapped(hctx))) return; + /* + * Stop the hw queue, then modify currently delayed work. + * This should prevent us from running the queue prematurely. + * Mark the queue as auto-clearing STOPPED when it runs. 
+ */ blk_mq_stop_hw_queue(hctx); - kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), - &hctx->delay_work, msecs_to_jiffies(msecs)); + set_bit(BLK_MQ_S_START_ON_RUN, &hctx->state); + kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), + &hctx->run_work, + msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_mq_delay_queue); @@ -1885,7 +1896,6 @@ static int blk_mq_init_hctx(struct request_queue *q, node = hctx->numa_node = set->numa_node; INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); - INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); hctx->queue = q; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c7cc90328426..f3e5e1de1bdb 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -51,8 +51,6 @@ struct blk_mq_hw_ctx { atomic_t nr_active; - struct delayed_work delay_work; - struct hlist_node cpuhp_dead; struct kobject kobj; @@ -168,6 +166,7 @@ enum { BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_SCHED_RESTART = 2, BLK_MQ_S_TAG_WAITING = 3, + BLK_MQ_S_START_ON_RUN = 4, BLK_MQ_MAX_DEPTH = 10240, -- cgit v1.2.3 From 984c307878f8924d743c419c79fdebbc19f1285e Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Mon, 27 Mar 2017 15:15:13 +0530 Subject: PCI: Add device IDs for DRA74x and DRA72x Add device IDs for DRA74x and DRA72x devices. These devices have a configurable PCI endpoint. Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Bjorn Helgaas --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index a4f77feecbb0..5f6b71d15393 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -862,6 +862,8 @@ #define PCI_DEVICE_ID_TI_X620 0xac8d #define PCI_DEVICE_ID_TI_X420 0xac8e #define PCI_DEVICE_ID_TI_XX20_FM 0xac8f +#define PCI_DEVICE_ID_TI_DRA74x 0xb500 +#define PCI_DEVICE_ID_TI_DRA72x 0xb501 #define PCI_VENDOR_ID_SONY 0x104d -- cgit v1.2.3 From 140c91b26ebc48d80c6ac3ef06953b17d7fb3785 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Sun, 9 Apr 2017 15:00:19 -0700 Subject: watchdog: iTCO_wdt: Add PMC specific noreboot update api In some SoCs, setting the noreboot bit requires modifying PMC GC registers, but not all PMC drivers allow other drivers to memory-map their GC region, which could create a memory-request conflict in the watchdog driver. So this patch adds a facility that allows PMC drivers to pass a noreboot update function to the watchdog driver via platform data.
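A hypothetical sketch of a PMC driver supplying its own callback through the new platform-data fields; struct example_pmc, EXAMPLE_PMC_NO_REBOOT and example_update_noreboot() are invented for illustration, and only the itco_wdt_platform_data fields come from the patch:

#define EXAMPLE_PMC_NO_REBOOT	BIT(4)		/* hypothetical bit position */

struct example_pmc {
	void __iomem *gc_base;			/* PMC's own mapping of its GC registers */
};

static int example_update_noreboot(void *priv, bool set)
{
	struct example_pmc *pmc = priv;
	u32 val = readl(pmc->gc_base);

	if (set)
		val |= EXAMPLE_PMC_NO_REBOOT;
	else
		val &= ~EXAMPLE_PMC_NO_REBOOT;
	writel(val, pmc->gc_base);
	return 0;
}

static struct example_pmc pmc;

static struct itco_wdt_platform_data example_pdata = {
	.name			= "iTCO_wdt",
	.version		= 4,	/* illustrative */
	.no_reboot_priv		= &pmc,
	.update_no_reboot_bit	= example_update_noreboot,
};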
Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Guenter Roeck Signed-off-by: Andy Shevchenko --- drivers/watchdog/iTCO_wdt.c | 25 ++++++++++++++++++------- include/linux/platform_data/itco_wdt.h | 4 ++++ 2 files changed, 22 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c index a62993381c52..347f0389b089 100644 --- a/drivers/watchdog/iTCO_wdt.c +++ b/drivers/watchdog/iTCO_wdt.c @@ -106,6 +106,8 @@ struct iTCO_wdt_private { struct pci_dev *pci_dev; /* whether or not the watchdog has been suspended */ bool suspended; + /* no reboot API private data */ + void *no_reboot_priv; /* no reboot update function pointer */ int (*update_no_reboot_bit)(void *p, bool set); }; @@ -217,14 +219,23 @@ static int update_no_reboot_bit_mem(void *priv, bool set) return 0; } -static void iTCO_wdt_no_reboot_bit_setup(struct iTCO_wdt_private *p) +static void iTCO_wdt_no_reboot_bit_setup(struct iTCO_wdt_private *p, + struct itco_wdt_platform_data *pdata) { + if (pdata->update_no_reboot_bit) { + p->update_no_reboot_bit = pdata->update_no_reboot_bit; + p->no_reboot_priv = pdata->no_reboot_priv; + return; + } + if (p->iTCO_version >= 2) p->update_no_reboot_bit = update_no_reboot_bit_mem; else if (p->iTCO_version == 1) p->update_no_reboot_bit = update_no_reboot_bit_pci; else p->update_no_reboot_bit = update_no_reboot_bit_def; + + p->no_reboot_priv = p; } static int iTCO_wdt_start(struct watchdog_device *wd_dev) @@ -237,7 +248,7 @@ static int iTCO_wdt_start(struct watchdog_device *wd_dev) iTCO_vendor_pre_start(p->smi_res, wd_dev->timeout); /* disable chipset's NO_REBOOT bit */ - if (p->update_no_reboot_bit(p, false)) { + if (p->update_no_reboot_bit(p->no_reboot_priv, false)) { spin_unlock(&p->io_lock); pr_err("failed to reset NO_REBOOT flag, reboot disabled by hardware/BIOS\n"); return -EIO; @@ -278,7 +289,7 @@ static int iTCO_wdt_stop(struct watchdog_device *wd_dev) val = inw(TCO1_CNT(p)); /* Set the NO_REBOOT bit to prevent later reboots, just for sure */ - p->update_no_reboot_bit(p, true); + p->update_no_reboot_bit(p->no_reboot_priv, true); spin_unlock(&p->io_lock); @@ -443,13 +454,13 @@ static int iTCO_wdt_probe(struct platform_device *pdev) p->iTCO_version = pdata->version; p->pci_dev = to_pci_dev(dev->parent); - iTCO_wdt_no_reboot_bit_setup(p); + iTCO_wdt_no_reboot_bit_setup(p, pdata); /* * Get the Memory-Mapped GCS or PMC register, we need it for the * NO_REBOOT flag (TCO v2 and v3). 
*/ - if (p->iTCO_version >= 2) { + if (p->iTCO_version >= 2 && !pdata->update_no_reboot_bit) { p->gcs_pmc_res = platform_get_resource(pdev, IORESOURCE_MEM, ICH_RES_MEM_GCS_PMC); @@ -459,14 +470,14 @@ } /* Check chipset's NO_REBOOT bit */ - if (p->update_no_reboot_bit(p, false) && + if (p->update_no_reboot_bit(p->no_reboot_priv, false) && iTCO_vendor_check_noreboot_on()) { pr_info("unable to reset NO_REBOOT flag, device disabled by hardware/BIOS\n"); return -ENODEV; /* Cannot reset NO_REBOOT bit */ } /* Set the NO_REBOOT bit to prevent later reboots, just for sure */ - p->update_no_reboot_bit(p, true); + p->update_no_reboot_bit(p->no_reboot_priv, true); /* The TCO logic uses the TCO_EN bit in the SMI_EN register */ if (!devm_request_region(dev, p->smi_res->start, diff --git a/include/linux/platform_data/itco_wdt.h b/include/linux/platform_data/itco_wdt.h index f16542c77ff7..0e95527edf25 100644 --- a/include/linux/platform_data/itco_wdt.h +++ b/include/linux/platform_data/itco_wdt.h @@ -14,6 +14,10 @@ struct itco_wdt_platform_data { char name[32]; unsigned int version; + /* private data to be passed to update_no_reboot_bit API */ + void *no_reboot_priv; + /* pointer for platform specific no reboot update function */ + int (*update_no_reboot_bit)(void *priv, bool set); }; #endif /* _ITCO_WDT_H_ */ -- cgit v1.2.3 From 63ccc191649eb0f14a761074291551d0d2f85389 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Apr 2017 14:26:52 +0200 Subject: libata: remove SCT WRITE SAME support This was already disabled a while ago because it caused I/O errors, and it's severely getting in the way of the discard / write zeroes rework. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Tejun Heo --- drivers/ata/libata-scsi.c | 132 ++++++++++------------------------------------ include/linux/ata.h | 5 -- 2 files changed, 29 insertions(+), 108 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 50f56d066936..49ba9834c715 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -3392,46 +3392,6 @@ static size_t ata_format_dsm_trim_descr(struct scsi_cmnd *cmd, u32 trmax, return r; } -/** - * ata_format_dsm_trim_descr() - SATL Write Same to ATA SCT Write Same - * @cmd: SCSI command being translated - * @lba: Starting sector - * @num: Number of sectors to be zero'd. - * - * Rewrite the WRITE SAME payload to be an SCT Write Same formatted - * descriptor. - * NOTE: Writes a pattern (0's) in the foreground. - * - * Return: Number of bytes copied into sglist.
- */ -static size_t ata_format_sct_write_same(struct scsi_cmnd *cmd, u64 lba, u64 num) -{ - struct scsi_device *sdp = cmd->device; - size_t len = sdp->sector_size; - size_t r; - u16 *buf; - unsigned long flags; - - spin_lock_irqsave(&ata_scsi_rbuf_lock, flags); - buf = ((void *)ata_scsi_rbuf); - - put_unaligned_le16(0x0002, &buf[0]); /* SCT_ACT_WRITE_SAME */ - put_unaligned_le16(0x0101, &buf[1]); /* WRITE PTRN FG */ - put_unaligned_le64(lba, &buf[2]); - put_unaligned_le64(num, &buf[6]); - put_unaligned_le32(0u, &buf[10]); /* pattern */ - - WARN_ON(len > ATA_SCSI_RBUF_SIZE); - - if (len > ATA_SCSI_RBUF_SIZE) - len = ATA_SCSI_RBUF_SIZE; - - r = sg_copy_from_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), buf, len); - spin_unlock_irqrestore(&ata_scsi_rbuf_lock, flags); - - return r; -} - /** * ata_scsi_write_same_xlat() - SATL Write Same to ATA SCT Write Same * @qc: Command to be translated @@ -3476,26 +3436,17 @@ static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc) } scsi_16_lba_len(cdb, &block, &n_block); - if (unmap) { - /* If trim is not enabled the cmd is invalid. */ - if ((dev->horkage & ATA_HORKAGE_NOTRIM) || - !ata_id_has_trim(dev->id)) { - fp = 1; - bp = 3; - goto invalid_fld; - } - /* If the request is too large the cmd is invalid */ - if (n_block > 0xffff * trmax) { - fp = 2; - goto invalid_fld; - } - } else { - /* If write same is not available the cmd is invalid */ - if (!ata_id_sct_write_same(dev->id)) { - fp = 1; - bp = 3; - goto invalid_fld; - } + if (!unmap || + (dev->horkage & ATA_HORKAGE_NOTRIM) || + !ata_id_has_trim(dev->id)) { + fp = 1; + bp = 3; + goto invalid_fld; + } + /* If the request is too large the cmd is invalid */ + if (n_block > 0xffff * trmax) { + fp = 2; + goto invalid_fld; } /* @@ -3510,49 +3461,28 @@ static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc) * For DATA SET MANAGEMENT TRIM in ACS-2 nsect (aka count) * is defined as number of 512 byte blocks to be transferred. 
*/ - if (unmap) { - size = ata_format_dsm_trim_descr(scmd, trmax, block, n_block); - if (size != len) - goto invalid_param_len; - if (ata_ncq_enabled(dev) && ata_fpdma_dsm_supported(dev)) { - /* Newer devices support queued TRIM commands */ - tf->protocol = ATA_PROT_NCQ; - tf->command = ATA_CMD_FPDMA_SEND; - tf->hob_nsect = ATA_SUBCMD_FPDMA_SEND_DSM & 0x1f; - tf->nsect = qc->tag << 3; - tf->hob_feature = (size / 512) >> 8; - tf->feature = size / 512; + size = ata_format_dsm_trim_descr(scmd, trmax, block, n_block); + if (size != len) + goto invalid_param_len; - tf->auxiliary = 1; - } else { - tf->protocol = ATA_PROT_DMA; - tf->hob_feature = 0; - tf->feature = ATA_DSM_TRIM; - tf->hob_nsect = (size / 512) >> 8; - tf->nsect = size / 512; - tf->command = ATA_CMD_DSM; - } - } else { - size = ata_format_sct_write_same(scmd, block, n_block); - if (size != len) - goto invalid_param_len; + if (ata_ncq_enabled(dev) && ata_fpdma_dsm_supported(dev)) { + /* Newer devices support queued TRIM commands */ + tf->protocol = ATA_PROT_NCQ; + tf->command = ATA_CMD_FPDMA_SEND; + tf->hob_nsect = ATA_SUBCMD_FPDMA_SEND_DSM & 0x1f; + tf->nsect = qc->tag << 3; + tf->hob_feature = (size / 512) >> 8; + tf->feature = size / 512; - tf->hob_feature = 0; - tf->feature = 0; - tf->hob_nsect = 0; - tf->nsect = 1; - tf->lbah = 0; - tf->lbam = 0; - tf->lbal = ATA_CMD_STANDBYNOW1; - tf->hob_lbah = 0; - tf->hob_lbam = 0; - tf->hob_lbal = 0; - tf->device = ATA_CMD_STANDBYNOW1; + tf->auxiliary = 1; + } else { tf->protocol = ATA_PROT_DMA; - tf->command = ATA_CMD_WRITE_LOG_DMA_EXT; - if (unlikely(dev->flags & ATA_DFLAG_PIO)) - tf->command = ATA_CMD_WRITE_LOG_EXT; + tf->hob_feature = 0; + tf->feature = ATA_DSM_TRIM; + tf->hob_nsect = (size / 512) >> 8; + tf->nsect = size / 512; + tf->command = ATA_CMD_DSM; } tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE | ATA_TFLAG_LBA48 | @@ -3627,10 +3557,6 @@ static unsigned int ata_scsiop_maint_in(struct ata_scsi_args *args, u8 *rbuf) case START_STOP: supported = 3; break; - case WRITE_SAME_16: - if (!ata_id_sct_write_same(dev->id)) - break; - /* fallthrough: if SCT ... only enable for ZBC */ case ZBC_IN: case ZBC_OUT: if (ata_id_zoned_cap(dev->id) || diff --git a/include/linux/ata.h b/include/linux/ata.h index af6859b3a93d..ad7d9ee89ff0 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -817,11 +817,6 @@ static inline bool ata_id_sct_error_recovery_ctrl(const u16 *id) return id[ATA_ID_SCT_CMD_XPORT] & (1 << 3) ? true : false; } -static inline bool ata_id_sct_write_same(const u16 *id) -{ - return id[ATA_ID_SCT_CMD_XPORT] & (1 << 2) ? true : false; -} - static inline bool ata_id_sct_long_sector_access(const u16 *id) { return id[ATA_ID_SCT_CMD_XPORT] & (1 << 1) ? true : false; -- cgit v1.2.3 From 461a6946b1f93f6720577fb06aa78e8cbd9291c9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 26 Apr 2017 15:46:20 +0200 Subject: iommu: Remove pci.h include from trace/events/iommu.h The include file does not need any PCI specifics, so remove that include. Also fix the places that relied on it. 
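To illustrate the kind of fixup this requires (a hypothetical driver, not one of the files touched here): code that previously picked up PCI declarations transitively through the trace header must now include them explicitly.

#include <linux/device.h>
#include <linux/iommu.h>
#include <linux/pci.h>	/* no longer pulled in indirectly after this change */

/* dev_is_pci() and struct pci_dev now come from the explicit include */
static bool my_iommu_client_is_pci(struct device *dev)
{
	return dev_is_pci(dev);
}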
Signed-off-by: Joerg Roedel --- arch/arm64/mm/dma-mapping.c | 1 + drivers/infiniband/hw/qedr/main.c | 1 + drivers/iommu/fsl_pamu.h | 1 + drivers/iommu/rockchip-iommu.c | 1 + drivers/iommu/tegra-smmu.c | 1 + include/linux/dma-iommu.h | 1 + include/trace/events/iommu.h | 1 - 7 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 81cdb2e844ed..982f85b81624 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -28,6 +28,7 @@ #include #include #include +#include #include diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index b9b47e5cc8b3..33033624cd9b 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/iommu/fsl_pamu.h b/drivers/iommu/fsl_pamu.h index aab723f91f12..c3434f29c967 100644 --- a/drivers/iommu/fsl_pamu.h +++ b/drivers/iommu/fsl_pamu.h @@ -20,6 +20,7 @@ #define __FSL_PAMU_H #include +#include #include diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index 9afcbf79f0b0..0ba303a184dd 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 9305964250ac..eeb19f560a05 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 5725c94b1f12..abd946569515 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -20,6 +20,7 @@ #include #ifdef CONFIG_IOMMU_DMA +#include #include #include diff --git a/include/trace/events/iommu.h b/include/trace/events/iommu.h index 2c7befb10f13..99254ed89212 100644 --- a/include/trace/events/iommu.h +++ b/include/trace/events/iommu.h @@ -11,7 +11,6 @@ #define _TRACE_IOMMU_H #include -#include struct device; -- cgit v1.2.3 From 208480bb273e15f42711bd47f70dc0fbfa2570b8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 26 Apr 2017 15:49:57 +0200 Subject: iommu: Remove trace-events include from iommu.h It is not needed there anymore. All places needing it are fixed. Signed-off-by: Joerg Roedel --- drivers/media/platform/mtk-vpu/mtk_vpu.c | 1 + include/linux/iommu.h | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/media/platform/mtk-vpu/mtk_vpu.c b/drivers/media/platform/mtk-vpu/mtk_vpu.c index 463b69c934be..e011c8dc0198 100644 --- a/drivers/media/platform/mtk-vpu/mtk_vpu.c +++ b/drivers/media/platform/mtk-vpu/mtk_vpu.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "mtk_vpu.h" diff --git a/include/linux/iommu.h b/include/linux/iommu.h index abaa0ca848bc..dda8717545e9 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -26,8 +26,6 @@ #include #include -#include - #define IOMMU_READ (1 << 0) #define IOMMU_WRITE (1 << 1) #define IOMMU_CACHE (1 << 2) /* DMA cache coherency */ -- cgit v1.2.3 From 23f4984483623cf8621246004228f08fcabf51e4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 29 Apr 2017 15:24:03 -0700 Subject: libnvdimm: rework region badblocks clearing Toshi noticed that the new support for a region-level badblocks missed the case where errors are cleared due to BTT I/O. 
An initial attempt to fix this ran into a "sleeping while atomic" warning due to taking the nvdimm_bus_lock() in the BTT I/O path to satisfy the locking requirements of __nvdimm_bus_badblocks_clear(). However, that lock is not needed since we are not acting on any data that is subject to change under that lock. The badblocks instance has its own internal lock to handle mutations of the error list. So, in order to make it clear that we are just acting on region devices, rename __nvdimm_bus_badblocks_clear() to nvdimm_clear_badblocks_regions(). Eliminate the lock and consolidate all support routines for the new nvdimm_account_cleared_poison() in drivers/nvdimm/bus.c. Finally, take the opportunity to clean up some unnecessary casts, make the calling convention of nvdimm_clear_badblocks_regions() clearer by replacing struct resource with the minimal struct clear_badblocks_context, and use the DEVICE_ATTR macro. Cc: Dave Jiang Cc: Vishal Verma Reported-by: Toshi Kani Signed-off-by: Dan Williams --- drivers/nvdimm/bus.c | 76 +++++++++++++++++++++++++++++++------------- drivers/nvdimm/region.c | 25 --------------- drivers/nvdimm/region_devs.c | 15 +++------ include/linux/libnvdimm.h | 3 -- 4 files changed, 59 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 43ddfd487c85..e9361bffe5ee 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -172,6 +172,57 @@ void nvdimm_region_notify(struct nd_region *nd_region, enum nvdimm_event event) } EXPORT_SYMBOL_GPL(nvdimm_region_notify); +struct clear_badblocks_context { + resource_size_t phys, cleared; +}; + +static int nvdimm_clear_badblocks_region(struct device *dev, void *data) +{ + struct clear_badblocks_context *ctx = data; + struct nd_region *nd_region; + resource_size_t ndr_end; + sector_t sector; + + /* make sure device is a region */ + if (!is_nd_pmem(dev)) + return 0; + + nd_region = to_nd_region(dev); + ndr_end = nd_region->ndr_start + nd_region->ndr_size - 1; + + /* make sure we are in the region */ + if (ctx->phys < nd_region->ndr_start + || (ctx->phys + ctx->cleared) > ndr_end) + return 0; + + sector = (ctx->phys - nd_region->ndr_start) / 512; + badblocks_clear(&nd_region->bb, sector, ctx->cleared / 512); + + return 0; +} + +static void nvdimm_clear_badblocks_regions(struct nvdimm_bus *nvdimm_bus, + phys_addr_t phys, u64 cleared) +{ + struct clear_badblocks_context ctx = { + .phys = phys, + .cleared = cleared, + }; + + device_for_each_child(&nvdimm_bus->dev, &ctx, + nvdimm_clear_badblocks_region); +} + +static void nvdimm_account_cleared_poison(struct nvdimm_bus *nvdimm_bus, + phys_addr_t phys, u64 cleared) +{ + if (cleared > 0) + nvdimm_forget_poison(nvdimm_bus, phys, cleared); + + if (cleared > 0 && cleared / 512) + nvdimm_clear_badblocks_regions(nvdimm_bus, phys, cleared); +} + long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, unsigned int len) { @@ -219,22 +270,12 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, if (cmd_rc < 0) return cmd_rc; - if (clear_err.cleared > 0) - nvdimm_forget_poison(nvdimm_bus, phys, clear_err.cleared); + nvdimm_account_cleared_poison(nvdimm_bus, phys, clear_err.cleared); return clear_err.cleared; } EXPORT_SYMBOL_GPL(nvdimm_clear_poison); -void __nvdimm_bus_badblocks_clear(struct nvdimm_bus *nvdimm_bus, - struct resource *res) -{ - lockdep_assert_held(&nvdimm_bus->reconfig_mutex); - device_for_each_child(&nvdimm_bus->dev, (void *)res, - nvdimm_region_badblocks_clear);
-EXPORT_SYMBOL_GPL(__nvdimm_bus_badblocks_clear); - static int nvdimm_bus_match(struct device *dev, struct device_driver *drv); static struct bus_type nvdimm_bus_type = { @@ -989,18 +1030,9 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR && cmd_rc >= 0) { struct nd_cmd_clear_error *clear_err = buf; - struct resource res; - - if (clear_err->cleared) { - /* clearing the poison list we keep track of */ - nvdimm_forget_poison(nvdimm_bus, clear_err->address, - clear_err->cleared); - /* now sync the badblocks lists */ - res.start = clear_err->address; - res.end = clear_err->address + clear_err->cleared - 1; - __nvdimm_bus_badblocks_clear(nvdimm_bus, &res); - } + nvdimm_account_cleared_poison(nvdimm_bus, clear_err->address, + clear_err->cleared); } nvdimm_bus_unlock(&nvdimm_bus->dev); diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c index 23c4307d254c..869a886c292e 100644 --- a/drivers/nvdimm/region.c +++ b/drivers/nvdimm/region.c @@ -131,31 +131,6 @@ static void nd_region_notify(struct device *dev, enum nvdimm_event event) device_for_each_child(dev, &event, child_notify); } -int nvdimm_region_badblocks_clear(struct device *dev, void *data) -{ - struct resource *res = (struct resource *)data; - struct nd_region *nd_region; - resource_size_t ndr_end; - sector_t sector; - - /* make sure device is a region */ - if (!is_nd_pmem(dev)) - return 0; - - nd_region = to_nd_region(dev); - ndr_end = nd_region->ndr_start + nd_region->ndr_size - 1; - - /* make sure we are in the region */ - if (res->start < nd_region->ndr_start || res->end > ndr_end) - return 0; - - sector = (res->start - nd_region->ndr_start) >> 9; - badblocks_clear(&nd_region->bb, sector, resource_size(res) >> 9); - - return 0; -} -EXPORT_SYMBOL_GPL(nvdimm_region_badblocks_clear); - static struct nd_device_driver nd_region_driver = { .probe = nd_region_probe, .remove = nd_region_remove, diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 53d1ba4e6d99..07756b2e1cd5 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -477,20 +477,15 @@ static ssize_t read_only_store(struct device *dev, } static DEVICE_ATTR_RW(read_only); -static ssize_t nd_badblocks_show(struct device *dev, +static ssize_t region_badblocks_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nd_region *nd_region = to_nd_region(dev); return badblocks_show(&nd_region->bb, buf, 0); } -static struct device_attribute dev_attr_nd_badblocks = { - .attr = { - .name = "badblocks", - .mode = S_IRUGO - }, - .show = nd_badblocks_show, -}; + +static DEVICE_ATTR(badblocks, 0444, region_badblocks_show, NULL); static ssize_t resource_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -514,7 +509,7 @@ static struct attribute *nd_region_attributes[] = { &dev_attr_available_size.attr, &dev_attr_namespace_seed.attr, &dev_attr_init_namespaces.attr, - &dev_attr_nd_badblocks.attr, + &dev_attr_badblocks.attr, &dev_attr_resource.attr, NULL, }; @@ -532,7 +527,7 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n) if (!is_nd_pmem(dev) && a == &dev_attr_dax_seed.attr) return 0; - if (!is_nd_pmem(dev) && a == &dev_attr_nd_badblocks.attr) + if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr) return 0; if (!is_nd_pmem(dev) && a == &dev_attr_resource.attr) diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 98b207611b06..f07b1b14159a 100644 --- a/include/linux/libnvdimm.h +++ 
b/include/linux/libnvdimm.h @@ -162,7 +162,4 @@ void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane); u64 nd_fletcher64(void *addr, size_t len, bool le); void nvdimm_flush(struct nd_region *nd_region); int nvdimm_has_flush(struct nd_region *nd_region); -int nvdimm_region_badblocks_clear(struct device *dev, void *data); -void __nvdimm_bus_badblocks_clear(struct nvdimm_bus *nvdimm_bus, - struct resource *res); #endif /* __LIBNVDIMM_H__ */ -- cgit v1.2.3 From 4bfd036221c0bf75a0f475b05e22f7be9abc3101 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 4 Apr 2017 14:43:34 -0700 Subject: fscrypt: remove fscrypt_symlink_data_len() fscrypt_symlink_data_len() is never called and can be removed. Signed-off-by: Eric Biggers Reviewed-by: Richard Weinberger Signed-off-by: Theodore Ts'o --- include/linux/fscrypt_common.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h index 10c1abfbac6c..0a30c106c1e5 100644 --- a/include/linux/fscrypt_common.h +++ b/include/linux/fscrypt_common.h @@ -46,17 +46,6 @@ struct fscrypt_symlink_data { char encrypted_path[1]; } __packed; -/** - * This function is used to calculate the disk space required to - * store a filename of length l in encrypted symlink format. - */ -static inline u32 fscrypt_symlink_data_len(u32 l) - { - if (l < FS_CRYPTO_BLOCK_SIZE) - l = FS_CRYPTO_BLOCK_SIZE; - return (l + sizeof(struct fscrypt_symlink_data) - 1); -} - struct fscrypt_str { unsigned char *name; u32 len; -- cgit v1.2.3 From f6dfb4c3f2161c23ab2939dd1b5f133dcdf147c6 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Fri, 24 Feb 2017 12:16:33 +0200 Subject: net/mlx5e: Update neighbour 'used' state using HW flow rules counters When IP tunnel encapsulation rules are offloaded, the kernel can't see the traffic of the offloaded flow. The neighbour for the IP tunnel destination of the offloaded flow can mistakenly become STALE and be deleted by the kernel since its 'used' value wasn't changed. To make sure that a neighbour which is used by the HW won't become STALE, we proactively update the neighbour 'used' value every DELAY_PROBE_TIME period, when packets were matched and counted by the HW for one of the tunnel encap flows related to this neighbour. The periodic task that updates the used neighbours is scheduled when a tunnel encap rule is successfully offloaded into HW and keeps re-scheduling itself as long as the representor's neighbours list isn't empty. Add, remove, lookup and status change operations done over the representor's neighbours list or the neighbour hash entry encaps list are all serialized by the RTNL lock.
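Distilled from the patch below (a sketch, not the verbatim driver code; it assumes <linux/jiffies.h>, <net/neighbour.h> and the driver-private mlx5_fc declarations), the per-neighbour check done by the periodic stats work amounts to:

/* Refresh a neighbour's 'used' state when the HW counter moved.
 * See mlx5e_tc_update_neigh_used_value() in the diff below for the
 * real version, which walks all encap flows of the hash entry. */
static void sketch_refresh_neigh(struct neighbour *n, struct mlx5_fc *counter,
				 unsigned long *reported_lastuse)
{
	u64 bytes, packets, lastuse;

	mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse);
	if (time_after((unsigned long)lastuse, *reported_lastuse)) {
		/* mark the entry used so the kernel does not age it to STALE */
		neigh_event_send(n, NULL);
		*reported_lastuse = jiffies;
	}
}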
Signed-off-by: Hadar Hen Zion Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 52 +++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/en_rep.h | 11 ++++ drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 58 ++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/en_tc.h | 3 ++ drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 5 ++ .../net/ethernet/mellanox/mlx5/core/fs_counters.c | 24 ++++++++- include/linux/mlx5/driver.h | 1 + 7 files changed, 152 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 730de6b7e46e..af61b10b85bf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -41,6 +41,7 @@ #include "en.h" #include "en_rep.h" #include "en_tc.h" +#include "fs_core.h" static const char mlx5e_rep_driver_name[] = "mlx5e_rep"; @@ -226,6 +227,51 @@ void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv) mlx5_eswitch_sqs2vport_stop(esw, rep); } +static void mlx5e_rep_neigh_update_init_interval(struct mlx5e_rep_priv *rpriv) +{ +#if IS_ENABLED(CONFIG_IPV6) + unsigned long ipv6_interval = NEIGH_VAR(&ipv6_stub->nd_tbl->parms, + DELAY_PROBE_TIME); +#else + unsigned long ipv6_interval = ~0UL; +#endif + unsigned long ipv4_interval = NEIGH_VAR(&arp_tbl.parms, + DELAY_PROBE_TIME); + struct net_device *netdev = rpriv->rep->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); + + rpriv->neigh_update.min_interval = min_t(unsigned long, ipv6_interval, ipv4_interval); + mlx5_fc_update_sampling_interval(priv->mdev, rpriv->neigh_update.min_interval); +} + +void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv) +{ + struct mlx5e_rep_priv *rpriv = priv->ppriv; + struct mlx5e_neigh_update_table *neigh_update = &rpriv->neigh_update; + + mlx5_fc_queue_stats_work(priv->mdev, + &neigh_update->neigh_stats_work, + neigh_update->min_interval); +} + +static void mlx5e_rep_neigh_stats_work(struct work_struct *work) +{ + struct mlx5e_rep_priv *rpriv = container_of(work, struct mlx5e_rep_priv, + neigh_update.neigh_stats_work.work); + struct net_device *netdev = rpriv->rep->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_neigh_hash_entry *nhe; + + rtnl_lock(); + if (!list_empty(&rpriv->neigh_update.neigh_list)) + mlx5e_rep_queue_neigh_stats_work(priv); + + list_for_each_entry(nhe, &rpriv->neigh_update.neigh_list, neigh_list) + mlx5e_tc_update_neigh_used_value(nhe); + + rtnl_unlock(); +} + static void mlx5e_rep_neigh_entry_hold(struct mlx5e_neigh_hash_entry *nhe) { refcount_inc(&nhe->refcnt); @@ -325,6 +371,7 @@ static int mlx5e_rep_netevent_event(struct notifier_block *nb, return NOTIFY_DONE; m_neigh.dev = n->dev; + m_neigh.family = n->ops->family; memcpy(&m_neigh.dst_ip, n->primary_key, n->tbl->key_len); /* We are in atomic context and can't take RTNL mutex, so use @@ -378,6 +425,9 @@ static int mlx5e_rep_neigh_init(struct mlx5e_rep_priv *rpriv) INIT_LIST_HEAD(&neigh_update->neigh_list); spin_lock_init(&neigh_update->encap_lock); + INIT_DELAYED_WORK(&neigh_update->neigh_stats_work, + mlx5e_rep_neigh_stats_work); + mlx5e_rep_neigh_update_init_interval(rpriv); rpriv->neigh_update.netevent_nb.notifier_call = mlx5e_rep_netevent_event; err = register_netevent_notifier(&rpriv->neigh_update.netevent_nb); @@ -399,6 +449,8 @@ static void mlx5e_rep_neigh_cleanup(struct mlx5e_rep_priv *rpriv) flush_workqueue(priv->wq); /* flush neigh 
update works */ + cancel_delayed_work_sync(&rpriv->neigh_update.neigh_stats_work); + rhashtable_destroy(&neigh_update->neigh_ht); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h index e4d0ea5246fd..a0a1a7a1d6c0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h @@ -48,6 +48,8 @@ struct mlx5e_neigh_update_table { /* protect lookup/remove operations */ spinlock_t encap_lock; struct notifier_block netevent_nb; + struct delayed_work neigh_stats_work; + unsigned long min_interval; /* jiffies */ }; struct mlx5e_rep_priv { @@ -61,6 +63,7 @@ struct mlx5e_neigh { __be32 v4; struct in6_addr v6; } dst_ip; + int family; }; struct mlx5e_neigh_hash_entry { @@ -87,6 +90,12 @@ struct mlx5e_neigh_hash_entry { * it's used by the neigh notification call. */ refcount_t refcnt; + + /* Save the last reported time offloaded traffic passed over one of the + * neigh hash entry flows. Use it to periodically update the neigh + * 'used' value and avoid neigh deletion by the kernel. + */ + unsigned long reported_lastuse; }; enum { @@ -131,4 +140,6 @@ int mlx5e_rep_encap_entry_attach(struct mlx5e_priv *priv, void mlx5e_rep_encap_entry_detach(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e); +void mlx5e_rep_queue_neigh_stats_work(struct mlx5e_priv *priv); + #endif /* __MLX5E_REP_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 624dbfe31a0e..11c27e4fadf6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -44,6 +44,7 @@ #include #include #include +#include #include "en.h" #include "en_rep.h" #include "en_tc.h" @@ -278,6 +279,7 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv, return; } e->flags |= MLX5_ENCAP_ENTRY_VALID; + mlx5e_rep_queue_neigh_stats_work(priv); list_for_each_entry(flow, &e->flows, encap) { flow->esw_attr->encap_id = e->encap_id; @@ -315,6 +317,58 @@ void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv, } } +void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe) +{ + struct mlx5e_neigh *m_neigh = &nhe->m_neigh; + u64 bytes, packets, lastuse = 0; + struct mlx5e_tc_flow *flow; + struct mlx5e_encap_entry *e; + struct mlx5_fc *counter; + struct neigh_table *tbl; + bool neigh_used = false; + struct neighbour *n; + + if (m_neigh->family == AF_INET) + tbl = &arp_tbl; +#if IS_ENABLED(CONFIG_IPV6) + else if (m_neigh->family == AF_INET6) + tbl = ipv6_stub->nd_tbl; +#endif + else + return; + + list_for_each_entry(e, &nhe->encap_list, encap_list) { + if (!(e->flags & MLX5_ENCAP_ENTRY_VALID)) + continue; + list_for_each_entry(flow, &e->flows, encap) { + if (flow->flags & MLX5E_TC_FLOW_OFFLOADED) { + counter = mlx5_flow_rule_counter(flow->rule); + mlx5_fc_query_cached(counter, &bytes, &packets, &lastuse); + if (time_after((unsigned long)lastuse, nhe->reported_lastuse)) { + neigh_used = true; + break; + } + } + } + } + + if (neigh_used) { + nhe->reported_lastuse = jiffies; + + /* find the relevant neigh according to the cached device and + * dst ip pair + */ + n = neigh_lookup(tbl, &m_neigh->dst_ip, m_neigh->dev); + if (!n) { + WARN(1, "The neighbour already freed\n"); + return; + } + + neigh_event_send(n, NULL); + neigh_release(n); + } +} + static void mlx5e_detach_encap(struct mlx5e_priv *priv, struct mlx5e_tc_flow *flow) { @@ -1315,6 +1369,7 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, * entry
in the neigh hash table when a user deletes a rule */ e->m_neigh.dev = n->dev; + e->m_neigh.family = n->ops->family; memcpy(&e->m_neigh.dst_ip, n->primary_key, n->tbl->key_len); e->out_dev = out_dev; @@ -1359,6 +1414,7 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv, goto destroy_neigh_entry; e->flags |= MLX5_ENCAP_ENTRY_VALID; + mlx5e_rep_queue_neigh_stats_work(netdev_priv(out_dev)); neigh_release(n); return err; @@ -1418,6 +1474,7 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv, * entry in the neigh hash table when a user deletes a rule */ e->m_neigh.dev = n->dev; + e->m_neigh.family = n->ops->family; memcpy(&e->m_neigh.dst_ip, n->primary_key, n->tbl->key_len); e->out_dev = out_dev; @@ -1463,6 +1520,7 @@ static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv, goto destroy_neigh_entry; e->flags |= MLX5_ENCAP_ENTRY_VALID; + mlx5e_rep_queue_neigh_stats_work(netdev_priv(out_dev)); neigh_release(n); return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h index 278c7a646a55..ecbe30d808ae 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h @@ -52,6 +52,9 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv, void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e); +struct mlx5e_neigh_hash_entry; +void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe); + static inline int mlx5e_tc_num_filters(struct mlx5e_priv *priv) { return atomic_read(&priv->fs.tc.ht.nelems); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 577d056bf3df..81eafc7b9dd9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -199,6 +199,11 @@ struct mlx5_flow_root_namespace { int mlx5_init_fc_stats(struct mlx5_core_dev *dev); void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev); +void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev, + struct delayed_work *dwork, + unsigned long delay); +void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev, + unsigned long interval); int mlx5_init_fs(struct mlx5_core_dev *dev); void mlx5_cleanup_fs(struct mlx5_core_dev *dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c index 7431f633de31..6507d8acc54d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -165,7 +165,8 @@ static void mlx5_fc_stats_work(struct work_struct *work) list_splice_tail_init(&fc_stats->addlist, &tmplist); if (!list_empty(&tmplist) || !RB_EMPTY_ROOT(&fc_stats->counters)) - queue_delayed_work(fc_stats->wq, &fc_stats->work, MLX5_FC_STATS_PERIOD); + queue_delayed_work(fc_stats->wq, &fc_stats->work, + fc_stats->sampling_interval); spin_unlock(&fc_stats->addlist_lock); @@ -200,7 +201,7 @@ static void mlx5_fc_stats_work(struct work_struct *work) node = mlx5_fc_stats_query(dev, counter, last->id); } - fc_stats->next_query = now + MLX5_FC_STATS_PERIOD; + fc_stats->next_query = now + fc_stats->sampling_interval; } struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging) @@ -265,6 +266,7 @@ int mlx5_init_fc_stats(struct mlx5_core_dev *dev) if (!fc_stats->wq) return -ENOMEM; + fc_stats->sampling_interval = MLX5_FC_STATS_PERIOD; INIT_DELAYED_WORK(&fc_stats->work, mlx5_fc_stats_work); return 
0; @@ -317,3 +319,21 @@ void mlx5_fc_query_cached(struct mlx5_fc *counter, counter->lastbytes = c.bytes; counter->lastpackets = c.packets; } + +void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev, + struct delayed_work *dwork, + unsigned long delay) +{ + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + + queue_delayed_work(fc_stats->wq, dwork, delay); +} + +void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev, + unsigned long interval) +{ + struct mlx5_fc_stats *fc_stats = &dev->priv.fc_stats; + + fc_stats->sampling_interval = min_t(unsigned long, interval, + fc_stats->sampling_interval); +} diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f50864626230..3fece51dcf13 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -540,6 +540,7 @@ struct mlx5_fc_stats { struct workqueue_struct *wq; struct delayed_work work; unsigned long next_query; + unsigned long sampling_interval; /* jiffies */ }; struct mlx5_eswitch; -- cgit v1.2.3 From 71389703839ebe9cb426c72d5f0bd549592e583c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 28 Apr 2017 10:23:37 -0700 Subject: mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The x86 conversion to the generic GUP code included a small change which causes crashes and data corruption in the pmem code - not good. The root cause is that the /dev/pmem driver code implicitly relies on the x86 get_user_pages() implementation doing a get_page() on the page refcount, because get_page() does a get_zone_device_page() which properly refcounts pmem's separate page struct arrays that are not present in the regular page struct structures. (The pmem driver does this because it can cover huge memory areas.) But the x86 conversion to the generic GUP code changed the get_page() to page_cache_get_speculative() which is faster but doesn't do the get_zone_device_page() call the pmem code relies on. One way to solve the regression would be to change the generic GUP code to use get_page(), but that would slow things down a bit and punish other generic-GUP using architectures for an x86-ism they did not care about. (Arguably the pmem driver was probably not working reliably for them: but nvdimm is an Intel feature, so non-x86 exposure is probably still limited.) So restructure the pmem code's interface with the MM instead: get rid of the get/put_zone_device_page() distinction, integrate put_zone_device_page() into __put_page() and restructure the pmem completion-wait and teardown machinery: Kirill points out that the calls to {get,put}_dev_pagemap() can be removed from the mm fast path if we take a single get_dev_pagemap() reference to signify that the page is alive and use the final put of the page to drop that reference. This does require some care to make sure that any waits for the percpu_ref to drop to zero occur *after* devm_memremap_pages_release(), since it now maintains its own elevated reference. This speeds up things while also making the pmem refcounting more robust going forward. Suggested-by: Kirill Shutemov Tested-by: Kirill Shutemov Signed-off-by: Dan Williams Reviewed-by: Logan Gunthorpe Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Josh Poimboeuf Cc: Jérôme Glisse Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/149339998297.24933.1129582806028305912.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar --- drivers/dax/pmem.c | 2 +- drivers/nvdimm/pmem.c | 13 +++++++++++-- include/linux/mm.h | 14 -------------- kernel/memremap.c | 22 +++++++++------------- mm/swap.c | 10 ++++++++++ 5 files changed, 31 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c index 033f49b31fdc..cb0d742fa23f 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem.c @@ -43,6 +43,7 @@ static void dax_pmem_percpu_exit(void *data) struct dax_pmem *dax_pmem = to_dax_pmem(ref); dev_dbg(dax_pmem->dev, "%s\n", __func__); + wait_for_completion(&dax_pmem->cmp); percpu_ref_exit(ref); } @@ -53,7 +54,6 @@ static void dax_pmem_percpu_kill(void *data) dev_dbg(dax_pmem->dev, "%s\n", __func__); percpu_ref_kill(ref); - wait_for_completion(&dax_pmem->cmp); } static int dax_pmem_probe(struct device *dev) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 5b536be5a12e..fb7bbc79ac26 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -231,6 +232,11 @@ static void pmem_release_queue(void *q) blk_cleanup_queue(q); } +static void pmem_freeze_queue(void *q) +{ + blk_mq_freeze_queue_start(q); +} + static void pmem_release_disk(void *disk) { del_gendisk(disk); @@ -284,6 +290,9 @@ static int pmem_attach_disk(struct device *dev, if (!q) return -ENOMEM; + if (devm_add_action_or_reset(dev, pmem_release_queue, q)) + return -ENOMEM; + pmem->pfn_flags = PFN_DEV; if (is_nd_pfn(dev)) { addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter, @@ -303,10 +312,10 @@ static int pmem_attach_disk(struct device *dev, pmem->size, ARCH_MEMREMAP_PMEM); /* - * At release time the queue must be dead before + * At release time the queue must be frozen before * devm_memremap_pages is unwound */ - if (devm_add_action_or_reset(dev, pmem_release_queue, q)) + if (devm_add_action_or_reset(dev, pmem_freeze_queue, q)) return -ENOMEM; if (IS_ERR(addr)) diff --git a/include/linux/mm.h b/include/linux/mm.h index a835edd2db34..695da2a19b4c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -762,19 +762,11 @@ static inline enum zone_type page_zonenum(const struct page *page) } #ifdef CONFIG_ZONE_DEVICE -void get_zone_device_page(struct page *page); -void put_zone_device_page(struct page *page); static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } #else -static inline void get_zone_device_page(struct page *page) -{ -} -static inline void put_zone_device_page(struct page *page) -{ -} static inline bool is_zone_device_page(const struct page *page) { return false; @@ -790,9 +782,6 @@ static inline void get_page(struct page *page) */ VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); page_ref_inc(page); - - if (unlikely(is_zone_device_page(page))) - get_zone_device_page(page); } static inline void put_page(struct page *page) @@ -801,9 +790,6 @@ static inline void put_page(struct page *page) if (put_page_testzero(page)) __put_page(page); - - if (unlikely(is_zone_device_page(page))) - put_zone_device_page(page); } #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) diff --git a/kernel/memremap.c b/kernel/memremap.c index 07e85e5229da..23a6483c3666 100644 --- 
a/kernel/memremap.c +++ b/kernel/memremap.c @@ -182,18 +182,6 @@ struct page_map { struct vmem_altmap altmap; }; -void get_zone_device_page(struct page *page) -{ - percpu_ref_get(page->pgmap->ref); -} -EXPORT_SYMBOL(get_zone_device_page); - -void put_zone_device_page(struct page *page) -{ - put_dev_pagemap(page->pgmap); -} -EXPORT_SYMBOL(put_zone_device_page); - static void pgmap_radix_release(struct resource *res) { resource_size_t key, align_start, align_size, align_end; @@ -237,6 +225,10 @@ static void devm_memremap_pages_release(struct device *dev, void *data) struct resource *res = &page_map->res; resource_size_t align_start, align_size; struct dev_pagemap *pgmap = &page_map->pgmap; + unsigned long pfn; + + for_each_device_pfn(pfn, page_map) + put_page(pfn_to_page(pfn)); if (percpu_ref_tryget_live(pgmap->ref)) { dev_WARN(dev, "%s: page mapping is still live!\n", __func__); @@ -277,7 +269,10 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) * * Notes: * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time - * (or devm release event). + * (or devm release event). The expected order of events is that @ref has + * been through percpu_ref_kill() before devm_memremap_pages_release(). The + * wait for the completion of all references being dropped and + * percpu_ref_exit() must occur after devm_memremap_pages_release(). * * 2/ @res is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but @@ -379,6 +374,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, */ list_del(&page->lru); page->pgmap = pgmap; + percpu_ref_get(ref); } devres_add(dev, page_map); return __va(res->start); diff --git a/mm/swap.c b/mm/swap.c index c4910f14f957..a4e6113276b5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -97,6 +97,16 @@ static void __put_compound_page(struct page *page) void __put_page(struct page *page) { + if (is_zone_device_page(page)) { + put_dev_pagemap(page->pgmap); + + /* + * The page belongs to the device that created pgmap. Do + * not return it to page allocator. + */ + return; + } + if (unlikely(PageCompound(page))) __put_compound_page(page); else -- cgit v1.2.3 From 917362135b8a5c0680acf08807e9fc6179eb6c79 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 14 Apr 2017 20:32:49 +0200 Subject: power: supply: max17042_battery: Add default platform_data fallback data Some x86 machines use a max17047 fuel-gauge, and x86 might be missing platform_data if it is not provided by SFI. This commit adds default platform_data as a fallback option so that the driver can work on boards where no platform_data is provided. Since not all boards have a thermistor hooked up, set temp_min to 0 and change the health checks from temp <= temp_min to temp < temp_min to not trigger on such boards (where temp reads 0).
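For contrast, a hypothetical board file can still pass explicit platform data, which the new fallback never overrides (the values below are made up; the field names are the ones the driver uses):

#include <linux/power/max17042_battery.h>

static struct max17042_platform_data my_board_fg_pdata = {
	.enable_current_sense	= true,
	.vmin			= 3000,	/* mV */
	.vmax			= 4350,	/* mV, board-specific */
	.temp_min		= 0,	/* tenths of a degree Celsius */
	.temp_max		= 600,	/* 60 degrees Celsius */
};

Since max17042_get_pdata() returns dev->platform_data first when it is set, the in-driver defaults only kick in when nothing was provided.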
Signed-off-by: Hans de Goede Reviewed-by: Krzysztof Kozlowski Signed-off-by: Sebastian Reichel --- drivers/power/supply/max17042_battery.c | 60 +++++++++++++++++++++++++++++---- include/linux/power/max17042_battery.h | 6 +++- 2 files changed, 58 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/supply/max17042_battery.c b/drivers/power/supply/max17042_battery.c index a51b2965cd5c..f0ff6e880ff2 100644 --- a/drivers/power/supply/max17042_battery.c +++ b/drivers/power/supply/max17042_battery.c @@ -150,12 +150,12 @@ static int max17042_get_battery_health(struct max17042_chip *chip, int *health) if (ret < 0) goto health_error; - if (temp <= chip->pdata->temp_min) { + if (temp < chip->pdata->temp_min) { *health = POWER_SUPPLY_HEALTH_COLD; goto out; } - if (temp >= chip->pdata->temp_max) { + if (temp > chip->pdata->temp_max) { *health = POWER_SUPPLY_HEALTH_OVERHEAT; goto out; } @@ -772,8 +772,9 @@ static void max17042_init_worker(struct work_struct *work) #ifdef CONFIG_OF static struct max17042_platform_data * -max17042_get_pdata(struct device *dev) +max17042_get_pdata(struct max17042_chip *chip) { + struct device *dev = &chip->client->dev; struct device_node *np = dev->of_node; u32 prop; struct max17042_platform_data *pdata; @@ -806,10 +807,55 @@ max17042_get_pdata(struct device *dev) return pdata; } #else +static struct max17042_reg_data max17047_default_pdata_init_regs[] = { + /* + * Some firmwares do not set FullSOCThr; enable End-of-Charge Detection + * when the voltage FG reports 95%, as recommended in the datasheet. + */ + { MAX17047_FullSOCThr, MAX17042_BATTERY_FULL << 8 }, +}; + static struct max17042_platform_data * -max17042_get_pdata(struct device *dev) +max17042_get_pdata(struct max17042_chip *chip) { - return dev->platform_data; + struct device *dev = &chip->client->dev; + struct max17042_platform_data *pdata; + int ret, misc_cfg; + + if (dev->platform_data) + return dev->platform_data; + + /* + * The MAX17047 gets used on x86 where we might not have pdata, assume + * the firmware will already have initialized the fuel-gauge and provide + * default values for the non-init bits to make things work.
+ */ + pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL); + if (!pdata) + return pdata; + + if (chip->chip_type != MAXIM_DEVICE_TYPE_MAX17042) { + pdata->init_data = max17047_default_pdata_init_regs; + pdata->num_init_data = + ARRAY_SIZE(max17047_default_pdata_init_regs); + } + + ret = regmap_read(chip->regmap, MAX17042_MiscCFG, &misc_cfg); + if (ret < 0) + return NULL; + + /* If bits 0-1 are set to 3 then only Voltage readings are used */ + if ((misc_cfg & 0x3) == 0x3) + pdata->enable_current_sense = false; + else + pdata->enable_current_sense = true; + + pdata->vmin = MAX17042_DEFAULT_VMIN; + pdata->vmax = MAX17042_DEFAULT_VMAX; + pdata->temp_min = MAX17042_DEFAULT_TEMP_MIN; + pdata->temp_max = MAX17042_DEFAULT_TEMP_MAX; + + return pdata; } #endif @@ -858,20 +904,20 @@ static int max17042_probe(struct i2c_client *client, return -ENOMEM; chip->client = client; + chip->chip_type = id->driver_data; chip->regmap = devm_regmap_init_i2c(client, &max17042_regmap_config); if (IS_ERR(chip->regmap)) { dev_err(&client->dev, "Failed to initialize regmap\n"); return -EINVAL; } - chip->pdata = max17042_get_pdata(&client->dev); + chip->pdata = max17042_get_pdata(chip); if (!chip->pdata) { dev_err(&client->dev, "no platform data provided\n"); return -EINVAL; } i2c_set_clientdata(client, chip); - chip->chip_type = id->driver_data; psy_cfg.drv_data = chip; /* When current is not measured, diff --git a/include/linux/power/max17042_battery.h b/include/linux/power/max17042_battery.h index 522757ac9cd4..3489fb0f9099 100644 --- a/include/linux/power/max17042_battery.h +++ b/include/linux/power/max17042_battery.h @@ -24,8 +24,12 @@ #define __MAX17042_BATTERY_H_ #define MAX17042_STATUS_BattAbsent (1 << 3) -#define MAX17042_BATTERY_FULL (100) +#define MAX17042_BATTERY_FULL (95) /* Recommended FullSOCThr value */ #define MAX17042_DEFAULT_SNS_RESISTOR (10000) +#define MAX17042_DEFAULT_VMIN (3000) +#define MAX17042_DEFAULT_VMAX (4500) /* LiHV cell max */ +#define MAX17042_DEFAULT_TEMP_MIN (0) /* For sys without temp sensor */ +#define MAX17042_DEFAULT_TEMP_MAX (700) /* 70 degrees Celsius */ #define MAX17042_CHARACTERIZATION_DATA_SIZE 48 -- cgit v1.2.3 From a9df22c00d7c2c9c2944c62f1b819de6c214660f Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 14 Apr 2017 20:32:51 +0200 Subject: power: supply: max17042_battery: Add support for the STATUS property Userspace prefers the driver providing a status property over having to guess itself. Specifically, this will make the GNOME3 UI (and likely others) properly show discharging / charging / full status, instead of always showing discharging as status. Note that in the case where there is no charger driver supplying the max17042, a status of unknown will be returned. At least upower treats this the same as not having a status attribute, so in this case nothing changes from a userspace pov.
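With the property wired up, an in-kernel consumer can read it through the regular power-supply API; a minimal sketch (the supply name is an assumption and error handling is trimmed):

#include <linux/power_supply.h>

static int my_read_battery_status(void)
{
	union power_supply_propval val;
	struct power_supply *psy;
	int ret;

	psy = power_supply_get_by_name("max170xx_battery");	/* hypothetical name */
	if (!psy)
		return -ENODEV;

	ret = power_supply_get_property(psy, POWER_SUPPLY_PROP_STATUS, &val);
	power_supply_put(psy);

	return ret ? ret : val.intval;	/* one of the POWER_SUPPLY_STATUS_* values */
}

Userspace sees the same value as the status attribute in the supply's sysfs directory, which is what upower and the GNOME3 UI consume.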
Signed-off-by: Hans de Goede Reviewed-by: Krzysztof Kozlowski Signed-off-by: Sebastian Reichel --- drivers/power/supply/max17042_battery.c | 46 +++++++++++++++++++++++++++++++++ include/linux/power/max17042_battery.h | 3 +++ 2 files changed, 49 insertions(+) (limited to 'include/linux') diff --git a/drivers/power/supply/max17042_battery.c b/drivers/power/supply/max17042_battery.c index f0ff6e880ff2..62efe7eeb3f8 100644 --- a/drivers/power/supply/max17042_battery.c +++ b/drivers/power/supply/max17042_battery.c @@ -76,6 +76,7 @@ struct max17042_chip { }; static enum power_supply_property max17042_battery_props[] = { + POWER_SUPPLY_PROP_STATUS, POWER_SUPPLY_PROP_PRESENT, POWER_SUPPLY_PROP_CYCLE_COUNT, POWER_SUPPLY_PROP_VOLTAGE_MAX, @@ -113,6 +114,46 @@ static int max17042_get_temperature(struct max17042_chip *chip, int *temp) return 0; } +static int max17042_get_status(struct max17042_chip *chip, int *status) +{ + int ret, charge_full, charge_now; + + ret = power_supply_am_i_supplied(chip->battery); + if (ret < 0) { + *status = POWER_SUPPLY_STATUS_UNKNOWN; + return 0; + } + if (ret == 0) { + *status = POWER_SUPPLY_STATUS_DISCHARGING; + return 0; + } + + /* + * The MAX170xx has built-in end-of-charge detection and will update + * FullCAP to match RepCap when it detects end of charging. + * + * When the battery gets charged this cycle to a higher (calculated) + * capacity than in the previous cycle, FullCAP will get updated + * continuously once end-of-charge detection kicks in, so allow the + * two to differ a bit. + */ + + ret = regmap_read(chip->regmap, MAX17042_FullCAP, &charge_full); + if (ret < 0) + return ret; + + ret = regmap_read(chip->regmap, MAX17042_RepCap, &charge_now); + if (ret < 0) + return ret; + + if ((charge_full - charge_now) <= MAX17042_FULL_THRESHOLD) + *status = POWER_SUPPLY_STATUS_FULL; + else + *status = POWER_SUPPLY_STATUS_CHARGING; + + return 0; +} + static int max17042_get_battery_health(struct max17042_chip *chip, int *health) { int temp, vavg, vbatt, ret; @@ -182,6 +223,11 @@ static int max17042_get_property(struct power_supply *psy, return -EAGAIN; switch (psp) { + case POWER_SUPPLY_PROP_STATUS: + ret = max17042_get_status(chip, &val->intval); + if (ret < 0) + return ret; + break; case POWER_SUPPLY_PROP_PRESENT: ret = regmap_read(map, MAX17042_STATUS, &data); if (ret < 0) diff --git a/include/linux/power/max17042_battery.h b/include/linux/power/max17042_battery.h index 3489fb0f9099..a7ed29baf44a 100644 --- a/include/linux/power/max17042_battery.h +++ b/include/linux/power/max17042_battery.h @@ -31,6 +31,9 @@ #define MAX17042_DEFAULT_TEMP_MIN (0) /* For sys without temp sensor */ #define MAX17042_DEFAULT_TEMP_MAX (700) /* 70 degrees Celsius */ +/* Consider RepCap which is less than 10 units below FullCAP as full */ +#define MAX17042_FULL_THRESHOLD 10 + #define MAX17042_CHARACTERIZATION_DATA_SIZE 48 enum max17042_register { -- cgit v1.2.3 From 73a757e63114dfd765f1c5d1ff7e994f123d0234 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 1 May 2017 09:35:09 -0400 Subject: ring-buffer: Return reader page back into existing ring buffer When reading the ring buffer for consuming, it is optimized for splice, where a page is taken out of the ring buffer (zero copy) and sent to the reading consumer. When the read is finished with the page, it calls ring_buffer_free_read_page(), which simply frees the page.
The next time the reader needs to get a page from the ring buffer, it must call ring_buffer_alloc_read_page(), which allocates and initializes a reader page that can be swapped into the ring buffer in exchange for a newly filled page for the reader. The problem is that there's no reason to actually free the page when it is passed back to the ring buffer. It can hold on to it and reuse it for the next iteration. This completely removes the interaction with the page_alloc mechanism. Using the trace-cmd utility to record all events (causing trace-cmd to require reading lots of pages from the ring buffer, and calling ring_buffer_alloc/free_read_page() several times), and also assigning a stack trace trigger to the mm_page_alloc event, we can see how many times ring_buffer_alloc_read_page() needed to allocate a page for the ring buffer. Before this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 9968 After this change: # trace-cmd record -e all -e mem_page_alloc -R stacktrace sleep 1 # trace-cmd report |grep ring_buffer_alloc_read_page | wc -l 4 Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 40 +++++++++++++++++++++++++++++++++--- kernel/trace/ring_buffer_benchmark.c | 2 +- kernel/trace/trace.c | 17 ++++++++++----- 4 files changed, 51 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b6d4568795a7..ee9b461af095 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -185,7 +185,7 @@ size_t ring_buffer_page_len(void *page); void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu); -void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); +void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data); int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, size_t len, int cpu, int full); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 96fc3c043ad6..01b4ee5326cf 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -438,6 +438,7 @@ struct ring_buffer_per_cpu { raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; + struct buffer_data_page *free_page; unsigned long nr_pages; unsigned int current_context; struct list_head *pages; @@ -4377,9 +4378,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); */ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) { - struct buffer_data_page *bpage; + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct buffer_data_page *bpage = NULL; + unsigned long flags; struct page *page; + local_irq_save(flags); + arch_spin_lock(&cpu_buffer->lock); + + if (cpu_buffer->free_page) { + bpage = cpu_buffer->free_page; + cpu_buffer->free_page = NULL; + } + + arch_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + if (bpage) + goto out; + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); if (!page) @@ -4387,6 +4404,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) bpage = page_address(page); + out: rb_init_page(bpage); return bpage; @@ -4396,13 +4414,29 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); /** * ring_buffer_free_read_page - free an allocated read page * @buffer: the buffer the page was allocate for + * @cpu: the cpu buffer the page came from + * @data: the page
to free * * Free a page allocated from ring_buffer_alloc_read_page. */ -void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) +void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) { - free_page((unsigned long)data); + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct buffer_data_page *bpage = data; + unsigned long flags; + + local_irq_save(flags); + arch_spin_lock(&cpu_buffer->lock); + + if (!cpu_buffer->free_page) { + cpu_buffer->free_page = bpage; + bpage = NULL; + } + + arch_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + free_page((unsigned long)bpage); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index c190a4d5013c..9fbcaf567886 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -171,7 +171,7 @@ static enum event_status read_page(int cpu) } } } - ring_buffer_free_read_page(buffer, bpage); + ring_buffer_free_read_page(buffer, cpu, bpage); if (ret < 0) return EVENT_DROPPED; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 60c904fa5480..5b645b0fbbb8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6054,6 +6054,7 @@ static int tracing_clock_open(struct inode *inode, struct file *file) struct ftrace_buffer_info { struct trace_iterator iter; void *spare; + unsigned int spare_cpu; unsigned int read; }; @@ -6383,9 +6384,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, return -EBUSY; #endif - if (!info->spare) + if (!info->spare) { info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, iter->cpu_file); + info->spare_cpu = iter->cpu_file; + } if (!info->spare) return -ENOMEM; @@ -6445,7 +6448,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) __trace_array_put(iter->tr); if (info->spare) - ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); + ring_buffer_free_read_page(iter->trace_buffer->buffer, + info->spare_cpu, info->spare); kfree(info); mutex_unlock(&trace_types_lock); @@ -6456,6 +6460,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) struct buffer_ref { struct ring_buffer *buffer; void *page; + int cpu; int ref; }; @@ -6467,7 +6472,7 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, if (--ref->ref) return; - ring_buffer_free_read_page(ref->buffer, ref->page); + ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); kfree(ref); buf->private = 0; } @@ -6501,7 +6506,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) if (--ref->ref) return; - ring_buffer_free_read_page(ref->buffer, ref->page); + ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); kfree(ref); spd->partial[i].private = 0; } @@ -6566,11 +6571,13 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, kfree(ref); break; } + ref->cpu = iter->cpu_file; r = ring_buffer_read_page(ref->buffer, &ref->page, len, iter->cpu_file, 1); if (r < 0) { - ring_buffer_free_read_page(ref->buffer, ref->page); + ring_buffer_free_read_page(ref->buffer, ref->cpu, + ref->page); kfree(ref); break; } -- cgit v1.2.3 From 45d9b378e85f1b00ac047626827c68589168936c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 30 Apr 2017 21:46:45 -0700 Subject: netlink: add NULL-friendly helper for setting extended ACK message As we propagate extended ack reporting throughout various paths in the kernel it may be that the same function is 
From 45d9b378e85f1b00ac047626827c68589168936c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 30 Apr 2017 21:46:45 -0700 Subject: netlink: add NULL-friendly helper for setting extended ACK message As we propagate extended ack reporting throughout various paths in the kernel, it may be that the same function is called with the extended ack parameter passed as NULL. One place where that happens is in drivers which have a centralized reconfiguration function called both from ndos and from ethtool_ops. Add a new helper for setting the error message in such conditions. The existing helper is left as is to encourage propagating the ext ack fully wherever possible. It also makes it clear in the code which messages may be lost due to the ext ack being NULL. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/netlink.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 8d2a8924705c..c20395edf2de 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -92,6 +92,14 @@ struct netlink_ext_ack { (extack)->_msg = _msg; \ } while (0) +#define NL_MOD_TRY_SET_ERR_MSG(extack, msg) do { \ + static const char _msg[] = KBUILD_MODNAME ": " msg; \ + struct netlink_ext_ack *_extack = (extack); \ + \ + if (_extack) \ + _extack->_msg = _msg; \ +} while (0) + extern void netlink_kernel_release(struct sock *sk); extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups); extern int netlink_change_ngroups(struct sock *sk, unsigned int groups); -- cgit v1.2.3
From ddf9f970764f4390aba767e77fddaaced4a6760d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 30 Apr 2017 21:46:46 -0700 Subject: xdp: propagate extended ack to XDP setup Drivers usually have a number of restrictions for running XDP - the most common being buffer sizes, LRO and the number of rings. Even though some drivers try to be helpful and print error messages, experience shows that users don't often consult kernel logs on netlink errors. Try to use the new extended ack mechanism to carry the message back to user space. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 10 ++++++++-- net/core/dev.c | 5 ++++- net/core/rtnetlink.c | 13 ++++++++----- 3 files changed, 20 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6847714a5ae3..9c23bd2efb56 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -813,11 +813,16 @@ enum xdp_netdev_command { XDP_QUERY_PROG, }; +struct netlink_ext_ack; + struct netdev_xdp { enum xdp_netdev_command command; union { /* XDP_SETUP_PROG */ - struct bpf_prog *prog; + struct { + struct bpf_prog *prog; + struct netlink_ext_ack *extack; + }; /* XDP_QUERY_PROG */ bool prog_attached; }; @@ -3291,7 +3296,8 @@ int dev_get_phys_port_id(struct net_device *dev, int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len); int dev_change_proto_down(struct net_device *dev, bool proto_down); -int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags); +int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, + int fd, u32 flags); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); diff --git a/net/core/dev.c b/net/core/dev.c index 8371a01eee87..35a06cebb282 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6854,12 +6854,14 @@ EXPORT_SYMBOL(dev_change_proto_down); /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device + * @extact: netlink extended ack * @fd: new program fd or negative value to clear * @flags: xdp-related flags * * Set or clear a bpf program for a device */ -int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) +int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, + int fd, u32 flags) { int (*xdp_op)(struct net_device *dev, struct netdev_xdp *xdp); const struct net_device_ops *ops = dev->netdev_ops; @@ -6892,6 +6894,7 @@ int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_SETUP_PROG; + xdp.extack = extack; xdp.prog = prog; err = xdp_op(dev, &xdp); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9031a6c8bfa7..6e67315ec368 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1919,6 +1919,7 @@ static int do_set_master(struct net_device *dev, int ifindex) #define DO_SETLINK_NOTIFY 0x03 static int do_setlink(const struct sk_buff *skb, struct net_device *dev, struct ifinfomsg *ifm, + struct netlink_ext_ack *extack, struct nlattr **tb, char *ifname, int status) { const struct net_device_ops *ops = dev->netdev_ops; @@ -2201,7 +2202,7 @@ static int do_setlink(const struct sk_buff *skb, } if (xdp[IFLA_XDP_FD]) { - err = dev_change_xdp_fd(dev, + err = dev_change_xdp_fd(dev, extack, nla_get_s32(xdp[IFLA_XDP_FD]), xdp_flags); if (err) @@ -2261,7 +2262,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; - err = do_setlink(skb, dev, ifm, tb, ifname, 0); + err = do_setlink(skb, dev, ifm, extack, tb, ifname, 0); errout: return err; } @@ -2423,6 +2424,7 @@ EXPORT_SYMBOL(rtnl_create_link); static int rtnl_group_changelink(const struct sk_buff *skb, struct net *net, int group, struct ifinfomsg *ifm, + struct netlink_ext_ack *extack, struct nlattr **tb) { struct net_device *dev, *aux; @@ -2430,7 +2432,7 @@ static int rtnl_group_changelink(const struct sk_buff *skb, for_each_netdev_safe(net, dev, aux) { if (dev->group == 
group) { - err = do_setlink(skb, dev, ifm, tb, NULL, 0); + err = do_setlink(skb, dev, ifm, extack, tb, NULL, 0); if (err < 0) return err; } @@ -2576,14 +2578,15 @@ replay: status |= DO_SETLINK_NOTIFY; } - return do_setlink(skb, dev, ifm, tb, ifname, status); + return do_setlink(skb, dev, ifm, extack, tb, ifname, + status); } if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) return rtnl_group_changelink(skb, net, nla_get_u32(tb[IFLA_GROUP]), - ifm, tb); + ifm, extack, tb); return -ENODEV; } -- cgit v1.2.3
From 20b1bd96e9f4feeffc9206284df3c6a4438e9ca8 Mon Sep 17 00:00:00 2001 From: Ram Amrani Date: Sun, 30 Apr 2017 11:49:10 +0300 Subject: qed: output the DPM status and WID count Report to the RDMA driver whether DPM mode is enabled or disabled in the HW and, if so, how many WIDs it supports. Signed-off-by: Ram Amrani Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed.h | 1 + drivers/net/ethernet/qlogic/qed/qed_dev.c | 4 +++- drivers/net/ethernet/qlogic/qed/qed_roce.c | 4 ++++ include/linux/qed/qed_roce_if.h | 2 ++ 4 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h index edf3b68bf935..2ab1aab7c3fe 100644 --- a/drivers/net/ethernet/qlogic/qed/qed.h +++ b/drivers/net/ethernet/qlogic/qed/qed.h @@ -526,6 +526,7 @@ struct qed_hwfn { struct dbg_tools_data dbg_info; /* PWM region specific data */ + u16 wid_count; u32 dpi_size; u32 dpi_count; diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index c478e079b039..5f31140d0b77 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -1354,7 +1354,7 @@ qed_hw_init_pf_doorbell_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) { u32 pwm_regsize, norm_regsize; u32 non_pwm_conn, min_addr_reg1; - u32 db_bar_size, n_cpus; + u32 db_bar_size, n_cpus = 1; u32 roce_edpm_mode; u32 pf_dems_shift; int rc = 0; @@ -1415,6 +1415,8 @@ qed_hw_init_pf_doorbell_bar(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) qed_rdma_dpm_bar(p_hwfn, p_ptt); } + p_hwfn->wid_count = (u16) n_cpus; + DP_INFO(p_hwfn, "doorbell bar: normal_region_size=%d, pwm_region_size=%d, dpi_size=%d, dpi_count=%d, roce_edpm=%s\n", norm_regsize, diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c b/drivers/net/ethernet/qlogic/qed/qed_roce.c index 53f285e40e3d..56289d7cd306 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_roce.c +++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c @@ -784,6 +784,7 @@ static int qed_rdma_add_user(void *rdma_cxt, ((out_params->dpi) * p_hwfn->dpi_size); out_params->dpi_size = p_hwfn->dpi_size; + out_params->wid_count = p_hwfn->wid_count; DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "Adding user - done, rc = %d\n", rc); return rc; @@ -856,9 +857,12 @@ static void qed_rdma_cnq_prod_update(void *rdma_cxt, u8 qz_offset, u16 prod) static int qed_fill_rdma_dev_info(struct qed_dev *cdev, struct qed_dev_rdma_info *info) { + struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); + memset(info, 0, sizeof(*info)); info->rdma_type = QED_RDMA_TYPE_ROCE; + info->user_dpm_enabled = (p_hwfn->db_bar_no_edpm == 0); qed_fill_dev_info(cdev, &info->common); diff --git a/include/linux/qed/qed_roce_if.h b/include/linux/qed/qed_roce_if.h index f742d4312c9d..cbb2ff0ce4bc 100644 --- a/include/linux/qed/qed_roce_if.h +++ b/include/linux/qed/qed_roce_if.h @@ -240,6 +240,7 @@ struct qed_rdma_add_user_out_params { u64
dpi_addr; u64 dpi_phys_addr; u32 dpi_size; + u16 wid_count; }; enum roce_mode { @@ -533,6 +534,7 @@ enum qed_rdma_type { struct qed_dev_rdma_info { struct qed_dev_info common; enum qed_rdma_type rdma_type; + u8 user_dpm_enabled; }; struct qed_rdma_ops { -- cgit v1.2.3
From 133bea04ff6fd715d8140edca9d6c7337249571b Mon Sep 17 00:00:00 2001 From: Tim Wright Date: Mon, 1 May 2017 17:30:08 +0100 Subject: IB/mlx5: Add port_xmit_wait to counter registers read Add port_xmit_wait to the error counters read by mlx5_ib_process_mad to ensure the sysfs port counter provides the correct value for PortXmitWait. Otherwise the sysfs port_xmit_wait file always contains zero. The previous MAD_IFC implementation populated this counter, but it was removed during the migration to PPCNT for error counters (32-bit only). Signed-off-by: Tim Wright Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mad.c | 2 ++ include/linux/mlx5/mlx5_ifc.h | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 8dacb49eabd9..f1b56de64871 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -187,6 +187,8 @@ static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt, port_xmit_discards); MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_xmit_constraint_errors, port_xmit_constraint_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_xmit_wait, + port_xmit_wait); MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_constraint_errors, port_rcv_constraint_errors); MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_overrun_errors, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 954f42c268a4..32de0724b400 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1456,7 +1456,9 @@ struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits { u8 vl_15_dropped[0x10]; - u8 reserved_at_a0[0xa0]; + u8 reserved_at_a0[0x80]; + + u8 port_xmit_wait[0x20]; }; struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits { -- cgit v1.2.3
From 48e75b430670ebdbb00ba008e1d3690f61ab9824 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 1 May 2017 22:18:01 +0200 Subject: rhashtable: compact struct rhashtable_params By using smaller datatypes this (rather large) struct shrinks considerably (80 -> 48 bytes on x86_64). As this is embedded in other structs, this also reduces the size of several others, e.g. cls_fl_head or nft_hash. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller
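The saving is ordinary C structure layout at work. A standalone user-space sketch that approximates the before/after layouts (plain function pointers stand in for the rht_hashfn_t/rht_obj_hashfn_t/rht_obj_cmpfn_t callback types; sizes assume x86_64/LP64):

	#include <stdio.h>
	#include <stdint.h>

	struct params_old {		/* four size_t fields: 8 bytes each */
		size_t nelem_hint, key_len, key_offset, head_offset;
		unsigned int max_size;
		unsigned int min_size;
		uint32_t nulls_base;
		_Bool automatic_shrinking;	/* 3 pad bytes follow */
		size_t locks_mul;
		void (*hashfn)(void), (*obj_hashfn)(void), (*obj_cmpfn)(void);
	};

	struct params_new {		/* u16 is plenty for hints and offsets */
		uint16_t nelem_hint, key_len, key_offset, head_offset;
		unsigned int max_size;
		uint16_t min_size;
		_Bool automatic_shrinking;
		uint8_t locks_mul;
		uint32_t nulls_base;	/* scalars pack with no holes; 4 pad bytes before the pointers */
		void (*hashfn)(void), (*obj_hashfn)(void), (*obj_cmpfn)(void);
	};

	int main(void)
	{
		printf("old=%zu new=%zu\n",
		       sizeof(struct params_old), sizeof(struct params_new));
		return 0;
	}

Compiled on x86_64 this prints old=80 new=48, matching the commit message: the shrink comes from narrowing the size_t fields and reordering so the small fields pack together.
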
--- include/linux/rhashtable.h | 18 +++++++++--------- lib/rhashtable.c | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 45f89369c4c8..7d56a7ea2b2e 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -127,23 +127,23 @@ struct rhashtable; * @head_offset: Offset of rhash_head in struct to be hashed * @max_size: Maximum size while expanding * @min_size: Minimum size while shrinking - * @nulls_base: Base value to generate nulls marker - * @automatic_shrinking: Enable automatic shrinking of tables * @locks_mul: Number of bucket locks to allocate per cpu (default: 128) + * @automatic_shrinking: Enable automatic shrinking of tables + * @nulls_base: Base value to generate nulls marker * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) * @obj_hashfn: Function to hash object * @obj_cmpfn: Function to compare key with object */ struct rhashtable_params { - size_t nelem_hint; - size_t key_len; - size_t key_offset; - size_t head_offset; + u16 nelem_hint; + u16 key_len; + u16 key_offset; + u16 head_offset; unsigned int max_size; - unsigned int min_size; - u32 nulls_base; + u16 min_size; bool automatic_shrinking; - size_t locks_mul; + u8 locks_mul; + u32 nulls_base; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; rht_obj_cmpfn_t obj_cmpfn; diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 3895486ef551..a930e436db5d 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -967,7 +967,7 @@ int rhashtable_init(struct rhashtable *ht, ht->max_elems = ht->p.max_size * 2; } - ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE); + ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); if (params->nelem_hint) size = rounded_hashtable_size(&ht->p); -- cgit v1.2.3
From 7ed8578a96ad98231d8bf6388f776e034673e18a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Apr 2017 09:40:37 +0200 Subject: dm rq: change ->rq_end_io calling conventions Instead of returning either a DM_ENDIO_* constant or an error code, add a new DM_ENDIO_DONE value that means keep errno as is. This allows us to easily keep the existing error code in cases where we can't push back, and it also prepares for the new block level status codes with strict type checking.
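Under the new convention, a request-based target's end_io handler looks roughly like this (a hypothetical target using the dm_request_endio_fn signature; example_can_retry() is a made-up predicate):

	static int example_end_io(struct dm_target *ti, struct request *clone,
				  int error, union map_info *map_context)
	{
		/* Retry path failures by requeueing the original request. */
		if (error && example_can_retry(ti))	/* hypothetical predicate */
			return DM_ENDIO_REQUEUE;

		/* Otherwise complete the I/O with 'error' untouched. */
		return DM_ENDIO_DONE;
	}

The core now decides how to finish the clone from the DM_ENDIO_* value alone, while the original block-layer errno travels separately.
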
Signed-off-by: Christoph Hellwig Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 13 +++++++++---- drivers/md/dm-rq.c | 17 ++++++++++------- include/linux/device-mapper.h | 1 + 3 files changed, 20 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 8b394a08d427..926a6bcb32c8 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1469,6 +1469,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, { struct dm_mpath_io *mpio = get_mpio(map_context); struct pgpath *pgpath = mpio->pgpath; + int r = DM_ENDIO_DONE; /* * We don't queue any clone request inside the multipath target @@ -1484,14 +1485,18 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, if (error && !noretry_error(error)) { struct multipath *m = ti->private; - error = DM_ENDIO_REQUEUE; + r = DM_ENDIO_REQUEUE; if (pgpath) fail_path(pgpath); if (atomic_read(&m->nr_valid_paths) == 0 && - !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) - error = dm_report_EIO(m); + !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + if (error == -EIO) + error = dm_report_EIO(m); + /* complete with the original error */ + r = DM_ENDIO_DONE; + } } if (pgpath) { @@ -1501,7 +1506,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); } - return error; + return r; } static int do_end_io_bio(struct multipath *m, struct bio *clone, diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index d445b712970b..920e854caba9 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -287,7 +287,7 @@ static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_ static void dm_done(struct request *clone, int error, bool mapped) { - int r = error; + int r = DM_ENDIO_DONE; struct dm_rq_target_io *tio = clone->end_io_data; dm_request_endio_fn rq_end_io = NULL; @@ -298,7 +298,7 @@ static void dm_done(struct request *clone, int error, bool mapped) r = rq_end_io(tio->ti, clone, error, &tio->info); } - if (unlikely(r == -EREMOTEIO)) { + if (unlikely(error == -EREMOTEIO)) { if (req_op(clone) == REQ_OP_WRITE_SAME && !clone->q->limits.max_write_same_sectors) disable_write_same(tio->md); @@ -307,16 +307,19 @@ static void dm_done(struct request *clone, int error, bool mapped) disable_write_zeroes(tio->md); } - if (r <= 0) + switch (r) { + case DM_ENDIO_DONE: /* The target wants to complete the I/O */ - dm_end_request(clone, r); - else if (r == DM_ENDIO_INCOMPLETE) + dm_end_request(clone, error); + break; + case DM_ENDIO_INCOMPLETE: /* The target will handle the I/O */ return; - else if (r == DM_ENDIO_REQUEUE) + case DM_ENDIO_REQUEUE: /* The target wants to requeue the I/O */ dm_requeue_original_request(tio, false); - else { + break; + default: DMWARN("unimplemented target endio return value: %d", r); BUG(); } diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 925b63cdef52..5a02fc0ff311 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -593,6 +593,7 @@ extern struct ratelimit_state dm_ratelimit_state; /* * Definitions of return values from target end_io function. 
*/ +#define DM_ENDIO_DONE 0 #define DM_ENDIO_INCOMPLETE 1 #define DM_ENDIO_REQUEUE 2 -- cgit v1.2.3
From 412445acb6cad4cef026daae37c4765fb9942c60 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 26 Apr 2017 09:40:39 +0200 Subject: dm: introduce a new DM_MAPIO_KILL return value This untangles the DM_MAPIO_* values returned from ->clone_and_map_rq from the error codes used by the block layer. Signed-off-by: Christoph Hellwig Signed-off-by: Mike Snitzer --- drivers/md/dm-rq.c | 13 +++++------- drivers/md/dm-target.c | 2 +- include/linux/device-mapper.h | 1 + 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 920e854caba9..a48130b90157 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -504,14 +504,13 @@ static int map_request(struct dm_rq_target_io *tio) /* The target wants to requeue the I/O after a delay */ dm_requeue_original_request(tio, true); break; - default: - if (r > 0) { - DMWARN("unimplemented target map return value: %d", r); - BUG(); - } - + case DM_MAPIO_KILL: /* The target wants to complete the I/O */ - dm_kill_unmapped_request(rq, r); + dm_kill_unmapped_request(rq, -EIO); + break; + default: + DMWARN("unimplemented target map return value: %d", r); + BUG(); } return r; diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 43d3445b121d..6264ff00dcf0 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -135,7 +135,7 @@ static int io_err_clone_and_map_rq(struct dm_target *ti, struct request *rq, union map_info *map_context, struct request **clone) { - return -EIO; + return DM_MAPIO_KILL; } static void io_err_release_clone_rq(struct request *clone) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 5a02fc0ff311..78ad0624cdae 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -604,6 +604,7 @@ extern struct ratelimit_state dm_ratelimit_state; #define DM_MAPIO_REMAPPED 1 #define DM_MAPIO_REQUEUE DM_ENDIO_REQUEUE #define DM_MAPIO_DELAY_REQUEUE 3 +#define DM_MAPIO_KILL 4 #define dm_sector_div64(x, y)( \ { \ -- cgit v1.2.3
From 5c0aea0e8d98e38858fbb3a09870ed8487a01da2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 28 Apr 2017 17:06:20 +0200 Subject: KVM: x86: don't hold kvm->lock in KVM_SET_GSI_ROUTING MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We needed the lock to avoid racing with creation of the irqchip on x86. As kvm_set_irq_routing() calls synchronize_srcu_expedited(), this lock might be held for a longer time. Let's introduce an arch-specific callback to check if we can actually add irq routes. For x86, all we have to do is check if we have an irqchip in the kernel. We don't need kvm->lock at that point, as the irqchip is marked as initialized only when actually fully created.
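Condensed from the hunks below, the new hook is the usual weak-default-plus-arch-override pattern:

	/* virt/kvm/irqchip.c: generic default - routes may always be set */
	bool __weak kvm_arch_can_set_irq_routing(struct kvm *kvm)
	{
		return true;
	}

	/* arch/x86/kvm/irq_comm.c: x86 override - require an in-kernel irqchip */
	bool kvm_arch_can_set_irq_routing(struct kvm *kvm)
	{
		return irqchip_in_kernel(kvm);
	}

kvm_vm_ioctl() calls the hook before parsing any routes, so on x86 this check replaces taking kvm->lock.
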
Reported-by: Steve Rutherford Reviewed-by: Radim Krčmář Fixes: 1df6ddede10a ("KVM: x86: race between KVM_SET_GSI_ROUTING and KVM_CREATE_IRQCHIP") Signed-off-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/irq.h | 2 +- arch/x86/kvm/irq_comm.c | 15 +++++++++------ arch/x86/kvm/x86.c | 11 +---------- include/linux/kvm_host.h | 1 + virt/kvm/irqchip.c | 5 +++++ virt/kvm/kvm_main.c | 5 ++--- 7 files changed, 19 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 84c8489531bb..f5bddf92faba 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -728,7 +728,6 @@ struct kvm_hv { enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, - KVM_IRQCHIP_INIT_IN_PROGRESS, /* temporarily set during creation */ KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ }; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 0edd22c3344c..d5005cc26521 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -111,7 +111,7 @@ static inline int irqchip_in_kernel(struct kvm *kvm) /* Matches smp_wmb() when setting irqchip_mode */ smp_rmb(); - return mode > KVM_IRQCHIP_INIT_IN_PROGRESS; + return mode != KVM_IRQCHIP_NONE; } void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 4517a4c2ac3a..3cc3b2d130a0 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -274,16 +274,19 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, srcu_read_unlock(&kvm->irq_srcu, idx); } +bool kvm_arch_can_set_irq_routing(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm); +} + int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { - /* also allow creation of routes during KVM_IRQCHIP_INIT_IN_PROGRESS */ - if (kvm->arch.irqchip_mode == KVM_IRQCHIP_NONE) - return -EINVAL; - - /* Matches smp_wmb() when setting irqchip_mode */ - smp_rmb(); + /* We can't check irqchip_in_kernel() here as some callers are + * currently inititalizing the irqchip. Other callers should therefore + * check kvm_arch_can_set_irq_routing() before calling this function. + */ switch (ue->type) { case KVM_IRQ_ROUTING_IRQCHIP: if (irqchip_split(kvm)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index be2ade58edb9..2fe9aa116288 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3919,14 +3919,9 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, goto split_irqchip_unlock; if (kvm->created_vcpus) goto split_irqchip_unlock; - kvm->arch.irqchip_mode = KVM_IRQCHIP_INIT_IN_PROGRESS; r = kvm_setup_empty_irq_routing(kvm); - if (r) { - kvm->arch.irqchip_mode = KVM_IRQCHIP_NONE; - /* Pairs with smp_rmb() when reading irqchip_mode */ - smp_wmb(); + if (r) goto split_irqchip_unlock; - } /* Pairs with irqchip_in_kernel. 
*/ smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; @@ -4012,12 +4007,8 @@ long kvm_arch_vm_ioctl(struct file *filp, goto create_irqchip_unlock; } - kvm->arch.irqchip_mode = KVM_IRQCHIP_INIT_IN_PROGRESS; r = kvm_setup_default_irq_routing(kvm); if (r) { - kvm->arch.irqchip_mode = KVM_IRQCHIP_NONE; - /* Pairs with smp_rmb() when reading irqchip_mode */ - smp_wmb(); kvm_ioapic_destroy(kvm); kvm_pic_destroy(kvm); goto create_irqchip_unlock; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a5bfffa8c8d4..25cf258a1c9b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1018,6 +1018,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) #define KVM_MAX_IRQ_ROUTES 1024 #endif +bool kvm_arch_can_set_irq_routing(struct kvm *kvm); int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, unsigned nr, diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index cc30d01a56be..31e40c9e81df 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -172,6 +172,11 @@ void __attribute__((weak)) kvm_arch_irq_routing_update(struct kvm *kvm) { } +bool __weak kvm_arch_can_set_irq_routing(struct kvm *kvm) +{ + return true; +} + int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *ue, unsigned nr, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 035bc51f656f..6281cc2446d5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3075,6 +3075,8 @@ static long kvm_vm_ioctl(struct file *filp, if (copy_from_user(&routing, argp, sizeof(routing))) goto out; r = -EINVAL; + if (!kvm_arch_can_set_irq_routing(kvm)) + goto out; if (routing.nr > KVM_MAX_IRQ_ROUTES) goto out; if (routing.flags) @@ -3090,11 +3092,8 @@ static long kvm_vm_ioctl(struct file *filp, routing.nr * sizeof(*entries))) goto out_free_irq_routing; } - /* avoid races with KVM_CREATE_IRQCHIP on x86 */ - mutex_lock(&kvm->lock); r = kvm_set_irq_routing(kvm, entries, routing.nr, routing.flags); - mutex_unlock(&kvm->lock); out_free_irq_routing: vfree(entries); break; -- cgit v1.2.3
From 45753c5f315749711b935a2506ee5c10eef5c23d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 2 May 2017 10:31:18 +0200 Subject: srcu: Debloat the <linux/rcu_segcblist.h> header Linus noticed that the <linux/rcu_segcblist.h> header has huge inline functions which should not be inline at all. As a first step in cleaning this up, move them all to kernel/rcu/ and only keep an absolute minimum of data type defines in the header: before: -rw-r--r-- 1 mingo mingo 22284 May 2 10:25 include/linux/rcu_segcblist.h after: -rw-r--r-- 1 mingo mingo 3180 May 2 10:22 include/linux/rcu_segcblist.h More can be done, such as uninlining the large functions, whose inlining is unjustified even if it is an RCU-internal matter. Reported-by: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 628 +--------------------------------------- kernel/rcu/rcu_segcblist.h | 645 ++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/srcutiny.c | 1 + kernel/rcu/srcutree.c | 1 + kernel/rcu/tree.h | 3 +- 5 files changed, 652 insertions(+), 626 deletions(-) create mode 100644 kernel/rcu/rcu_segcblist.h (limited to 'include/linux') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index ced8f313fd05..ba4d2621d9ca 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -20,8 +20,8 @@ * Authors: Paul E.
McKenney */ -#ifndef __KERNEL_RCU_SEGCBLIST_H -#define __KERNEL_RCU_SEGCBLIST_H +#ifndef __INCLUDE_LINUX_RCU_SEGCBLIST_H +#define __INCLUDE_LINUX_RCU_SEGCBLIST_H /* Simple unsegmented callback lists. */ struct rcu_cblist { @@ -33,102 +33,6 @@ struct rcu_cblist { #define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail = &n.head } -/* Initialize simple callback list. */ -static inline void rcu_cblist_init(struct rcu_cblist *rclp) -{ - rclp->head = NULL; - rclp->tail = &rclp->head; - rclp->len = 0; - rclp->len_lazy = 0; -} - -/* Is simple callback list empty? */ -static inline bool rcu_cblist_empty(struct rcu_cblist *rclp) -{ - return !rclp->head; -} - -/* Return number of callbacks in simple callback list. */ -static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) -{ - return rclp->len; -} - -/* Return number of lazy callbacks in simple callback list. */ -static inline long rcu_cblist_n_lazy_cbs(struct rcu_cblist *rclp) -{ - return rclp->len_lazy; -} - -/* - * Debug function to actually count the number of callbacks. - * If the number exceeds the limit specified, return -1. - */ -static inline long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) -{ - int cnt = 0; - struct rcu_head **rhpp = &rclp->head; - - for (;;) { - if (!*rhpp) - return cnt; - if (++cnt > lim) - return -1; - rhpp = &(*rhpp)->next; - } -} - -/* - * Dequeue the oldest rcu_head structure from the specified callback - * list. This function assumes that the callback is non-lazy, but - * the caller can later invoke rcu_cblist_dequeued_lazy() if it - * finds otherwise (and if it cares about laziness). This allows - * different users to have different ways of determining laziness. - */ -static inline struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) -{ - struct rcu_head *rhp; - - rhp = rclp->head; - if (!rhp) - return NULL; - rclp->len--; - rclp->head = rhp->next; - if (!rclp->head) - rclp->tail = &rclp->head; - return rhp; -} - -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. - */ -static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) -{ - rclp->len_lazy--; -} - -/* - * Interim function to return rcu_cblist head pointer. Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) -{ - return rclp->head; -} - -/* - * Interim function to return rcu_cblist head pointer. Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) -{ - WARN_ON_ONCE(rcu_cblist_empty(rclp)); - return rclp->tail; -} - /* Complicated segmented callback lists. ;-) */ /* @@ -183,530 +87,4 @@ struct rcu_segcblist { .tails[RCU_NEXT_TAIL] = &n.head, \ } -/* - * Initialize an rcu_segcblist structure. - */ -static inline void rcu_segcblist_init(struct rcu_segcblist *rsclp) -{ - int i; - - BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); - BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); - rsclp->head = NULL; - for (i = 0; i < RCU_CBLIST_NSEGS; i++) - rsclp->tails[i] = &rsclp->head; - rsclp->len = 0; - rsclp->len_lazy = 0; -} - -/* - * Is the specified rcu_segcblist structure empty? - * - * But careful! The fact that the ->head field is NULL does not - * necessarily imply that there are no callbacks associated with - * this structure. 
When callbacks are being invoked, they are - * removed as a group. If callback invocation must be preempted, - * the remaining callbacks will be added back to the list. Either - * way, the counts are updated later. - * - * So it is often the case that rcu_segcblist_n_cbs() should be used - * instead. - */ -static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) -{ - return !rsclp->head; -} - -/* Return number of callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) -{ - return READ_ONCE(rsclp->len); -} - -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) -{ - return rsclp->len_lazy; -} - -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) -{ - return rsclp->len - rsclp->len_lazy; -} - -/* - * Is the specified rcu_segcblist enabled, for example, not corresponding - * to an offline or callback-offloaded CPU? - */ -static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) -{ - return !!rsclp->tails[RCU_NEXT_TAIL]; -} - -/* - * Disable the specified rcu_segcblist structure, so that callbacks can - * no longer be posted to it. This structure must be empty. - */ -static inline void rcu_segcblist_disable(struct rcu_segcblist *rsclp) -{ - WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); - WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); - WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); - rsclp->tails[RCU_NEXT_TAIL] = NULL; -} - -/* - * Is the specified segment of the specified rcu_segcblist structure - * empty of callbacks? - */ -static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) -{ - if (seg == RCU_DONE_TAIL) - return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; - return rsclp->tails[seg - 1] == rsclp->tails[seg]; -} - -/* - * Are all segments following the specified segment of the specified - * rcu_segcblist structure empty of callbacks? (The specified - * segment might well contain callbacks.) - */ -static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) -{ - return !*rsclp->tails[seg]; -} - -/* - * Does the specified rcu_segcblist structure contain callbacks that - * are ready to be invoked? - */ -static inline bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_is_enabled(rsclp) && - &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; -} - -/* - * Does the specified rcu_segcblist structure contain callbacks that - * are still pending, that is, not yet ready to be invoked? - */ -static inline bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_is_enabled(rsclp) && - !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); -} - -/* - * Dequeue and return the first ready-to-invoke callback. If there - * are no ready-to-invoke callbacks, return NULL. Disables interrupts - * to avoid interference. Does not protect from interference from other - * CPUs or tasks. 
- */ -static inline struct rcu_head * -rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) -{ - unsigned long flags; - int i; - struct rcu_head *rhp; - - local_irq_save(flags); - if (!rcu_segcblist_ready_cbs(rsclp)) { - local_irq_restore(flags); - return NULL; - } - rhp = rsclp->head; - BUG_ON(!rhp); - rsclp->head = rhp->next; - for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { - if (rsclp->tails[i] != &rhp->next) - break; - rsclp->tails[i] = &rsclp->head; - } - smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ - WRITE_ONCE(rsclp->len, rsclp->len - 1); - local_irq_restore(flags); - return rhp; -} - -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. - */ -static inline void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) -{ - unsigned long flags; - - local_irq_save(flags); - rsclp->len_lazy--; - local_irq_restore(flags); -} - -/* - * Return a pointer to the first callback in the specified rcu_segcblist - * structure. This is useful for diagnostics. - */ -static inline struct rcu_head * -rcu_segcblist_first_cb(struct rcu_segcblist *rsclp) -{ - if (rcu_segcblist_is_enabled(rsclp)) - return rsclp->head; - return NULL; -} - -/* - * Return a pointer to the first pending callback in the specified - * rcu_segcblist structure. This is useful just after posting a given - * callback -- if that callback is the first pending callback, then - * you cannot rely on someone else having already started up the required - * grace period. - */ -static inline struct rcu_head * -rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) -{ - if (rcu_segcblist_is_enabled(rsclp)) - return *rsclp->tails[RCU_DONE_TAIL]; - return NULL; -} - -/* - * Does the specified rcu_segcblist structure contain callbacks that - * have not yet been processed beyond having been posted, that is, - * does it contain callbacks in its last segment? - */ -static inline bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_is_enabled(rsclp) && - !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); -} - -/* - * Enqueue the specified callback onto the specified rcu_segcblist - * structure, updating accounting as needed. Note that the ->len - * field may be accessed locklessly, hence the WRITE_ONCE(). - * The ->len field is used by rcu_barrier() and friends to determine - * if it must post a callback on this structure, and it is OK - * for rcu_barrier() to sometimes post callbacks needlessly, but - * absolutely not OK for it to ever miss posting a callback. - */ -static inline void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy) -{ - WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ - if (lazy) - rsclp->len_lazy++; - smp_mb(); /* Ensure counts are updated before callback is enqueued. */ - rhp->next = NULL; - *rsclp->tails[RCU_NEXT_TAIL] = rhp; - rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; -} - -/* - * Entrain the specified callback onto the specified rcu_segcblist at - * the end of the last non-empty segment. If the entire rcu_segcblist - * is empty, make no change, but return false. - * - * This is intended for use by rcu_barrier()-like primitives, -not- - * for normal grace-period use. IMPORTANT: The callback you enqueue - * will wait for all prior callbacks, NOT necessarily for a grace - * period. You have been warned. 
- */ -static inline bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy) -{ - int i; - - if (rcu_segcblist_n_cbs(rsclp) == 0) - return false; - WRITE_ONCE(rsclp->len, rsclp->len + 1); - if (lazy) - rsclp->len_lazy++; - smp_mb(); /* Ensure counts are updated before callback is entrained. */ - rhp->next = NULL; - for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) - if (rsclp->tails[i] != rsclp->tails[i - 1]) - break; - *rsclp->tails[i] = rhp; - for (; i <= RCU_NEXT_TAIL; i++) - rsclp->tails[i] = &rhp->next; - return true; -} - -/* - * Extract only the counts from the specified rcu_segcblist structure, - * and place them in the specified rcu_cblist structure. This function - * supports both callback orphaning and invocation, hence the separation - * of counts and callbacks. (Callbacks ready for invocation must be - * orphaned and adopted separately from pending callbacks, but counts - * apply to all callbacks. Locking must be used to make sure that - * both orphaned-callbacks lists are consistent.) - */ -static inline void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - rclp->len_lazy += rsclp->len_lazy; - rclp->len += rsclp->len; - rsclp->len_lazy = 0; - WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ -} - -/* - * Extract only those callbacks ready to be invoked from the specified - * rcu_segcblist structure and place them in the specified rcu_cblist - * structure. - */ -static inline void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - int i; - - if (!rcu_segcblist_ready_cbs(rsclp)) - return; /* Nothing to do. */ - *rclp->tail = rsclp->head; - rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; - *rsclp->tails[RCU_DONE_TAIL] = NULL; - rclp->tail = rsclp->tails[RCU_DONE_TAIL]; - for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) - if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) - rsclp->tails[i] = &rsclp->head; -} - -/* - * Extract only those callbacks still pending (not yet ready to be - * invoked) from the specified rcu_segcblist structure and place them in - * the specified rcu_cblist structure. Note that this loses information - * about any callbacks that might have been partway done waiting for - * their grace period. Too bad! They will have to start over. - */ -static inline void -rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - int i; - - if (!rcu_segcblist_pend_cbs(rsclp)) - return; /* Nothing to do. */ - *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; - rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; - *rsclp->tails[RCU_DONE_TAIL] = NULL; - for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) - rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; -} - -/* - * Move the entire contents of the specified rcu_segcblist structure, - * counts, callbacks, and all, to the specified rcu_cblist structure. - * @@@ Why do we need this??? Moving early-boot CBs to NOCB lists? - * @@@ Memory barrier needed? (Not if only used at boot time...) - */ -static inline void rcu_segcblist_extract_all(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - rcu_segcblist_extract_done_cbs(rsclp, rclp); - rcu_segcblist_extract_pend_cbs(rsclp, rclp); - rcu_segcblist_extract_count(rsclp, rclp); -} - -/* - * Insert counts from the specified rcu_cblist structure in the - * specified rcu_segcblist structure. 
- */ -static inline void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - rsclp->len_lazy += rclp->len_lazy; - /* ->len sampled locklessly. */ - WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); - rclp->len_lazy = 0; - rclp->len = 0; -} - -/* - * Move callbacks from the specified rcu_cblist to the beginning of the - * done-callbacks segment of the specified rcu_segcblist. - */ -static inline void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - int i; - - if (!rclp->head) - return; /* No callbacks to move. */ - *rclp->tail = rsclp->head; - rsclp->head = rclp->head; - for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) - if (&rsclp->head == rsclp->tails[i]) - rsclp->tails[i] = rclp->tail; - else - break; - rclp->head = NULL; - rclp->tail = &rclp->head; -} - -/* - * Move callbacks from the specified rcu_cblist to the end of the - * new-callbacks segment of the specified rcu_segcblist. - */ -static inline void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - if (!rclp->head) - return; /* Nothing to do. */ - *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; - rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; - rclp->head = NULL; - rclp->tail = &rclp->head; -} - -/* - * Advance the callbacks in the specified rcu_segcblist structure based - * on the current value passed in for the grace-period counter. - */ -static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i, j; - - WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); - if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) - return; - - /* - * Find all callbacks whose ->gp_seq numbers indicate that they - * are ready to invoke, and put them into the RCU_DONE_TAIL segment. - */ - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { - if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) - break; - rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; - } - - /* If no callbacks moved, nothing more need be done. */ - if (i == RCU_WAIT_TAIL) - return; - - /* Clean up tail pointers that might have been misordered above. */ - for (j = RCU_WAIT_TAIL; j < i; j++) - rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; - - /* - * Callbacks moved, so clean up the misordered ->tails[] pointers - * that now point into the middle of the list of ready-to-invoke - * callbacks. The overall effect is to copy down the later pointers - * into the gap that was created by the now-ready segments. - */ - for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { - if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) - break; /* No more callbacks. */ - rsclp->tails[j] = rsclp->tails[i]; - rsclp->gp_seq[j] = rsclp->gp_seq[i]; - } -} - -/* - * "Accelerate" callbacks based on more-accurate grace-period information. - * The reason for this is that RCU does not synchronize the beginnings and - * ends of grace periods, and that callbacks are posted locally. This in - * turn means that the callbacks must be labelled conservatively early - * on, as getting exact information would degrade both performance and - * scalability. When more accurate grace-period information becomes - * available, previously posted callbacks can be "accelerated", marking - * them to complete at the end of the earlier grace period. - * - * This function operates on an rcu_segcblist structure, and also the - * grace-period sequence number seq at which new callbacks would become - * ready to invoke. 
Returns true if there are callbacks that won't be - * ready to invoke until seq, false otherwise. - */ -static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i; - - WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); - if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) - return false; - - /* - * Find the segment preceding the oldest segment of callbacks - * whose ->gp_seq[] completion is at or after that passed in via - * "seq", skipping any empty segments. This oldest segment, along - * with any later segments, can be merged in with any newly arrived - * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq" - * as their ->gp_seq[] grace-period completion sequence number. - */ - for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) - if (rsclp->tails[i] != rsclp->tails[i - 1] && - ULONG_CMP_LT(rsclp->gp_seq[i], seq)) - break; - - /* - * If all the segments contain callbacks that correspond to - * earlier grace-period sequence numbers than "seq", leave. - * Assuming that the rcu_segcblist structure has enough - * segments in its arrays, this can only happen if some of - * the non-done segments contain callbacks that really are - * ready to invoke. This situation will get straightened - * out by the next call to rcu_segcblist_advance(). - * - * Also advance to the oldest segment of callbacks whose - * ->gp_seq[] completion is at or after that passed in via "seq", - * skipping any empty segments. - */ - if (++i >= RCU_NEXT_TAIL) - return false; - - /* - * Merge all later callbacks, including newly arrived callbacks, - * into the segment located by the for-loop above. Assign "seq" - * as the ->gp_seq[] value in order to correctly handle the case - * where there were no pending callbacks in the rcu_segcblist - * structure other than in the RCU_NEXT_TAIL segment. - */ - for (; i < RCU_NEXT_TAIL; i++) { - rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; - rsclp->gp_seq[i] = seq; - } - return true; -} - -/* - * Scan the specified rcu_segcblist structure for callbacks that need - * a grace period later than the one specified by "seq". We don't look - * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't - * have a grace-period sequence number. - */ -static inline bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, - unsigned long seq) -{ - int i; - - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) - if (rsclp->tails[i - 1] != rsclp->tails[i] && - ULONG_CMP_LT(seq, rsclp->gp_seq[i])) - return true; - return false; -} - -/* - * Interim function to return rcu_segcblist head pointer. Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) -{ - return rsclp->head; -} - -/* - * Interim function to return rcu_segcblist head pointer. Longer term, the - * rcu_segcblist will be used more pervasively, removing the need for this - * function. 
- */ -static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) -{ - WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); - return rsclp->tails[RCU_NEXT_TAIL]; -} - -#endif /* __KERNEL_RCU_SEGCBLIST_H */ +#endif /* __INCLUDE_LINUX_RCU_SEGCBLIST_H */ diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h new file mode 100644 index 000000000000..d98d2f9b8d59 --- /dev/null +++ b/kernel/rcu/rcu_segcblist.h @@ -0,0 +1,645 @@ +/* + * RCU segmented callback lists + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2017 + * + * Authors: Paul E. McKenney + */ + +#include <linux/rcu_segcblist.h> + +/* Initialize simple callback list. */ +static inline void rcu_cblist_init(struct rcu_cblist *rclp) +{ + rclp->head = NULL; + rclp->tail = &rclp->head; + rclp->len = 0; + rclp->len_lazy = 0; +} + +/* Is simple callback list empty? */ +static inline bool rcu_cblist_empty(struct rcu_cblist *rclp) +{ + return !rclp->head; +} + +/* Return number of callbacks in simple callback list. */ +static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) +{ + return rclp->len; +} + +/* Return number of lazy callbacks in simple callback list. */ +static inline long rcu_cblist_n_lazy_cbs(struct rcu_cblist *rclp) +{ + return rclp->len_lazy; +} + +/* + * Debug function to actually count the number of callbacks. + * If the number exceeds the limit specified, return -1. + */ +static inline long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) +{ + int cnt = 0; + struct rcu_head **rhpp = &rclp->head; + + for (;;) { + if (!*rhpp) + return cnt; + if (++cnt > lim) + return -1; + rhpp = &(*rhpp)->next; + } +} + +/* + * Dequeue the oldest rcu_head structure from the specified callback + * list. This function assumes that the callback is non-lazy, but + * the caller can later invoke rcu_cblist_dequeued_lazy() if it + * finds otherwise (and if it cares about laziness). This allows + * different users to have different ways of determining laziness. + */ +static inline struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) +{ + struct rcu_head *rhp; + + rhp = rclp->head; + if (!rhp) + return NULL; + rclp->len--; + rclp->head = rhp->next; + if (!rclp->head) + rclp->tail = &rclp->head; + return rhp; +} + +/* + * Account for the fact that a previously dequeued callback turned out + * to be marked as lazy. + */ +static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) +{ + rclp->len_lazy--; +} + +/* + * Interim function to return rcu_cblist head pointer. Longer term, the + * rcu_cblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) +{ + return rclp->head; +} + +/* + * Interim function to return rcu_cblist head pointer. Longer term, the + * rcu_cblist will be used more pervasively, removing the need for this + * function.
+ */ +static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) +{ + WARN_ON_ONCE(rcu_cblist_empty(rclp)); + return rclp->tail; +} + +/* + * Initialize an rcu_segcblist structure. + */ +static inline void rcu_segcblist_init(struct rcu_segcblist *rsclp) +{ + int i; + + BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); + BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); + rsclp->head = NULL; + for (i = 0; i < RCU_CBLIST_NSEGS; i++) + rsclp->tails[i] = &rsclp->head; + rsclp->len = 0; + rsclp->len_lazy = 0; +} + +/* + * Is the specified rcu_segcblist structure empty? + * + * But careful! The fact that the ->head field is NULL does not + * necessarily imply that there are no callbacks associated with + * this structure. When callbacks are being invoked, they are + * removed as a group. If callback invocation must be preempted, + * the remaining callbacks will be added back to the list. Either + * way, the counts are updated later. + * + * So it is often the case that rcu_segcblist_n_cbs() should be used + * instead. + */ +static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) +{ + return !rsclp->head; +} + +/* Return number of callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) +{ + return READ_ONCE(rsclp->len); +} + +/* Return number of lazy callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) +{ + return rsclp->len_lazy; +} + +/* Return number of lazy callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) +{ + return rsclp->len - rsclp->len_lazy; +} + +/* + * Is the specified rcu_segcblist enabled, for example, not corresponding + * to an offline or callback-offloaded CPU? + */ +static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) +{ + return !!rsclp->tails[RCU_NEXT_TAIL]; +} + +/* + * Disable the specified rcu_segcblist structure, so that callbacks can + * no longer be posted to it. This structure must be empty. + */ +static inline void rcu_segcblist_disable(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); + rsclp->tails[RCU_NEXT_TAIL] = NULL; +} + +/* + * Is the specified segment of the specified rcu_segcblist structure + * empty of callbacks? + */ +static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) +{ + if (seg == RCU_DONE_TAIL) + return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; + return rsclp->tails[seg - 1] == rsclp->tails[seg]; +} + +/* + * Are all segments following the specified segment of the specified + * rcu_segcblist structure empty of callbacks? (The specified + * segment might well contain callbacks.) + */ +static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) +{ + return !*rsclp->tails[seg]; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * are ready to be invoked? + */ +static inline bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * are still pending, that is, not yet ready to be invoked? 
+ */ +static inline bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); +} + +/* + * Dequeue and return the first ready-to-invoke callback. If there + * are no ready-to-invoke callbacks, return NULL. Disables interrupts + * to avoid interference. Does not protect from interference from other + * CPUs or tasks. + */ +static inline struct rcu_head * +rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) +{ + unsigned long flags; + int i; + struct rcu_head *rhp; + + local_irq_save(flags); + if (!rcu_segcblist_ready_cbs(rsclp)) { + local_irq_restore(flags); + return NULL; + } + rhp = rsclp->head; + BUG_ON(!rhp); + rsclp->head = rhp->next; + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { + if (rsclp->tails[i] != &rhp->next) + break; + rsclp->tails[i] = &rsclp->head; + } + smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ + WRITE_ONCE(rsclp->len, rsclp->len - 1); + local_irq_restore(flags); + return rhp; +} + +/* + * Account for the fact that a previously dequeued callback turned out + * to be marked as lazy. + */ +static inline void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) +{ + unsigned long flags; + + local_irq_save(flags); + rsclp->len_lazy--; + local_irq_restore(flags); +} + +/* + * Return a pointer to the first callback in the specified rcu_segcblist + * structure. This is useful for diagnostics. + */ +static inline struct rcu_head * +rcu_segcblist_first_cb(struct rcu_segcblist *rsclp) +{ + if (rcu_segcblist_is_enabled(rsclp)) + return rsclp->head; + return NULL; +} + +/* + * Return a pointer to the first pending callback in the specified + * rcu_segcblist structure. This is useful just after posting a given + * callback -- if that callback is the first pending callback, then + * you cannot rely on someone else having already started up the required + * grace period. + */ +static inline struct rcu_head * +rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) +{ + if (rcu_segcblist_is_enabled(rsclp)) + return *rsclp->tails[RCU_DONE_TAIL]; + return NULL; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * have not yet been processed beyond having been posted, that is, + * does it contain callbacks in its last segment? + */ +static inline bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); +} + +/* + * Enqueue the specified callback onto the specified rcu_segcblist + * structure, updating accounting as needed. Note that the ->len + * field may be accessed locklessly, hence the WRITE_ONCE(). + * The ->len field is used by rcu_barrier() and friends to determine + * if it must post a callback on this structure, and it is OK + * for rcu_barrier() to sometimes post callbacks needlessly, but + * absolutely not OK for it to ever miss posting a callback. + */ +static inline void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy) +{ + WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ + if (lazy) + rsclp->len_lazy++; + smp_mb(); /* Ensure counts are updated before callback is enqueued. */ + rhp->next = NULL; + *rsclp->tails[RCU_NEXT_TAIL] = rhp; + rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; +} + +/* + * Entrain the specified callback onto the specified rcu_segcblist at + * the end of the last non-empty segment. 
If the entire rcu_segcblist + * is empty, make no change, but return false. + * + * This is intended for use by rcu_barrier()-like primitives, -not- + * for normal grace-period use. IMPORTANT: The callback you enqueue + * will wait for all prior callbacks, NOT necessarily for a grace + * period. You have been warned. + */ +static inline bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy) +{ + int i; + + if (rcu_segcblist_n_cbs(rsclp) == 0) + return false; + WRITE_ONCE(rsclp->len, rsclp->len + 1); + if (lazy) + rsclp->len_lazy++; + smp_mb(); /* Ensure counts are updated before callback is entrained. */ + rhp->next = NULL; + for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) + if (rsclp->tails[i] != rsclp->tails[i - 1]) + break; + *rsclp->tails[i] = rhp; + for (; i <= RCU_NEXT_TAIL; i++) + rsclp->tails[i] = &rhp->next; + return true; +} + +/* + * Extract only the counts from the specified rcu_segcblist structure, + * and place them in the specified rcu_cblist structure. This function + * supports both callback orphaning and invocation, hence the separation + * of counts and callbacks. (Callbacks ready for invocation must be + * orphaned and adopted separately from pending callbacks, but counts + * apply to all callbacks. Locking must be used to make sure that + * both orphaned-callbacks lists are consistent.) + */ +static inline void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rclp->len_lazy += rsclp->len_lazy; + rclp->len += rsclp->len; + rsclp->len_lazy = 0; + WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ +} + +/* + * Extract only those callbacks ready to be invoked from the specified + * rcu_segcblist structure and place them in the specified rcu_cblist + * structure. + */ +static inline void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rcu_segcblist_ready_cbs(rsclp)) + return; /* Nothing to do. */ + *rclp->tail = rsclp->head; + rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; + *rsclp->tails[RCU_DONE_TAIL] = NULL; + rclp->tail = rsclp->tails[RCU_DONE_TAIL]; + for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) + if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) + rsclp->tails[i] = &rsclp->head; +} + +/* + * Extract only those callbacks still pending (not yet ready to be + * invoked) from the specified rcu_segcblist structure and place them in + * the specified rcu_cblist structure. Note that this loses information + * about any callbacks that might have been partway done waiting for + * their grace period. Too bad! They will have to start over. + */ +static inline void +rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rcu_segcblist_pend_cbs(rsclp)) + return; /* Nothing to do. */ + *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; + rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; + *rsclp->tails[RCU_DONE_TAIL] = NULL; + for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) + rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; +} + +/* + * Move the entire contents of the specified rcu_segcblist structure, + * counts, callbacks, and all, to the specified rcu_cblist structure. + * @@@ Why do we need this??? Moving early-boot CBs to NOCB lists? + * @@@ Memory barrier needed? (Not if only used at boot time...) 
+ */ +static inline void rcu_segcblist_extract_all(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rcu_segcblist_extract_done_cbs(rsclp, rclp); + rcu_segcblist_extract_pend_cbs(rsclp, rclp); + rcu_segcblist_extract_count(rsclp, rclp); +} + +/* + * Insert counts from the specified rcu_cblist structure in the + * specified rcu_segcblist structure. + */ +static inline void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rsclp->len_lazy += rclp->len_lazy; + /* ->len sampled locklessly. */ + WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); + rclp->len_lazy = 0; + rclp->len = 0; +} + +/* + * Move callbacks from the specified rcu_cblist to the beginning of the + * done-callbacks segment of the specified rcu_segcblist. + */ +static inline void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rclp->head) + return; /* No callbacks to move. */ + *rclp->tail = rsclp->head; + rsclp->head = rclp->head; + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) + if (&rsclp->head == rsclp->tails[i]) + rsclp->tails[i] = rclp->tail; + else + break; + rclp->head = NULL; + rclp->tail = &rclp->head; +} + +/* + * Move callbacks from the specified rcu_cblist to the end of the + * new-callbacks segment of the specified rcu_segcblist. + */ +static inline void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + if (!rclp->head) + return; /* Nothing to do. */ + *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; + rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; + rclp->head = NULL; + rclp->tail = &rclp->head; +} + +/* + * Advance the callbacks in the specified rcu_segcblist structure based + * on the current value passed in for the grace-period counter. + */ +static inline void rcu_segcblist_advance(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i, j; + + WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); + if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) + return; + + /* + * Find all callbacks whose ->gp_seq numbers indicate that they + * are ready to invoke, and put them into the RCU_DONE_TAIL segment. + */ + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { + if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) + break; + rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; + } + + /* If no callbacks moved, nothing more need be done. */ + if (i == RCU_WAIT_TAIL) + return; + + /* Clean up tail pointers that might have been misordered above. */ + for (j = RCU_WAIT_TAIL; j < i; j++) + rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; + + /* + * Callbacks moved, so clean up the misordered ->tails[] pointers + * that now point into the middle of the list of ready-to-invoke + * callbacks. The overall effect is to copy down the later pointers + * into the gap that was created by the now-ready segments. + */ + for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { + if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) + break; /* No more callbacks. */ + rsclp->tails[j] = rsclp->tails[i]; + rsclp->gp_seq[j] = rsclp->gp_seq[i]; + } +} + +/* + * "Accelerate" callbacks based on more-accurate grace-period information. + * The reason for this is that RCU does not synchronize the beginnings and + * ends of grace periods, and that callbacks are posted locally. This in + * turn means that the callbacks must be labelled conservatively early + * on, as getting exact information would degrade both performance and + * scalability. 
When more accurate grace-period information becomes + * available, previously posted callbacks can be "accelerated", marking + * them to complete at the end of the earlier grace period. + * + * This function operates on an rcu_segcblist structure, and also the + * grace-period sequence number seq at which new callbacks would become + * ready to invoke. Returns true if there are callbacks that won't be + * ready to invoke until seq, false otherwise. + */ +static inline bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i; + + WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); + if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) + return false; + + /* + * Find the segment preceding the oldest segment of callbacks + * whose ->gp_seq[] completion is at or after that passed in via + * "seq", skipping any empty segments. This oldest segment, along + * with any later segments, can be merged in with any newly arrived + * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq" + * as their ->gp_seq[] grace-period completion sequence number. + */ + for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) + if (rsclp->tails[i] != rsclp->tails[i - 1] && + ULONG_CMP_LT(rsclp->gp_seq[i], seq)) + break; + + /* + * If all the segments contain callbacks that correspond to + * earlier grace-period sequence numbers than "seq", leave. + * Assuming that the rcu_segcblist structure has enough + * segments in its arrays, this can only happen if some of + * the non-done segments contain callbacks that really are + * ready to invoke. This situation will get straightened + * out by the next call to rcu_segcblist_advance(). + * + * Also advance to the oldest segment of callbacks whose + * ->gp_seq[] completion is at or after that passed in via "seq", + * skipping any empty segments. + */ + if (++i >= RCU_NEXT_TAIL) + return false; + + /* + * Merge all later callbacks, including newly arrived callbacks, + * into the segment located by the for-loop above. Assign "seq" + * as the ->gp_seq[] value in order to correctly handle the case + * where there were no pending callbacks in the rcu_segcblist + * structure other than in the RCU_NEXT_TAIL segment. + */ + for (; i < RCU_NEXT_TAIL; i++) { + rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; + rsclp->gp_seq[i] = seq; + } + return true; +} + +/* + * Scan the specified rcu_segcblist structure for callbacks that need + * a grace period later than the one specified by "seq". We don't look + * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't + * have a grace-period sequence number. + */ +static inline bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i; + + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) + if (rsclp->tails[i - 1] != rsclp->tails[i] && + ULONG_CMP_LT(seq, rsclp->gp_seq[i])) + return true; + return false; +} + +/* + * Interim function to return rcu_segcblist head pointer. Longer term, the + * rcu_segcblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) +{ + return rsclp->head; +} + +/* + * Interim function to return rcu_segcblist head pointer. Longer term, the + * rcu_segcblist will be used more pervasively, removing the need for this + * function. 
+ */ +static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); + return rsclp->tails[RCU_NEXT_TAIL]; +} diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index b8293527ee18..36e1f82faed1 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -30,6 +30,7 @@ #include #include +#include "rcu_segcblist.h" #include "rcu.h" static int init_srcu_struct_fields(struct srcu_struct *sp) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 87b070de6371..3ae8474557df 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -38,6 +38,7 @@ #include #include "rcu.h" +#include "rcu_segcblist.h" ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */ module_param(exp_holdoff, ulong, 0444); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 0e598ab08fea..ba38262c3554 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -30,9 +30,10 @@ #include #include #include -#include #include +#include "rcu_segcblist.h" + /* * Dynticks per-CPU state. */ -- cgit v1.2.3 From c0332694903a37cf8ecdc9102d5c9e09cf8643d0 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 1 May 2017 08:58:49 -0700 Subject: block: Remove elevator_change() Since commit 84253394927c ("remove the mg_disk driver") removed the only caller of elevator_change(), also remove the elevator_change() function itself. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Markus Trippelsdorf Signed-off-by: Jens Axboe --- block/elevator.c | 13 ------------- include/linux/elevator.h | 1 - 2 files changed, 14 deletions(-) (limited to 'include/linux') diff --git a/block/elevator.c b/block/elevator.c index bf11e70f008b..80f485451096 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -1088,19 +1088,6 @@ static int __elevator_change(struct request_queue *q, const char *name) return elevator_switch(q, e); } -int elevator_change(struct request_queue *q, const char *name) -{ - int ret; - - /* Protect q->elevator from elevator_init() */ - mutex_lock(&q->sysfs_lock); - ret = __elevator_change(q, name); - mutex_unlock(&q->sysfs_lock); - - return ret; -} -EXPORT_SYMBOL(elevator_change); - static inline bool elv_support_iosched(struct request_queue *q) { if (q->mq_ops && q->tag_set && (q->tag_set->flags & diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 3a216318ae73..d44840368ee7 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -214,7 +214,6 @@ extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t); extern int elevator_init(struct request_queue *, char *); extern void elevator_exit(struct request_queue *, struct elevator_queue *); -extern int elevator_change(struct request_queue *, const char *); extern bool elv_bio_merge_ok(struct request *, struct bio *); extern struct elevator_queue *elevator_alloc(struct request_queue *, struct elevator_type *); -- cgit v1.2.3 From d6296d39e90c9075bc2fc15f1e86dac44930d4b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 May 2017 10:19:08 -0600 Subject: blk-mq: update ->init_request and ->exit_request prototypes Remove the request_idx parameter, which can't be used safely now that we support I/O schedulers with blk-mq. Except for a superfluous check in mtip32xx it was unused anyway. Also pass the tag_set instead of just the driver data - this allows drivers to avoid some code duplication in a follow-on cleanup.
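As an illustration of the new calling convention, a minimal sketch of a converted callback (hypothetical foo_* names, modeled on the loop and nbd conversions in the diff below; not part of the patch):

static int foo_init_request(struct blk_mq_tag_set *set, struct request *rq,
			    unsigned int hctx_idx, unsigned int numa_node)
{
	/* driver data now comes from the tag_set rather than a void *data */
	struct foo_device *foo = set->driver_data;
	struct foo_cmd *cmd = blk_mq_rq_to_pdu(rq);

	cmd->foo = foo;
	return 0;
}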
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 18 +++++------------- drivers/block/loop.c | 5 ++--- drivers/block/mtip32xx/mtip32xx.c | 20 ++++++-------------- drivers/block/nbd.c | 7 +++---- drivers/block/rbd.c | 5 ++--- drivers/block/virtio_blk.c | 7 +++---- drivers/md/dm-rq.c | 7 +++---- drivers/mtd/ubi/block.c | 7 +++---- drivers/nvme/host/fc.c | 20 +++++++++----------- drivers/nvme/host/pci.c | 15 +++++++-------- drivers/nvme/host/rdma.c | 28 ++++++++++++++-------------- drivers/nvme/target/loop.c | 17 +++++++++-------- drivers/scsi/scsi_lib.c | 13 ++++++------- include/linux/blk-mq.h | 4 ++-- 14 files changed, 74 insertions(+), 99 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index bf90684a007a..b81e4a7cd7f2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1655,8 +1655,7 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, if (!rq) continue; - set->ops->exit_request(set->driver_data, rq, - hctx_idx, i); + set->ops->exit_request(set, rq, hctx_idx); tags->static_rqs[i] = NULL; } } @@ -1787,8 +1786,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, tags->static_rqs[i] = rq; if (set->ops->init_request) { - if (set->ops->init_request(set->driver_data, - rq, hctx_idx, i, + if (set->ops->init_request(set, rq, hctx_idx, node)) { tags->static_rqs[i] = NULL; goto fail; @@ -1849,14 +1847,10 @@ static void blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { - unsigned flush_start_tag = set->queue_depth; - blk_mq_tag_idle(hctx); if (set->ops->exit_request) - set->ops->exit_request(set->driver_data, - hctx->fq->flush_rq, hctx_idx, - flush_start_tag + hctx_idx); + set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); blk_mq_sched_exit_hctx(q, hctx, hctx_idx); @@ -1889,7 +1883,6 @@ static int blk_mq_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) { int node; - unsigned flush_start_tag = set->queue_depth; node = hctx->numa_node; if (node == NUMA_NO_NODE) @@ -1933,9 +1926,8 @@ static int blk_mq_init_hctx(struct request_queue *q, goto sched_exit_hctx; if (set->ops->init_request && - set->ops->init_request(set->driver_data, - hctx->fq->flush_rq, hctx_idx, - flush_start_tag + hctx_idx, node)) + set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx, + node)) goto free_fq; if (hctx->flags & BLK_MQ_F_BLOCKING) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 994403efee19..28d932906f24 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1697,9 +1697,8 @@ static void loop_queue_work(struct kthread_work *work) loop_handle_cmd(cmd); } -static int loop_init_request(void *data, struct request *rq, - unsigned int hctx_idx, unsigned int request_idx, - unsigned int numa_node) +static int loop_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) { struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 9108be601a64..96fe6500e941 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3818,10 +3818,10 @@ static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_MQ_RQ_QUEUE_ERROR; } -static void mtip_free_cmd(void *data, struct request *rq, - unsigned int hctx_idx, unsigned int request_idx) +static void mtip_free_cmd(struct blk_mq_tag_set *set, struct request *rq, + unsigned int 
hctx_idx) { - struct driver_data *dd = data; + struct driver_data *dd = set->driver_data; struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); if (!cmd->command) @@ -3831,20 +3831,12 @@ static void mtip_free_cmd(void *data, struct request *rq, cmd->command, cmd->command_dma); } -static int mtip_init_cmd(void *data, struct request *rq, unsigned int hctx_idx, - unsigned int request_idx, unsigned int numa_node) +static int mtip_init_cmd(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) { - struct driver_data *dd = data; + struct driver_data *dd = set->driver_data; struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); - /* - * For flush requests, request_idx starts at the end of the - * tag space. Since we don't support FLUSH/FUA, simply return - * 0 as there's nothing to be done. - */ - if (request_idx >= MTIP_MAX_COMMAND_SLOTS) - return 0; - cmd->command = dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, &cmd->command_dma, GFP_KERNEL); if (!cmd->command) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index ac376b9b852d..6b98ec2a3824 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1396,12 +1396,11 @@ static void nbd_dbg_close(void) #endif -static int nbd_init_request(void *data, struct request *rq, - unsigned int hctx_idx, unsigned int request_idx, - unsigned int numa_node) +static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq); - cmd->nbd = data; + cmd->nbd = set->driver_data; return 0; } diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 089ac4179919..3670e8dd03fe 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4307,9 +4307,8 @@ out: return ret; } -static int rbd_init_request(void *data, struct request *rq, - unsigned int hctx_idx, unsigned int request_idx, - unsigned int numa_node) +static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) { struct work_struct *work = blk_mq_rq_to_pdu(rq); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index f94614257462..94173de1efaa 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -573,11 +573,10 @@ static const struct device_attribute dev_attr_cache_type_rw = __ATTR(cache_type, S_IRUGO|S_IWUSR, virtblk_cache_type_show, virtblk_cache_type_store); -static int virtblk_init_request(void *data, struct request *rq, - unsigned int hctx_idx, unsigned int request_idx, - unsigned int numa_node) +static int virtblk_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) { - struct virtio_blk *vblk = data; + struct virtio_blk *vblk = set->driver_data; struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq); #ifdef CONFIG_VIRTIO_BLK_SCSI diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index bff7e3bdb4ed..522d4fa8db64 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -719,11 +719,10 @@ int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t) return 0; } -static int dm_mq_init_request(void *data, struct request *rq, - unsigned int hctx_idx, unsigned int request_idx, - unsigned int numa_node) +static int dm_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) { - return __dm_rq_init_rq(data, rq); + return __dm_rq_init_rq(set->driver_data, rq); } static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, diff --git 
a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 51f2be8889b5..5497e65439df 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -334,10 +334,9 @@ static int ubiblock_queue_rq(struct blk_mq_hw_ctx *hctx, } -static int ubiblock_init_request(void *data, struct request *req, - unsigned int hctx_idx, - unsigned int request_idx, - unsigned int numa_node) +static int ubiblock_init_request(struct blk_mq_tag_set *set, + struct request *req, unsigned int hctx_idx, + unsigned int numa_node) { struct ubiblock_pdu *pdu = blk_mq_rq_to_pdu(req); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 4976db56e351..70e689bf1cad 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1172,12 +1172,12 @@ __nvme_fc_exit_request(struct nvme_fc_ctrl *ctrl, } static void -nvme_fc_exit_request(void *data, struct request *rq, - unsigned int hctx_idx, unsigned int rq_idx) +nvme_fc_exit_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx) { struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); - return __nvme_fc_exit_request(data, op); + return __nvme_fc_exit_request(set->driver_data, op); } static int @@ -1434,11 +1434,10 @@ out_on_error: } static int -nvme_fc_init_request(void *data, struct request *rq, - unsigned int hctx_idx, unsigned int rq_idx, - unsigned int numa_node) +nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) { - struct nvme_fc_ctrl *ctrl = data; + struct nvme_fc_ctrl *ctrl = set->driver_data; struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); struct nvme_fc_queue *queue = &ctrl->queues[hctx_idx+1]; @@ -1446,11 +1445,10 @@ nvme_fc_init_request(void *data, struct request *rq, } static int -nvme_fc_init_admin_request(void *data, struct request *rq, - unsigned int hctx_idx, unsigned int rq_idx, - unsigned int numa_node) +nvme_fc_init_admin_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) { - struct nvme_fc_ctrl *ctrl = data; + struct nvme_fc_ctrl *ctrl = set->driver_data; struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); struct nvme_fc_queue *queue = &ctrl->queues[0]; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c8541c3dcd19..56a315bd4d96 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -356,11 +356,11 @@ static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_i nvmeq->tags = NULL; } -static int nvme_admin_init_request(void *data, struct request *req, - unsigned int hctx_idx, unsigned int rq_idx, - unsigned int numa_node) +static int nvme_admin_init_request(struct blk_mq_tag_set *set, + struct request *req, unsigned int hctx_idx, + unsigned int numa_node) { - struct nvme_dev *dev = data; + struct nvme_dev *dev = set->driver_data; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = dev->queues[0]; @@ -383,11 +383,10 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, return 0; } -static int nvme_init_request(void *data, struct request *req, - unsigned int hctx_idx, unsigned int rq_idx, - unsigned int numa_node) +static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, + unsigned int hctx_idx, unsigned int numa_node) { - struct nvme_dev *dev = data; + struct nvme_dev *dev = set->driver_data; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 
29cf88ac3f61..dd1c6deef82f 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -315,16 +315,16 @@ static void __nvme_rdma_exit_request(struct nvme_rdma_ctrl *ctrl, DMA_TO_DEVICE); } -static void nvme_rdma_exit_request(void *data, struct request *rq, - unsigned int hctx_idx, unsigned int rq_idx) +static void nvme_rdma_exit_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx) { - return __nvme_rdma_exit_request(data, rq, hctx_idx + 1); + return __nvme_rdma_exit_request(set->driver_data, rq, hctx_idx + 1); } -static void nvme_rdma_exit_admin_request(void *data, struct request *rq, - unsigned int hctx_idx, unsigned int rq_idx) +static void nvme_rdma_exit_admin_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx) { - return __nvme_rdma_exit_request(data, rq, 0); + return __nvme_rdma_exit_request(set->driver_data, rq, 0); } static int __nvme_rdma_init_request(struct nvme_rdma_ctrl *ctrl, @@ -358,18 +358,18 @@ out_free_qe: return -ENOMEM; } -static int nvme_rdma_init_request(void *data, struct request *rq, - unsigned int hctx_idx, unsigned int rq_idx, - unsigned int numa_node) +static int nvme_rdma_init_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx, + unsigned int numa_node) { - return __nvme_rdma_init_request(data, rq, hctx_idx + 1); + return __nvme_rdma_init_request(set->driver_data, rq, hctx_idx + 1); } -static int nvme_rdma_init_admin_request(void *data, struct request *rq, - unsigned int hctx_idx, unsigned int rq_idx, - unsigned int numa_node) +static int nvme_rdma_init_admin_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx, + unsigned int numa_node) { - return __nvme_rdma_init_request(data, rq, 0); + return __nvme_rdma_init_request(set->driver_data, rq, 0); } static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 304f1c87c160..feb497134aee 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -230,18 +230,19 @@ static int nvme_loop_init_iod(struct nvme_loop_ctrl *ctrl, return 0; } -static int nvme_loop_init_request(void *data, struct request *req, - unsigned int hctx_idx, unsigned int rq_idx, - unsigned int numa_node) +static int nvme_loop_init_request(struct blk_mq_tag_set *set, + struct request *req, unsigned int hctx_idx, + unsigned int numa_node) { - return nvme_loop_init_iod(data, blk_mq_rq_to_pdu(req), hctx_idx + 1); + return nvme_loop_init_iod(set->driver_data, blk_mq_rq_to_pdu(req), + hctx_idx + 1); } -static int nvme_loop_init_admin_request(void *data, struct request *req, - unsigned int hctx_idx, unsigned int rq_idx, - unsigned int numa_node) +static int nvme_loop_init_admin_request(struct blk_mq_tag_set *set, + struct request *req, unsigned int hctx_idx, + unsigned int numa_node) { - return nvme_loop_init_iod(data, blk_mq_rq_to_pdu(req), 0); + return nvme_loop_init_iod(set->driver_data, blk_mq_rq_to_pdu(req), 0); } static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 1c3e87d6c48f..327b10206d63 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1999,11 +1999,10 @@ static enum blk_eh_timer_return scsi_timeout(struct request *req, return scsi_times_out(req); } -static int scsi_init_request(void *data, struct request *rq, - unsigned int hctx_idx, unsigned int request_idx, - unsigned int numa_node) +static int 
scsi_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) { - struct Scsi_Host *shost = data; + struct Scsi_Host *shost = set->driver_data; struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); cmd->sense_buffer = @@ -2014,10 +2013,10 @@ static int scsi_init_request(void *data, struct request *rq, return 0; } -static void scsi_exit_request(void *data, struct request *rq, - unsigned int hctx_idx, unsigned int request_idx) +static void scsi_exit_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx) { - struct Scsi_Host *shost = data; + struct Scsi_Host *shost = set->driver_data; struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); scsi_free_sense_buffer(shost, cmd->sense_buffer); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f3e5e1de1bdb..a104832e7ae5 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -86,9 +86,9 @@ typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); -typedef int (init_request_fn)(void *, struct request *, unsigned int, +typedef int (init_request_fn)(struct blk_mq_tag_set *set, struct request *, unsigned int, unsigned int); -typedef void (exit_request_fn)(void *, struct request *, unsigned int, +typedef void (exit_request_fn)(struct blk_mq_tag_set *set, struct request *, unsigned int); typedef int (reinit_request_fn)(void *, struct request *); -- cgit v1.2.3 From 45a0642b4d021a2f50d5db9c191b5bfe60bfa1c7 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 2 May 2017 10:16:05 -0400 Subject: audit: kernel generated netlink traffic should have a portid of 0 We were setting the portid incorrectly in the netlink message headers; fix it to always be 0 (nlmsg_pid = 0).
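Condensed, the intended behavior (a sketch of the audit_make_reply()/audit_send_reply() flow from the diff below, not a complete function):

	/* header portid is now hard-coded to 0: kernel-generated traffic */
	nlh = nlmsg_put(skb, 0, seq, t, size, flags);
	/* the requester's portid survives only as the unicast destination */
	reply->portid = NETLINK_CB(request_skb).portid;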
Signed-off-by: Paul Moore Reviewed-by: Richard Guy Briggs --- include/linux/audit.h | 3 +-- kernel/audit.c | 23 ++++++----------------- kernel/audit.h | 3 +-- kernel/auditfilter.c | 14 ++++++-------- 4 files changed, 14 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 504e784b7ffa..cc0497c39472 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -163,8 +163,7 @@ extern void audit_log_task_info(struct audit_buffer *ab, extern int audit_update_lsm_rules(void); /* Private API (for audit.c only) */ -extern int audit_rule_change(int type, __u32 portid, int seq, - void *data, size_t datasz); +extern int audit_rule_change(int type, int seq, void *data, size_t datasz); extern int audit_list_rules_send(struct sk_buff *request_skb, int seq); extern u32 audit_enabled; diff --git a/kernel/audit.c b/kernel/audit.c index eff602c1aa79..b40f3c4727e1 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -250,14 +250,6 @@ static struct sock *audit_get_sk(const struct net *net) return aunet->sk; } -static void audit_set_portid(struct audit_buffer *ab, __u32 portid) -{ - if (ab) { - struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); - nlh->nlmsg_pid = portid; - } -} - void audit_panic(const char *message) { switch (audit_failure) { @@ -816,7 +808,7 @@ int audit_send_list(void *_dest) return 0; } -struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done, +struct sk_buff *audit_make_reply(int seq, int type, int done, int multi, const void *payload, int size) { struct sk_buff *skb; @@ -829,7 +821,7 @@ struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done, if (!skb) return NULL; - nlh = nlmsg_put(skb, portid, seq, t, size, flags); + nlh = nlmsg_put(skb, 0, seq, t, size, flags); if (!nlh) goto out_kfree_skb; data = nlmsg_data(nlh); @@ -873,7 +865,6 @@ static int audit_send_reply_thread(void *arg) static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { - u32 portid = NETLINK_CB(request_skb).portid; struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct sk_buff *skb; struct task_struct *tsk; @@ -883,12 +874,12 @@ static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int if (!reply) return; - skb = audit_make_reply(portid, seq, type, done, multi, payload, size); + skb = audit_make_reply(seq, type, done, multi, payload, size); if (!skb) goto out; reply->net = get_net(net); - reply->portid = portid; + reply->portid = NETLINK_CB(request_skb).portid; reply->skb = skb; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); @@ -1072,7 +1063,7 @@ static int audit_replace(pid_t pid) { struct sk_buff *skb; - skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0, &pid, sizeof(pid)); + skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pid, sizeof(pid)); if (!skb) return -ENOMEM; return auditd_send_unicast_skb(skb); @@ -1242,7 +1233,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) size--; audit_log_n_untrustedstring(ab, data, size); } - audit_set_portid(ab, NETLINK_CB(skb).portid); audit_log_end(ab); } break; @@ -1256,8 +1246,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_end(ab); return -EPERM; } - err = audit_rule_change(msg_type, NETLINK_CB(skb).portid, - seq, data, nlmsg_len(nlh)); + err = audit_rule_change(msg_type, seq, data, nlmsg_len(nlh)); break; case AUDIT_LIST_RULES: err = audit_list_rules_send(skb, seq); diff --git 
a/kernel/audit.h b/kernel/audit.h index 0d87f8ab8778..18f3c2deeccf 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -237,8 +237,7 @@ extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); extern int parent_len(const char *path); extern int audit_compare_dname_path(const char *dname, const char *path, int plen); -extern struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, - int done, int multi, +extern struct sk_buff *audit_make_reply(int seq, int type, int done, int multi, const void *payload, int size); extern void audit_panic(const char *message); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 239d11c3122c..0b0aa5854dac 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1033,7 +1033,7 @@ out: } /* List rules using struct audit_rule_data. */ -static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q) +static void audit_list_rules(int seq, struct sk_buff_head *q) { struct sk_buff *skb; struct audit_krule *r; @@ -1048,15 +1048,15 @@ static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q) data = audit_krule_to_data(r); if (unlikely(!data)) break; - skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, - 0, 1, data, + skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1, + data, sizeof(*data) + data->buflen); if (skb) skb_queue_tail(q, skb); kfree(data); } } - skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); + skb = audit_make_reply(seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); if (skb) skb_queue_tail(q, skb); } @@ -1085,13 +1085,11 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re /** * audit_rule_change - apply all rules to the specified message type * @type: audit message type - * @portid: target port id for netlink audit messages * @seq: netlink audit message sequence (serial) number * @data: payload data * @datasz: size of payload data */ -int audit_rule_change(int type, __u32 portid, int seq, void *data, - size_t datasz) +int audit_rule_change(int type, int seq, void *data, size_t datasz) { int err = 0; struct audit_entry *entry; @@ -1150,7 +1148,7 @@ int audit_list_rules_send(struct sk_buff *request_skb, int seq) skb_queue_head_init(&dest->q); mutex_lock(&audit_filter_mutex); - audit_list_rules(portid, seq, &dest->q); + audit_list_rules(seq, &dest->q); mutex_unlock(&audit_filter_mutex); tsk = kthread_run(audit_send_list, dest, "audit_send_list"); -- cgit v1.2.3 From 2115bb250f260089743e26decfb5f271ba71ca37 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Tue, 2 May 2017 10:16:05 -0400 Subject: audit: Use timespec64 to represent audit timestamps struct timespec is not y2038-safe. Audit timestamps are recorded in string format into an audit buffer for a given context. These mark the entry timestamps for the syscalls. Use the y2038-safe struct timespec64 to represent the times. The log strings can handle this transition as strings can hold up to 1024 characters.
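The resulting timestamp path, condensed (a sketch of the pieces changed in the diff below; serial-number handling omitted):

	struct timespec64 t;

	ktime_get_real_ts64(&t);	/* y2038-safe replacement for CURRENT_TIME */
	audit_log_format(ab, "audit(%llu.%03lu:%u): ",
			 (unsigned long long)t.tv_sec, t.tv_nsec / 1000000, serial);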
Signed-off-by: Deepa Dinamani Reviewed-by: Arnd Bergmann Acked-by: Paul Moore Acked-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/linux/audit.h | 4 ++-- kernel/audit.c | 10 +++++----- kernel/audit.h | 2 +- kernel/auditsc.c | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index cc0497c39472..2150bdccfbab 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -331,7 +331,7 @@ static inline void audit_ptrace(struct task_struct *t) /* Private API (for audit.c only) */ extern unsigned int audit_serial(void); extern int auditsc_get_stamp(struct audit_context *ctx, - struct timespec *t, unsigned int *serial); + struct timespec64 *t, unsigned int *serial); extern int audit_set_loginuid(kuid_t loginuid); static inline kuid_t audit_get_loginuid(struct task_struct *tsk) @@ -510,7 +510,7 @@ static inline void __audit_seccomp(unsigned long syscall, long signr, int code) static inline void audit_seccomp(unsigned long syscall, long signr, int code) { } static inline int auditsc_get_stamp(struct audit_context *ctx, - struct timespec *t, unsigned int *serial) + struct timespec64 *t, unsigned int *serial) { return 0; } diff --git a/kernel/audit.c b/kernel/audit.c index a2f7803a68d0..41efd2ad1931 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1638,10 +1638,10 @@ unsigned int audit_serial(void) } static inline void audit_get_stamp(struct audit_context *ctx, - struct timespec *t, unsigned int *serial) + struct timespec64 *t, unsigned int *serial) { if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { - *t = CURRENT_TIME; + ktime_get_real_ts64(t); *serial = audit_serial(); } } @@ -1665,7 +1665,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; - struct timespec t; + struct timespec64 t; unsigned int uninitialized_var(serial); if (audit_initialized != AUDIT_INITIALIZED) @@ -1718,8 +1718,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, } audit_get_stamp(ab->ctx, &t, &serial); - audit_log_format(ab, "audit(%lu.%03lu:%u): ", - t.tv_sec, t.tv_nsec/1000000, serial); + audit_log_format(ab, "audit(%llu.%03lu:%u): ", + (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); return ab; } diff --git a/kernel/audit.h b/kernel/audit.h index 4987ea2a4702..ddfce2ea4891 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -112,7 +112,7 @@ struct audit_context { enum audit_state state, current_state; unsigned int serial; /* serial number for record */ int major; /* syscall number */ - struct timespec ctime; /* time of syscall entry */ + struct timespec64 ctime; /* time of syscall entry */ unsigned long argv[4]; /* syscall arguments */ long return_code;/* syscall return code */ u64 prio; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1c2333155893..b2dcbe637b7c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1532,7 +1532,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, return; context->serial = 0; - context->ctime = CURRENT_TIME; + ktime_get_real_ts64(&context->ctime); context->in_syscall = 1; context->current_state = state; context->ppid = 0; @@ -1941,13 +1941,13 @@ EXPORT_SYMBOL_GPL(__audit_inode_child); /** * auditsc_get_stamp - get local copies of audit_context values * @ctx: audit_context for the task - * @t: timespec to store time recorded in the audit_context + * @t: timespec64 to store time recorded in the audit_context * @serial: serial 
value that is recorded in the audit_context * * Also sets the context as auditable. */ int auditsc_get_stamp(struct audit_context *ctx, - struct timespec *t, unsigned int *serial) + struct timespec64 *t, unsigned int *serial) { if (!ctx->in_syscall) return 0; -- cgit v1.2.3 From 8d3f87d8cd0a16c58ae7e4410938528866c1c0db Mon Sep 17 00:00:00 2001 From: "sudarsana.kalluru@cavium.com" Date: Tue, 2 May 2017 01:11:03 -0700 Subject: qed*: Fix issues in the ptp filter config implementation. PTP hardware filter configuration performed by the driver for a given user-requested config is not correct for some of the PTP modes. The following changes are needed in the PTP config-filter implementation. 1. NIG_REG_TX_PTP_EN register - Bits 0/1/2 respectively enable TimeSync/"V1 frame format support"/"V2 frame format support" on the TX side. Set the associated bits based on the user request. 2. ptp4l application fails to operate in Peer Delay mode. The following changes are needed to fix this: a. Driver should enable (set to 0) DA #1-related bits for IPv4, IPv6 and MAC destination addresses in these registers: NIG_REG_TX_LLH_PTP_RULE_MASK NIG_REG_LLH_PTP_RULE_MASK b. NIG_REG_LLH_PTP_PARAM_MASK/NIG_REG_TX_LLH_PTP_PARAM_MASK should be set to 0x0 in all modes. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_ptp.c | 84 ++++++++++++++++++----------- drivers/net/ethernet/qlogic/qede/qede_ptp.c | 34 +++++++++--- include/linux/qed/qed_eth_if.h | 23 +++++--- 3 files changed, 98 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed_ptp.c b/drivers/net/ethernet/qlogic/qed/qed_ptp.c index 1871ebfdb793..434a164a76ed 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ptp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ptp.c @@ -188,39 +188,73 @@ static int qed_ptp_hw_read_cc(struct qed_dev *cdev, u64 *phc_cycles) } /* Filter PTP protocol packets that need to be timestamped */ -static int qed_ptp_hw_cfg_rx_filters(struct qed_dev *cdev, - enum qed_ptp_filter_type type) +static int qed_ptp_hw_cfg_filters(struct qed_dev *cdev, + enum qed_ptp_filter_type rx_type, + enum qed_ptp_hwtstamp_tx_type tx_type) { struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); struct qed_ptt *p_ptt = p_hwfn->p_ptp_ptt; - u32 rule_mask, parm_mask; + u32 rule_mask, enable_cfg = 0x0; - switch (type) { - case QED_PTP_FILTER_L2_IPV4_IPV6: - parm_mask = 0x6AA; - rule_mask = 0x3EEE; + switch (rx_type) { + case QED_PTP_FILTER_NONE: + enable_cfg = 0x0; + rule_mask = 0x3FFF; break; - case QED_PTP_FILTER_L2: - parm_mask = 0x6BF; - rule_mask = 0x3EFF; + case QED_PTP_FILTER_ALL: + enable_cfg = 0x7; + rule_mask = 0x3CAA; break; - case QED_PTP_FILTER_IPV4_IPV6: - parm_mask = 0x7EA; - rule_mask = 0x3FFE; + case QED_PTP_FILTER_V1_L4_EVENT: + enable_cfg = 0x3; + rule_mask = 0x3FFA; break; - case QED_PTP_FILTER_IPV4: - parm_mask = 0x7EE; + case QED_PTP_FILTER_V1_L4_GEN: + enable_cfg = 0x3; rule_mask = 0x3FFE; break; + case QED_PTP_FILTER_V2_L4_EVENT: + enable_cfg = 0x5; + rule_mask = 0x3FAA; + break; + case QED_PTP_FILTER_V2_L4_GEN: + enable_cfg = 0x5; + rule_mask = 0x3FEE; + break; + case QED_PTP_FILTER_V2_L2_EVENT: + enable_cfg = 0x5; + rule_mask = 0x3CFF; + break; + case QED_PTP_FILTER_V2_L2_GEN: + enable_cfg = 0x5; + rule_mask = 0x3EFF; + break; + case QED_PTP_FILTER_V2_EVENT: + enable_cfg = 0x5; + rule_mask = 0x3CAA; + break; + case QED_PTP_FILTER_V2_GEN: + enable_cfg = 0x5; + rule_mask = 0x3EEE; + break; default: - DP_INFO(p_hwfn,
"Invalid PTP filter type %d\n", type); + DP_INFO(p_hwfn, "Invalid PTP filter type %d\n", rx_type); return -EINVAL; } - qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_PARAM_MASK, parm_mask); + qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_PARAM_MASK, 0); qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_RULE_MASK, rule_mask); + qed_wr(p_hwfn, p_ptt, NIG_REG_RX_PTP_EN, enable_cfg); - qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_TO_HOST, 0x1); + if (tx_type == QED_PTP_HWTSTAMP_TX_OFF) { + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_PTP_EN, 0x0); + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_PARAM_MASK, 0x7FF); + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_RULE_MASK, 0x3FFF); + } else { + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_PTP_EN, enable_cfg); + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_PARAM_MASK, 0); + qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_RULE_MASK, rule_mask); + } /* Reset possibly old timestamps */ qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_PTP_HOST_BUF_SEQID, @@ -383,17 +417,6 @@ static int qed_ptp_hw_enable(struct qed_dev *cdev) return 0; } -static int qed_ptp_hw_hwtstamp_tx_on(struct qed_dev *cdev) -{ - struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); - struct qed_ptt *p_ptt = p_hwfn->p_ptp_ptt; - - qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_PARAM_MASK, 0x6AA); - qed_wr(p_hwfn, p_ptt, NIG_REG_TX_LLH_PTP_RULE_MASK, 0x3EEE); - - return 0; -} - static int qed_ptp_hw_disable(struct qed_dev *cdev) { struct qed_hwfn *p_hwfn = QED_LEADING_HWFN(cdev); @@ -419,8 +442,7 @@ static int qed_ptp_hw_disable(struct qed_dev *cdev) } const struct qed_eth_ptp_ops qed_ptp_ops_pass = { - .hwtstamp_tx_on = qed_ptp_hw_hwtstamp_tx_on, - .cfg_rx_filters = qed_ptp_hw_cfg_rx_filters, + .cfg_filters = qed_ptp_hw_cfg_filters, .read_rx_ts = qed_ptp_hw_read_rx_ts, .read_tx_ts = qed_ptp_hw_read_tx_ts, .read_cc = qed_ptp_hw_read_cc, diff --git a/drivers/net/ethernet/qlogic/qede/qede_ptp.c b/drivers/net/ethernet/qlogic/qede/qede_ptp.c index aa4b5e7bb8e1..24f06e2ef43e 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_ptp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_ptp.c @@ -209,6 +209,8 @@ static u64 qede_ptp_read_cc(const struct cyclecounter *cc) static int qede_ptp_cfg_filters(struct qede_dev *edev) { + enum qed_ptp_hwtstamp_tx_type tx_type = QED_PTP_HWTSTAMP_TX_ON; + enum qed_ptp_filter_type rx_filter = QED_PTP_FILTER_NONE; struct qede_ptp *ptp = edev->ptp; if (!ptp) @@ -222,7 +224,12 @@ static int qede_ptp_cfg_filters(struct qede_dev *edev) switch (ptp->tx_type) { case HWTSTAMP_TX_ON: edev->flags |= QEDE_TX_TIMESTAMPING_EN; - ptp->ops->hwtstamp_tx_on(edev->cdev); + tx_type = QED_PTP_HWTSTAMP_TX_ON; + break; + + case HWTSTAMP_TX_OFF: + edev->flags &= ~QEDE_TX_TIMESTAMPING_EN; + tx_type = QED_PTP_HWTSTAMP_TX_OFF; break; case HWTSTAMP_TX_ONESTEP_SYNC: @@ -233,42 +240,57 @@ static int qede_ptp_cfg_filters(struct qede_dev *edev) spin_lock_bh(&ptp->lock); switch (ptp->rx_filter) { case HWTSTAMP_FILTER_NONE: + rx_filter = QED_PTP_FILTER_NONE; break; case HWTSTAMP_FILTER_ALL: case HWTSTAMP_FILTER_SOME: ptp->rx_filter = HWTSTAMP_FILTER_NONE; + rx_filter = QED_PTP_FILTER_ALL; break; case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: + ptp->rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_EVENT; + rx_filter = QED_PTP_FILTER_V1_L4_EVENT; + break; case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: ptp->rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_EVENT; /* Initialize PTP detection for UDP/IPv4 events */ - ptp->ops->cfg_rx_filters(edev->cdev, QED_PTP_FILTER_IPV4); + rx_filter = QED_PTP_FILTER_V1_L4_GEN; break; case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: + ptp->rx_filter = 
HWTSTAMP_FILTER_PTP_V2_L4_EVENT; + rx_filter = QED_PTP_FILTER_V2_L4_EVENT; + break; case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: ptp->rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT; /* Initialize PTP detection for UDP/IPv4 or UDP/IPv6 events */ - ptp->ops->cfg_rx_filters(edev->cdev, QED_PTP_FILTER_IPV4_IPV6); + rx_filter = QED_PTP_FILTER_V2_L4_GEN; break; case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: + ptp->rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT; + rx_filter = QED_PTP_FILTER_V2_L2_EVENT; + break; case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: ptp->rx_filter = HWTSTAMP_FILTER_PTP_V2_L2_EVENT; /* Initialize PTP detection L2 events */ - ptp->ops->cfg_rx_filters(edev->cdev, QED_PTP_FILTER_L2); + rx_filter = QED_PTP_FILTER_V2_L2_GEN; break; case HWTSTAMP_FILTER_PTP_V2_EVENT: + ptp->rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT; + rx_filter = QED_PTP_FILTER_V2_EVENT; + break; case HWTSTAMP_FILTER_PTP_V2_SYNC: case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: ptp->rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT; /* Initialize PTP detection L2, UDP/IPv4 or UDP/IPv6 events */ - ptp->ops->cfg_rx_filters(edev->cdev, - QED_PTP_FILTER_L2_IPV4_IPV6); + rx_filter = QED_PTP_FILTER_V2_GEN; break; } + ptp->ops->cfg_filters(edev->cdev, rx_filter, tx_type); + spin_unlock_bh(&ptp->lock); return 0; diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h index 15fa7c6e4c6f..d66d16a559e1 100644 --- a/include/linux/qed/qed_eth_if.h +++ b/include/linux/qed/qed_eth_if.h @@ -164,10 +164,21 @@ struct qed_eth_cb_ops { #define QED_MAX_PHC_DRIFT_PPB 291666666 enum qed_ptp_filter_type { - QED_PTP_FILTER_L2, - QED_PTP_FILTER_IPV4, - QED_PTP_FILTER_IPV4_IPV6, - QED_PTP_FILTER_L2_IPV4_IPV6 + QED_PTP_FILTER_NONE, + QED_PTP_FILTER_ALL, + QED_PTP_FILTER_V1_L4_EVENT, + QED_PTP_FILTER_V1_L4_GEN, + QED_PTP_FILTER_V2_L4_EVENT, + QED_PTP_FILTER_V2_L4_GEN, + QED_PTP_FILTER_V2_L2_EVENT, + QED_PTP_FILTER_V2_L2_GEN, + QED_PTP_FILTER_V2_EVENT, + QED_PTP_FILTER_V2_GEN +}; + +enum qed_ptp_hwtstamp_tx_type { + QED_PTP_HWTSTAMP_TX_OFF, + QED_PTP_HWTSTAMP_TX_ON, }; #ifdef CONFIG_DCB @@ -230,8 +241,8 @@ struct qed_eth_dcbnl_ops { #endif struct qed_eth_ptp_ops { - int (*hwtstamp_tx_on)(struct qed_dev *); - int (*cfg_rx_filters)(struct qed_dev *, enum qed_ptp_filter_type); + int (*cfg_filters)(struct qed_dev *, enum qed_ptp_filter_type, + enum qed_ptp_hwtstamp_tx_type); int (*read_rx_ts)(struct qed_dev *, u64 *); int (*read_tx_ts)(struct qed_dev *, u64 *); int (*read_cc)(struct qed_dev *, u64 *); -- cgit v1.2.3 From 9b2bbdb227588455afcc3b03475fa9b0a35d83af Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 6 Mar 2017 18:19:39 +0200 Subject: virtio: wrap find_vqs We are going to add more parameters to find_vqs; let's wrap the call so we don't need to tweak all drivers every time. Signed-off-by: Michael S.
Tsirkin --- drivers/block/virtio_blk.c | 3 +-- drivers/char/virtio_console.c | 6 +++--- drivers/crypto/virtio/virtio_crypto_core.c | 3 +-- drivers/gpu/drm/virtio/virtgpu_kms.c | 3 +-- drivers/net/caif/caif_virtio.c | 3 +-- drivers/net/virtio_net.c | 3 +-- drivers/rpmsg/virtio_rpmsg_bus.c | 2 +- drivers/scsi/virtio_scsi.c | 3 +-- drivers/virtio/virtio_balloon.c | 3 +-- drivers/virtio/virtio_input.c | 3 +-- include/linux/virtio_config.h | 9 +++++++++ net/vmw_vsock/virtio_transport.c | 6 +++--- 12 files changed, 24 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 1d4c9f8bc1e1..c08c30c35035 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -455,8 +455,7 @@ static int init_vq(struct virtio_blk *vblk) } /* Discover virtqueues and write information to configuration. */ - err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names, - &desc); + err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc); if (err) goto out; diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 87fe111d0be6..d0699c5fec43 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -1945,9 +1945,9 @@ static int init_vqs(struct ports_device *portdev) } } /* Find the queues. */ - err = portdev->vdev->config->find_vqs(portdev->vdev, nr_queues, vqs, - io_callbacks, - (const char **)io_names, NULL); + err = virtio_find_vqs(portdev->vdev, nr_queues, vqs, + io_callbacks, + (const char **)io_names, NULL); if (err) goto free; diff --git a/drivers/crypto/virtio/virtio_crypto_core.c b/drivers/crypto/virtio/virtio_crypto_core.c index 21472e427f6f..a111cd72797b 100644 --- a/drivers/crypto/virtio/virtio_crypto_core.c +++ b/drivers/crypto/virtio/virtio_crypto_core.c @@ -119,8 +119,7 @@ static int virtcrypto_find_vqs(struct virtio_crypto *vi) names[i] = vi->data_vq[i].name; } - ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks, - names, NULL); + ret = virtio_find_vqs(vi->vdev, total_vqs, vqs, callbacks, names, NULL); if (ret) goto err_find; diff --git a/drivers/gpu/drm/virtio/virtgpu_kms.c b/drivers/gpu/drm/virtio/virtgpu_kms.c index 491866865c33..1e1c90b30d4a 100644 --- a/drivers/gpu/drm/virtio/virtgpu_kms.c +++ b/drivers/gpu/drm/virtio/virtgpu_kms.c @@ -175,8 +175,7 @@ int virtio_gpu_driver_load(struct drm_device *dev, unsigned long flags) DRM_INFO("virgl 3d acceleration not supported by guest\n"); #endif - ret = vgdev->vdev->config->find_vqs(vgdev->vdev, 2, vqs, - callbacks, names, NULL); + ret = virtio_find_vqs(vgdev->vdev, 2, vqs, callbacks, names, NULL); if (ret) { DRM_ERROR("failed to find virt queues\n"); goto err_vqs; diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c index bc0eb47eccee..6122768c8644 100644 --- a/drivers/net/caif/caif_virtio.c +++ b/drivers/net/caif/caif_virtio.c @@ -679,8 +679,7 @@ static int cfv_probe(struct virtio_device *vdev) goto err; /* Get the TX virtio ring. This is a "guest side vring". 
*/ - err = vdev->config->find_vqs(vdev, 1, &cfv->vq_tx, &vq_cbs, &names, - NULL); + err = virtio_find_vqs(vdev, 1, &cfv->vq_tx, &vq_cbs, &names, NULL); if (err) goto err; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index f36584616e7d..71f447ab440e 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2079,8 +2079,7 @@ static int virtnet_find_vqs(struct virtnet_info *vi) names[txq2vq(i)] = vi->sq[i].name; } - ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks, - names, NULL); + ret = virtio_find_vqs(vi->vdev, total_vqs, vqs, callbacks, names, NULL); if (ret) goto err_find; diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c index 5e66e081027e..f7cade09d38a 100644 --- a/drivers/rpmsg/virtio_rpmsg_bus.c +++ b/drivers/rpmsg/virtio_rpmsg_bus.c @@ -869,7 +869,7 @@ static int rpmsg_probe(struct virtio_device *vdev) init_waitqueue_head(&vrp->sendq); /* We expect two virtqueues, rx and tx (and in this order) */ - err = vdev->config->find_vqs(vdev, 2, vqs, vq_cbs, names, NULL); + err = virtio_find_vqs(vdev, 2, vqs, vq_cbs, names, NULL); if (err) goto free_vrp; diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 939c47df73fa..e9222dcb9707 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -870,8 +870,7 @@ static int virtscsi_init(struct virtio_device *vdev, } /* Discover virtqueues and write information to configuration. */ - err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names, - &desc); + err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc); if (err) goto out; diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 34adf9b9c053..408c174ef0d5 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -418,8 +418,7 @@ static int init_vqs(struct virtio_balloon *vb) * optionally stat. */ nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 
3 : 2; - err = vb->vdev->config->find_vqs(vb->vdev, nvqs, vqs, callbacks, names, - NULL); + err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL); if (err) return err; diff --git a/drivers/virtio/virtio_input.c b/drivers/virtio/virtio_input.c index 79f1293cda93..3a0468f2ceb0 100644 --- a/drivers/virtio/virtio_input.c +++ b/drivers/virtio/virtio_input.c @@ -173,8 +173,7 @@ static int virtinput_init_vqs(struct virtio_input *vi) static const char * const names[] = { "events", "status" }; int err; - err = vi->vdev->config->find_vqs(vi->vdev, 2, vqs, cbs, names, - NULL); + err = virtio_find_vqs(vi->vdev, 2, vqs, cbs, names, NULL); if (err) return err; vi->evt = vqs[0]; diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 8355bab175e1..47f3d805c290 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -179,6 +179,15 @@ struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev, return vq; } +static inline +int virtio_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], vq_callback_t *callbacks[], + const char * const names[], + struct irq_affinity *desc) +{ + return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, desc); +} + /** * virtio_device_ready - enable vq use in probe function * @vdev: the device diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 68675a151f22..97e26e2955e1 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -573,9 +573,9 @@ static int virtio_vsock_probe(struct virtio_device *vdev) vsock->vdev = vdev; - ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX, - vsock->vqs, callbacks, names, - NULL); + ret = virtio_find_vqs(vsock->vdev, VSOCK_VQ_MAX, + vsock->vqs, callbacks, names, + NULL); if (ret < 0) goto out; -- cgit v1.2.3 From f94682dde5ed23eed13533a37dfce942e60ade4e Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 6 Mar 2017 18:32:29 +0200 Subject: virtio: add context flag to find vqs Allows maintaining extra context per vq. For ease of use, passing in NULL is legal and disables the feature for all vqs. Includes fixes by Christian for s390, acked by Cornelia. Signed-off-by: Christian Borntraeger Acked-by: Cornelia Huck Signed-off-by: Michael S. 
Tsirkin --- drivers/misc/mic/vop/vop_main.c | 9 ++++++--- drivers/remoteproc/remoteproc_virtio.c | 10 ++++++---- drivers/s390/virtio/kvm_virtio.c | 8 +++++--- drivers/s390/virtio/virtio_ccw.c | 7 ++++--- drivers/virtio/virtio_mmio.c | 8 +++++--- drivers/virtio/virtio_pci_common.c | 17 +++++++++++------ drivers/virtio/virtio_pci_common.h | 4 +++- drivers/virtio/virtio_pci_legacy.c | 4 +++- drivers/virtio/virtio_pci_modern.c | 12 ++++++++---- drivers/virtio/virtio_ring.c | 7 +++++-- include/linux/virtio_config.h | 18 +++++++++++++++--- include/linux/virtio_ring.h | 3 +++ 12 files changed, 74 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/mic/vop/vop_main.c b/drivers/misc/mic/vop/vop_main.c index c2e29d7f0de8..a341938c7e2c 100644 --- a/drivers/misc/mic/vop/vop_main.c +++ b/drivers/misc/mic/vop/vop_main.c @@ -278,7 +278,7 @@ static void vop_del_vqs(struct virtio_device *dev) static struct virtqueue *vop_find_vq(struct virtio_device *dev, unsigned index, void (*callback)(struct virtqueue *vq), - const char *name) + const char *name, bool ctx) { struct _vop_vdev *vdev = to_vopvdev(dev); struct vop_device *vpdev = vdev->vpdev; @@ -314,6 +314,7 @@ static struct virtqueue *vop_find_vq(struct virtio_device *dev, le16_to_cpu(config.num), MIC_VIRTIO_RING_ALIGN, dev, false, + ctx, (void __force *)va, vop_notify, callback, name); if (!vq) { err = -ENOMEM; @@ -374,7 +375,8 @@ unmap: static int vop_find_vqs(struct virtio_device *dev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], struct irq_affinity *desc) + const char * const names[], const bool *ctx, + struct irq_affinity *desc) { struct _vop_vdev *vdev = to_vopvdev(dev); struct vop_device *vpdev = vdev->vpdev; @@ -388,7 +390,8 @@ static int vop_find_vqs(struct virtio_device *dev, unsigned nvqs, for (i = 0; i < nvqs; ++i) { dev_dbg(_vop_dev(vdev), "%s: %d: %s\n", __func__, i, names[i]); - vqs[i] = vop_find_vq(dev, i, callbacks[i], names[i]); + vqs[i] = vop_find_vq(dev, i, callbacks[i], names[i], + ctx ? ctx[i] : false); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto error; diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c index 0142cc3f0c91..294634836b32 100644 --- a/drivers/remoteproc/remoteproc_virtio.c +++ b/drivers/remoteproc/remoteproc_virtio.c @@ -71,7 +71,7 @@ EXPORT_SYMBOL(rproc_vq_interrupt); static struct virtqueue *rp_find_vq(struct virtio_device *vdev, unsigned int id, void (*callback)(struct virtqueue *vq), - const char *name) + const char *name, bool ctx) { struct rproc_vdev *rvdev = vdev_to_rvdev(vdev); struct rproc *rproc = vdev_to_rproc(vdev); @@ -103,8 +103,8 @@ static struct virtqueue *rp_find_vq(struct virtio_device *vdev, * Create the new vq, and tell virtio we're not interested in * the 'weak' smp barriers, since we're talking with a real device. 
*/ - vq = vring_new_virtqueue(id, len, rvring->align, vdev, false, addr, - rproc_virtio_notify, callback, name); + vq = vring_new_virtqueue(id, len, rvring->align, vdev, false, ctx, + addr, rproc_virtio_notify, callback, name); if (!vq) { dev_err(dev, "vring_new_virtqueue %s failed\n", name); rproc_free_vring(rvring); @@ -138,12 +138,14 @@ static int rproc_virtio_find_vqs(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char * const names[], + const bool * ctx, struct irq_affinity *desc) { int i, ret; for (i = 0; i < nvqs; ++i) { - vqs[i] = rp_find_vq(vdev, i, callbacks[i], names[i]); + vqs[i] = rp_find_vq(vdev, i, callbacks[i], names[i], + ctx ? ctx[i] : false); if (IS_ERR(vqs[i])) { ret = PTR_ERR(vqs[i]); goto error; diff --git a/drivers/s390/virtio/kvm_virtio.c b/drivers/s390/virtio/kvm_virtio.c index 2ce0b3eb2efe..a99d09a11f05 100644 --- a/drivers/s390/virtio/kvm_virtio.c +++ b/drivers/s390/virtio/kvm_virtio.c @@ -189,7 +189,7 @@ static bool kvm_notify(struct virtqueue *vq) static struct virtqueue *kvm_find_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), - const char *name) + const char *name, bool ctx) { struct kvm_device *kdev = to_kvmdev(vdev); struct kvm_vqconfig *config; @@ -211,7 +211,7 @@ static struct virtqueue *kvm_find_vq(struct virtio_device *vdev, goto out; vq = vring_new_virtqueue(index, config->num, KVM_S390_VIRTIO_RING_ALIGN, - vdev, true, (void *) config->address, + vdev, true, ctx, (void *) config->address, kvm_notify, callback, name); if (!vq) { err = -ENOMEM; @@ -256,6 +256,7 @@ static int kvm_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char * const names[], + const bool *ctx, struct irq_affinity *desc) { struct kvm_device *kdev = to_kvmdev(vdev); @@ -266,7 +267,8 @@ static int kvm_find_vqs(struct virtio_device *vdev, unsigned nvqs, return -ENOENT; for (i = 0; i < nvqs; ++i) { - vqs[i] = kvm_find_vq(vdev, i, callbacks[i], names[i]); + vqs[i] = kvm_find_vq(vdev, i, callbacks[i], names[i], + ctx ? ctx[i] : false); if (IS_ERR(vqs[i])) goto error; } diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c index 0ed209f3d8b0..2a76ea78a0bf 100644 --- a/drivers/s390/virtio/virtio_ccw.c +++ b/drivers/s390/virtio/virtio_ccw.c @@ -484,7 +484,7 @@ static void virtio_ccw_del_vqs(struct virtio_device *vdev) static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev, int i, vq_callback_t *callback, - const char *name, + const char *name, bool ctx, struct ccw1 *ccw) { struct virtio_ccw_device *vcdev = to_vc_device(vdev); @@ -522,7 +522,7 @@ static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev, } vq = vring_new_virtqueue(i, info->num, KVM_VIRTIO_CCW_RING_ALIGN, vdev, - true, info->queue, virtio_ccw_kvm_notify, + true, ctx, info->queue, virtio_ccw_kvm_notify, callback, name); if (!vq) { /* For now, we fail if we can't get the requested size. */ @@ -629,6 +629,7 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char * const names[], + const bool *ctx, struct irq_affinity *desc) { struct virtio_ccw_device *vcdev = to_vc_device(vdev); @@ -642,7 +643,7 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs, for (i = 0; i < nvqs; ++i) { vqs[i] = virtio_ccw_setup_vq(vdev, i, callbacks[i], names[i], - ccw); + ctx ? 
ctx[i] : false, ccw); if (IS_ERR(vqs[i])) { ret = PTR_ERR(vqs[i]); vqs[i] = NULL; diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index 78343b8f9034..74dc7170fd35 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -351,7 +351,7 @@ static void vm_del_vqs(struct virtio_device *vdev) static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), - const char *name) + const char *name, bool ctx) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); struct virtio_mmio_vq_info *info; @@ -388,7 +388,7 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, /* Create the vring */ vq = vring_create_virtqueue(index, num, VIRTIO_MMIO_VRING_ALIGN, vdev, - true, true, vm_notify, callback, name); + true, true, ctx, vm_notify, callback, name); if (!vq) { err = -ENOMEM; goto error_new_virtqueue; @@ -447,6 +447,7 @@ static int vm_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char * const names[], + const bool *ctx, struct irq_affinity *desc) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); @@ -459,7 +460,8 @@ static int vm_find_vqs(struct virtio_device *vdev, unsigned nvqs, return err; for (i = 0; i < nvqs; ++i) { - vqs[i] = vm_setup_vq(vdev, i, callbacks[i], names[i]); + vqs[i] = vm_setup_vq(vdev, i, callbacks[i], names[i], + ctx ? ctx[i] : false); if (IS_ERR(vqs[i])) { vm_del_vqs(vdev); return PTR_ERR(vqs[i]); diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 698d5d06fa03..007a4f366086 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -172,6 +172,7 @@ error: static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), const char *name, + bool ctx, u16 msix_vec) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -183,7 +184,7 @@ static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned index, if (!info) return ERR_PTR(-ENOMEM); - vq = vp_dev->setup_vq(vp_dev, info, index, callback, name, + vq = vp_dev->setup_vq(vp_dev, info, index, callback, name, ctx, msix_vec); if (IS_ERR(vq)) goto out_info; @@ -274,6 +275,7 @@ void vp_del_vqs(struct virtio_device *vdev) static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char * const names[], bool per_vq_vectors, + const bool *ctx, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -315,6 +317,7 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned nvqs, else msix_vec = VP_MSIX_VQ_VECTOR; vqs[i] = vp_setup_vq(vdev, i, callbacks[i], names[i], + ctx ? ctx[i] : false, msix_vec); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); @@ -345,7 +348,7 @@ error_find: static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[]) + const char * const names[], const bool *ctx) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); int i, err; @@ -367,6 +370,7 @@ static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned nvqs, continue; } vqs[i] = vp_setup_vq(vdev, i, callbacks[i], names[i], + ctx ? 
ctx[i] : false, VIRTIO_MSI_NO_VECTOR); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); @@ -383,20 +387,21 @@ out_del_vqs: /* the config->find_vqs() implementation */ int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], struct irq_affinity *desc) + const char * const names[], const bool *ctx, + struct irq_affinity *desc) { int err; /* Try MSI-X with one vector per queue. */ - err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, true, desc); + err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, true, ctx, desc); if (!err) return 0; /* Fallback: MSI-X with one vector for config, one shared for queues. */ - err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, false, desc); + err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, false, ctx, desc); if (!err) return 0; /* Finally fall back to regular interrupts. */ - return vp_find_vqs_intx(vdev, nvqs, vqs, callbacks, names); + return vp_find_vqs_intx(vdev, nvqs, vqs, callbacks, names, ctx); } const char *vp_bus_name(struct virtio_device *vdev) diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index e96334aec1e0..135ee3cf7175 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -102,6 +102,7 @@ struct virtio_pci_device { unsigned idx, void (*callback)(struct virtqueue *vq), const char *name, + bool ctx, u16 msix_vec); void (*del_vq)(struct virtio_pci_vq_info *info); @@ -131,7 +132,8 @@ void vp_del_vqs(struct virtio_device *vdev); /* the config->find_vqs() implementation */ int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], struct irq_affinity *desc); + const char * const names[], const bool *ctx, + struct irq_affinity *desc); const char *vp_bus_name(struct virtio_device *vdev); /* Setup the affinity for a virtqueue: diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c index 4bfa48fb1324..2780886e8ba3 100644 --- a/drivers/virtio/virtio_pci_legacy.c +++ b/drivers/virtio/virtio_pci_legacy.c @@ -116,6 +116,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, unsigned index, void (*callback)(struct virtqueue *vq), const char *name, + bool ctx, u16 msix_vec) { struct virtqueue *vq; @@ -135,7 +136,8 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, /* create the vring */ vq = vring_create_virtqueue(index, num, VIRTIO_PCI_VRING_ALIGN, &vp_dev->vdev, - true, false, vp_notify, callback, name); + true, false, ctx, + vp_notify, callback, name); if (!vq) return ERR_PTR(-ENOMEM); diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 8978f109d2d7..2555d80f6eec 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -297,6 +297,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, unsigned index, void (*callback)(struct virtqueue *vq), const char *name, + bool ctx, u16 msix_vec) { struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; @@ -328,7 +329,8 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, /* create the vring */ vq = vring_create_virtqueue(index, num, SMP_CACHE_BYTES, &vp_dev->vdev, - true, true, vp_notify, callback, name); + true, true, ctx, + vp_notify, callback, name); if (!vq) return ERR_PTR(-ENOMEM); @@ -387,12 +389,14 @@ err_map_notify: } static int vp_modern_find_vqs(struct virtio_device 
*vdev, unsigned nvqs, - struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], struct irq_affinity *desc) + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char * const names[], const bool *ctx, + struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue *vq; - int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, desc); + int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, desc); if (rc) return rc; diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 409aeaa49246..b23b5fae468b 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -916,6 +916,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, struct vring vring, struct virtio_device *vdev, bool weak_barriers, + bool context, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name) @@ -1019,6 +1020,7 @@ struct virtqueue *vring_create_virtqueue( struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, + bool context, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name) @@ -1058,7 +1060,7 @@ struct virtqueue *vring_create_virtqueue( queue_size_in_bytes = vring_size(num, vring_align); vring_init(&vring, num, queue, vring_align); - vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, + vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context, notify, callback, name); if (!vq) { vring_free_queue(vdev, queue_size_in_bytes, queue, @@ -1079,6 +1081,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, + bool context, void *pages, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), @@ -1086,7 +1089,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, { struct vring vring; vring_init(&vring, num, pages, vring_align); - return __vring_new_virtqueue(index, vring, vdev, weak_barriers, + return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context, notify, callback, name); } EXPORT_SYMBOL_GPL(vring_new_virtqueue); diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 47f3d805c290..0133d8a12ccd 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -72,7 +72,8 @@ struct virtio_config_ops { void (*reset)(struct virtio_device *vdev); int (*find_vqs)(struct virtio_device *, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], struct irq_affinity *desc); + const char * const names[], const bool *ctx, + struct irq_affinity *desc); void (*del_vqs)(struct virtio_device *); u64 (*get_features)(struct virtio_device *vdev); int (*finalize_features)(struct virtio_device *vdev); @@ -173,7 +174,8 @@ struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev, vq_callback_t *callbacks[] = { c }; const char *names[] = { n }; struct virtqueue *vq; - int err = vdev->config->find_vqs(vdev, 1, &vq, callbacks, names, NULL); + int err = vdev->config->find_vqs(vdev, 1, &vq, callbacks, names, NULL, + NULL); if (err < 0) return ERR_PTR(err); return vq; @@ -185,7 +187,17 @@ int virtio_find_vqs(struct virtio_device *vdev, unsigned nvqs, const char * const names[], struct irq_affinity *desc) { - return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, desc); + return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL, desc); +} + +static inline +int 
virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], vq_callback_t *callbacks[], + const char * const names[], const bool *ctx, + struct irq_affinity *desc) +{ + return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, + desc); } /** diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index e8d36938f09a..270cfa81830e 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -71,6 +71,7 @@ struct virtqueue *vring_create_virtqueue(unsigned int index, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, + bool ctx, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name); @@ -80,6 +81,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, struct vring vring, struct virtio_device *vdev, bool weak_barriers, + bool ctx, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name); @@ -93,6 +95,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, + bool ctx, void *pages, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), -- cgit v1.2.3 From 5a08b04f637921e44ba767c07c74b0535504ab71 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 7 Feb 2017 06:15:13 +0200 Subject: virtio: allow extra context per descriptor Allow extra context per descriptor. To avoid slow down for data path, this disables use of indirect descriptors for this vq. Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_ring.c | 70 ++++++++++++++++++++++++++++++++++++-------- include/linux/virtio.h | 9 ++++++ 2 files changed, 66 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index b23b5fae468b..5e1b548828e6 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -263,6 +263,7 @@ static inline int virtqueue_add(struct virtqueue *_vq, unsigned int out_sgs, unsigned int in_sgs, void *data, + void *ctx, gfp_t gfp) { struct vring_virtqueue *vq = to_vvq(_vq); @@ -275,6 +276,7 @@ static inline int virtqueue_add(struct virtqueue *_vq, START_USE(vq); BUG_ON(data == NULL); + BUG_ON(ctx && vq->indirect); if (unlikely(vq->broken)) { END_USE(vq); @@ -389,6 +391,8 @@ static inline int virtqueue_add(struct virtqueue *_vq, vq->desc_state[head].data = data; if (indirect) vq->desc_state[head].indir_desc = desc; + if (ctx) + vq->desc_state[head].indir_desc = ctx; /* Put entry in available array (but don't update avail->idx until they * do sync). 
*/ @@ -461,7 +465,8 @@ int virtqueue_add_sgs(struct virtqueue *_vq, for (sg = sgs[i]; sg; sg = sg_next(sg)) total_sg++; } - return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, data, gfp); + return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, + data, NULL, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_sgs); @@ -483,7 +488,7 @@ int virtqueue_add_outbuf(struct virtqueue *vq, void *data, gfp_t gfp) { - return virtqueue_add(vq, &sg, num, 1, 0, data, gfp); + return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_outbuf); @@ -505,10 +510,34 @@ int virtqueue_add_inbuf(struct virtqueue *vq, void *data, gfp_t gfp) { - return virtqueue_add(vq, &sg, num, 0, 1, data, gfp); + return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_inbuf); +/** + * virtqueue_add_inbuf_ctx - expose input buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sg: scatterlist (must be well-formed and terminated!) + * @num: the number of entries in @sg writable by other side + * @data: the token identifying the buffer. + * @ctx: extra context for the token + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). + */ +int virtqueue_add_inbuf_ctx(struct virtqueue *vq, + struct scatterlist *sg, unsigned int num, + void *data, + void *ctx, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx); + /** * virtqueue_kick_prepare - first half of split virtqueue_kick call. * @vq: the struct virtqueue @@ -598,7 +627,8 @@ bool virtqueue_kick(struct virtqueue *vq) } EXPORT_SYMBOL_GPL(virtqueue_kick); -static void detach_buf(struct vring_virtqueue *vq, unsigned int head) +static void detach_buf(struct vring_virtqueue *vq, unsigned int head, + void **ctx) { unsigned int i, j; __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT); @@ -622,10 +652,15 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head) /* Plus final descriptor */ vq->vq.num_free++; - /* Free the indirect table, if any, now that it's unmapped. */ - if (vq->desc_state[head].indir_desc) { + if (vq->indirect) { struct vring_desc *indir_desc = vq->desc_state[head].indir_desc; - u32 len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len); + u32 len; + + /* Free the indirect table, if any, now that it's unmapped. */ + if (!indir_desc) + return; + + len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len); BUG_ON(!(vq->vring.desc[head].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT))); @@ -634,8 +669,10 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head) for (j = 0; j < len / sizeof(struct vring_desc); j++) vring_unmap_one(vq, &indir_desc[j]); - kfree(vq->desc_state[head].indir_desc); + kfree(indir_desc); vq->desc_state[head].indir_desc = NULL; + } else if (ctx) { + *ctx = vq->desc_state[head].indir_desc; } } @@ -660,7 +697,8 @@ static inline bool more_used(const struct vring_virtqueue *vq) * Returns NULL if there are no used buffers, or the "data" token * handed to virtqueue_add_*(). 
*/ -void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) +void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len, + void **ctx) { struct vring_virtqueue *vq = to_vvq(_vq); void *ret; @@ -698,7 +736,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) /* detach_buf clears data, so grab it now. */ ret = vq->desc_state[i].data; - detach_buf(vq, i); + detach_buf(vq, i, ctx); vq->last_used_idx++; /* If we expect an interrupt for the next entry, tell host * by writing event index and flush out the write before @@ -715,8 +753,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) END_USE(vq); return ret; } -EXPORT_SYMBOL_GPL(virtqueue_get_buf); +EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx); +void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) +{ + return virtqueue_get_buf_ctx(_vq, len, NULL); +} +EXPORT_SYMBOL_GPL(virtqueue_get_buf); /** * virtqueue_disable_cb - disable callbacks * @vq: the struct virtqueue we're talking about. @@ -878,7 +921,7 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq) continue; /* detach_buf clears data, so grab it now. */ buf = vq->desc_state[i].data; - detach_buf(vq, i); + detach_buf(vq, i, NULL); vq->avail_idx_shadow--; vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow); END_USE(vq); @@ -951,7 +994,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, vq->last_add_time_valid = false; #endif - vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC); + vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && + !context; vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX); /* No callback? Tell other side not to bother us. */ diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 7edfbdb55a99..ed04753278d4 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -44,6 +44,12 @@ int virtqueue_add_inbuf(struct virtqueue *vq, void *data, gfp_t gfp); +int virtqueue_add_inbuf_ctx(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + void *ctx, + gfp_t gfp); + int virtqueue_add_sgs(struct virtqueue *vq, struct scatterlist *sgs[], unsigned int out_sgs, @@ -59,6 +65,9 @@ bool virtqueue_notify(struct virtqueue *vq); void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); +void *virtqueue_get_buf_ctx(struct virtqueue *vq, unsigned int *len, + void **ctx); + void virtqueue_disable_cb(struct virtqueue *vq); bool virtqueue_enable_cb(struct virtqueue *vq); -- cgit v1.2.3 From b9dd46188edc2f0d1f37328637860bb65a771124 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Tue, 25 Apr 2017 16:28:48 -0700 Subject: f2fs: sanity check segment count F2FS uses 4 bytes to represent block address. As a result, supported size of disk is 16 TB and it equals to 16 * 1024 * 1024 / 2 segments. 
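For readers checking the arithmetic: with the default 4KB block size, a 32-bit block address covers 2^32 * 4KB = 16TB, and at the default 2MB segment size that is (16 * 1024 * 1024) MB / 2MB = 8,388,608 segments. A quick user-space sanity check of that derivation (illustrative only, not part of the patch; the 4KB block and 2MB segment sizes are the f2fs defaults assumed here):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t blocks = 1ULL << 32;          /* 4-byte block address */
		uint64_t disk   = blocks * 4096;       /* 4KB blocks -> 16TB */
		uint64_t segsz  = 2ULL << 20;          /* 2MB per segment */

		/* prints 8388608 == (16 * 1024 * 1024) / 2 == F2FS_MAX_SEGMENT */
		printf("%llu\n", (unsigned long long)(disk / segsz));
		return 0;
	}
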
Signed-off-by: Jin Qian Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++++ include/linux/f2fs_fs.h | 6 ++++++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 97c07a5153e9..4cd3bee6775f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1494,6 +1494,13 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } + if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) { + f2fs_msg(sb, KERN_INFO, + "Invalid segment count (%u)", + le32_to_cpu(raw_super->segment_count)); + return 1; + } + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ if (sanity_check_area_boundary(sbi, bh)) return 1; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 639cbdf65e2b..093549e10ee2 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -301,6 +301,12 @@ struct f2fs_nat_block { #define SIT_VBLOCK_MAP_SIZE 64 #define SIT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_sit_entry)) +/* + * F2FS uses 4 bytes to represent block address. As a result, supported size of + * disk is 16 TB and it equals to 16 * 1024 * 1024 / 2 segments. + */ +#define F2FS_MAX_SEGMENT ((16 * 1024 * 1024) / 2) + /* * Note that f2fs_sit_entry->vblocks has the following bit-field information. * [15:10] : allocation type such as CURSEG_XXXX_TYPE -- cgit v1.2.3 From 4d463c4dbc5c1c5d73e488d52faeec05570443a0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 3 May 2017 00:39:17 +0200 Subject: xdp: use common helper for netlink extended ack reporting Small follow-up to d74a32acd59a ("xdp: use netlink extended ACK reporting") in order to let drivers all use the same NL_SET_ERR_MSG_MOD() helper macro for reporting. This also ensures that we consistently add the driver's prefix for dumping the report in user space to indicate that the error message is driver specific and not coming from core code. Furthermore, NL_SET_ERR_MSG_MOD() now reuses NL_SET_ERR_MSG() and thus makes all macros check the pointer as suggested. References: https://www.spinics.net/lists/netdev/msg433267.html Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Reviewed-by: Johannes Berg Signed-off-by: David S. 
Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 4 ++-- drivers/net/virtio_net.c | 8 ++++---- include/linux/netlink.h | 19 ++++++++----------- 3 files changed, 14 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index db20376260f5..82bd6b0935f1 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2532,11 +2532,11 @@ nfp_net_check_config(struct nfp_net *nn, struct nfp_net_dp *dp, if (!dp->xdp_prog) return 0; if (dp->fl_bufsz > PAGE_SIZE) { - NL_MOD_TRY_SET_ERR_MSG(extack, "MTU too large w/ XDP enabled"); + NL_SET_ERR_MSG_MOD(extack, "MTU too large w/ XDP enabled"); return -EINVAL; } if (dp->num_tx_rings > nn->max_tx_rings) { - NL_MOD_TRY_SET_ERR_MSG(extack, "Insufficient number of TX rings w/ XDP enabled"); + NL_SET_ERR_MSG_MOD(extack, "Insufficient number of TX rings w/ XDP enabled"); return -EINVAL; } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 3d0bc484b3d7..1c6d3923c224 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1891,17 +1891,17 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO)) { - NL_SET_ERR_MSG(extack, "can't set XDP while host is implementing LRO, disable LRO first"); + NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO, disable LRO first"); return -EOPNOTSUPP; } if (vi->mergeable_rx_bufs && !vi->any_header_sg) { - NL_SET_ERR_MSG(extack, "XDP expects header/data in single page, any_header_sg required"); + NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required"); return -EINVAL; } if (dev->mtu > max_sz) { - NL_SET_ERR_MSG(extack, "MTU too large to enable XDP"); + NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP"); netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz); return -EINVAL; } @@ -1912,7 +1912,7 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, /* XDP requires extra queues for XDP_TX */ if (curr_qp + xdp_qp > vi->max_queue_pairs) { - NL_SET_ERR_MSG(extack, "Too few free TX rings available"); + NL_SET_ERR_MSG_MOD(extack, "Too few free TX rings available"); netdev_warn(dev, "request %i queues but max is %i\n", curr_qp + xdp_qp, vi->max_queue_pairs); return -ENOMEM; diff --git a/include/linux/netlink.h b/include/linux/netlink.h index c20395edf2de..5fff5ba5964e 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -86,19 +86,16 @@ struct netlink_ext_ack { * Currently string formatting is not supported (due * to the lack of an output buffer.) 
*/ -#define NL_SET_ERR_MSG(extack, msg) do { \ - static const char _msg[] = (msg); \ - \ - (extack)->_msg = _msg; \ +#define NL_SET_ERR_MSG(extack, msg) do { \ + static const char __msg[] = (msg); \ + struct netlink_ext_ack *__extack = (extack); \ + \ + if (__extack) \ + __extack->_msg = __msg; \ } while (0) -#define NL_MOD_TRY_SET_ERR_MSG(extack, msg) do { \ - static const char _msg[] = KBUILD_MODNAME ": " msg; \ - struct netlink_ext_ack *_extack = (extack); \ - \ - if (_extack) \ - _extack->_msg = _msg; \ -} while (0) +#define NL_SET_ERR_MSG_MOD(extack, msg) \ + NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg) extern void netlink_kernel_release(struct sock *sk); extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups); -- cgit v1.2.3 From 4e335d9e7ddbcf83d03e7fbe65797ebed2272c18 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 2 May 2017 16:20:18 +0200 Subject: Revert "KVM: Support vCPU-based gfn->hva cache" This reverts commit bbd6411513aa8ef3ea02abab61318daf87c1af1e. I've been sitting on this revert for too long and it unfortunately missed 4.11. It's also the reason why I haven't merged ring-based dirty tracking for 4.12. Using kvm_vcpu_memslots in kvm_gfn_to_hva_cache_init and kvm_vcpu_write_guest_offset_cached means that the MSR value can now be used to access SMRAM, simply by making it point to an SMRAM physical address. This is problematic because it lets the guest OS overwrite memory that it shouldn't be able to touch. Cc: stable@vger.kernel.org Fixes: bbd6411513aa8ef3ea02abab61318daf87c1af1e Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 22 ++++++++++++---------- arch/x86/kvm/x86.c | 41 +++++++++++++++++++++-------------------- include/linux/kvm_host.h | 16 ++++++++-------- virt/kvm/kvm_main.c | 34 +++++++++++++++++----------------- 4 files changed, 58 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index bad6a25067bc..9fa5b8164961 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -529,14 +529,16 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) { - return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, &val, - sizeof(val)); + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, + sizeof(val)); } static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) { - return kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, val, - sizeof(*val)); + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, + sizeof(*val)); } static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) @@ -2285,8 +2287,8 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - if (kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32))) + if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32))) return; apic_set_tpr(vcpu->arch.apic, data & 0xff); @@ -2338,14 +2340,14 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) max_isr = 0; data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32)); + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32)); } int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) { if (vapic_addr) { - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, + if 
(kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apic->vapic_cache, vapic_addr, sizeof(u32))) return -EINVAL; @@ -2439,7 +2441,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) vcpu->arch.pv_eoi.msr_val = data; if (!pv_eoi_enabled(vcpu)) return 0; - return kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_eoi.data, + return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, addr, sizeof(u8)); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2fe9aa116288..b38a302858a0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1785,7 +1785,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info guest_hv_clock; - if (unlikely(kvm_vcpu_read_guest_cached(v, &vcpu->pv_time, + if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, &guest_hv_clock, sizeof(guest_hv_clock)))) return; @@ -1806,9 +1806,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); smp_wmb(); @@ -1822,16 +1822,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); smp_wmb(); vcpu->hv_clock.version++; - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); } static int kvm_guest_time_update(struct kvm_vcpu *v) @@ -2064,7 +2064,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 0; } - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apf.data, gpa, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, sizeof(u32))) return 1; @@ -2083,7 +2083,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - if (unlikely(kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.st.stime, + if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) return; @@ -2094,7 +2094,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.steal.version += 1; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); @@ -2103,14 +2103,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); vcpu->arch.st.steal.version += 1; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); } @@ -2215,7 +2215,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & 1)) break; - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, + if 
(kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) vcpu->arch.pv_time_enabled = false; @@ -2236,7 +2236,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & KVM_STEAL_RESERVED_MASK) return 1; - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.st.stime, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, data & KVM_STEAL_VALID_BITS, sizeof(struct kvm_steal_time))) return 1; @@ -2858,7 +2858,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) vcpu->arch.st.steal.preempted = 1; - kvm_vcpu_write_guest_offset_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal.preempted, offsetof(struct kvm_steal_time, preempted), sizeof(vcpu->arch.st.steal.preempted)); @@ -8527,8 +8527,9 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) { - return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apf.data, &val, - sizeof(val)); + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, + sizeof(val)); } void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 25cf258a1c9b..3727afdf614d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -650,18 +650,18 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); -int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len); +int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len); int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, int offset, int len); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len); -int kvm_vcpu_write_guest_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len); -int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc, - void *data, int offset, unsigned long len); -int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc, - gpa_t gpa, unsigned long len); +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len); +int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, int offset, unsigned long len); +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + gpa_t gpa, unsigned long len); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6281cc2446d5..4c4d3fe10654 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1975,18 +1975,18 @@ static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, return 0; } -int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len) { - struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); + struct kvm_memslots *slots = 
kvm_memslots(kvm); return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva_cache_init); +EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); -int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, - void *data, int offset, unsigned long len) +int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, int offset, unsigned long len) { - struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); + struct kvm_memslots *slots = kvm_memslots(kvm); int r; gpa_t gpa = ghc->gpa + offset; @@ -1996,7 +1996,7 @@ int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_ __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); if (unlikely(!ghc->memslot)) - return kvm_vcpu_write_guest(vcpu, gpa, data, len); + return kvm_write_guest(kvm, gpa, data, len); if (kvm_is_error_hva(ghc->hva)) return -EFAULT; @@ -2008,19 +2008,19 @@ int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_ return 0; } -EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_offset_cached); +EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); -int kvm_vcpu_write_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len) +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len) { - return kvm_vcpu_write_guest_offset_cached(vcpu, ghc, data, 0, len); + return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_cached); +EXPORT_SYMBOL_GPL(kvm_write_guest_cached); -int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len) +int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len) { - struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); + struct kvm_memslots *slots = kvm_memslots(kvm); int r; BUG_ON(len > ghc->len); @@ -2029,7 +2029,7 @@ int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *g __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); if (unlikely(!ghc->memslot)) - return kvm_vcpu_read_guest(vcpu, ghc->gpa, data, len); + return kvm_read_guest(kvm, ghc->gpa, data, len); if (kvm_is_error_hva(ghc->hva)) return -EFAULT; @@ -2040,7 +2040,7 @@ int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *g return 0; } -EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_cached); +EXPORT_SYMBOL_GPL(kvm_read_guest_cached); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) { -- cgit v1.2.3 From 1f43e2ad7bff54f7c82a084a57e5c90da0d3f4d9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 28 Apr 2017 13:56:08 +0800 Subject: f2fs: introduce CP_TRIMMED_FLAG to avoid unneeded discard Introduce CP_TRIMMED_FLAG to indicate all invalid block were trimmed before umount, so once we do mount with image which contain the flag, we don't record invalid blocks as undiscard one, when fstrim is being triggered, we can avoid issuing redundant discard commands. 
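In outline, the two halves of the protocol look like this (a sketch condensed from the hunks below, not the verbatim diff):

	/* umount (f2fs_put_super): everything already discarded? persist it. */
	if (!sbi->discard_blks) {
		struct cp_control cpc = {
			.reason = CP_UMOUNT | CP_TRIMMED,
		};
		write_checkpoint(sbi, &cpc);	/* records CP_TRIMMED_FLAG */
	}

	/* mount (build_sit_entries): trust the flag instead of rebuilding
	 * the undiscarded-block map from the valid-block bitmaps. */
	if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG))
		memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE);
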
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 3 +++ fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 28 ++++++++++++++++++++-------- fs/f2fs/super.c | 7 +++++++ include/linux/f2fs_fs.h | 1 + include/trace/events/f2fs.h | 4 +++- 6 files changed, 35 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 27578903eeb6..ea9c317b5916 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1059,6 +1059,9 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) disable_nat_bits(sbi, false); + if (cpc->reason & CP_TRIMMED) + __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); + if (cpc->reason & CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); else diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 37360b9ad263..f0481fb52142 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -130,6 +130,7 @@ enum { #define CP_SYNC 0x00000004 #define CP_RECOVERY 0x00000008 #define CP_DISCARD 0x00000010 +#define CP_TRIMMED 0x00000020 #define DEF_BATCHED_TRIM_SECTIONS 2048 #define BATCHED_TRIM_SEGMENTS(sbi) \ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 69b99a8f9a01..a32268eeb472 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3005,10 +3005,17 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) /* build discard map only one time */ if (f2fs_discard_en(sbi)) { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += sbi->blocks_per_seg - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) @@ -3032,10 +3039,15 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) seg_info_from_raw_sit(se, &sit); if (f2fs_discard_en(sbi)) { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4cd3bee6775f..9a14b2590337 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -797,6 +797,13 @@ static void f2fs_put_super(struct super_block *sb) /* be sure to wait for any on-going discard commands */ f2fs_wait_discard_bios(sbi); + if (!sbi->discard_blks) { + struct cp_control cpc = { + .reason = CP_UMOUNT | CP_TRIMMED, + }; + write_checkpoint(sbi, &cpc); + } + /* write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 093549e10ee2..b6feed6547ce 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -114,6 +114,7 @@ struct f2fs_super_block { /* * For checkpoint */ +#define CP_TRIMMED_FLAG 0x00000100 #define CP_NAT_BITS_FLAG 0x00000080 #define CP_CRC_RECOVERY_FLAG 0x00000040 #define CP_FASTBOOT_FLAG 0x00000020 diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 2e8f68f244d4..c78a223a0d9c 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -44,6 +44,7 @@ TRACE_DEFINE_ENUM(CP_FASTBOOT); TRACE_DEFINE_ENUM(CP_SYNC); 
TRACE_DEFINE_ENUM(CP_RECOVERY); TRACE_DEFINE_ENUM(CP_DISCARD); +TRACE_DEFINE_ENUM(CP_TRIMMED); #define show_block_type(type) \ __print_symbolic(type, \ @@ -125,7 +126,8 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { CP_FASTBOOT, "Fastboot" }, \ { CP_SYNC, "Sync" }, \ { CP_RECOVERY, "Recovery" }, \ - { CP_DISCARD, "Discard" }) + { CP_DISCARD, "Discard" }, \ + { CP_UMOUNT | CP_TRIMMED, "Umount,Trimmed" }) struct victim_sel_policy; struct f2fs_map_blocks; -- cgit v1.2.3 From c73322d098e4b6f5f0f0fa1330bf57e218775539 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 May 2017 14:51:51 -0700 Subject: mm: fix 100% CPU kswapd busyloop on unreclaimable nodes Patch series "mm: kswapd spinning on unreclaimable nodes - fixes and cleanups". Jia reported a scenario in which the kswapd of a node indefinitely spins at 100% CPU usage. We have seen similar cases at Facebook. The kernel's current method of judging its ability to reclaim a node (or whether to back off and sleep) is based on the amount of scanned pages in proportion to the amount of reclaimable pages. In Jia's and our scenarios, there are no reclaimable pages in the node, however, and the condition for backing off is never met. Kswapd busyloops in an attempt to restore the watermarks while having nothing to work with. This series reworks the definition of an unreclaimable node based not on scanning but on whether kswapd is able to actually reclaim pages in MAX_RECLAIM_RETRIES (16) consecutive runs. This is the same criteria the page allocator uses for giving up on direct reclaim and invoking the OOM killer. If it cannot free any pages, kswapd will go to sleep and leave further attempts to direct reclaim invocations, which will either make progress and re-enable kswapd, or invoke the OOM killer. Patch #1 fixes the immediate problem Jia reported, the remainder are smaller fixlets, cleanups, and overall phasing out of the old method. Patch #6 is the odd one out. It's a nice cleanup to get_scan_count(), and directly related to #5, but in itself not relevant to the series. If the whole series is too ambitious for 4.11, I would consider the first three patches fixes, the rest cleanups. This patch (of 9): Jia He reports a problem with kswapd spinning at 100% CPU when requesting more hugepages than memory available in the system: $ echo 4000 >/proc/sys/vm/nr_hugepages top - 13:42:59 up 3:37, 1 user, load average: 1.09, 1.03, 1.01 Tasks: 1 total, 1 running, 0 sleeping, 0 stopped, 0 zombie %Cpu(s): 0.0 us, 12.5 sy, 0.0 ni, 85.5 id, 2.0 wa, 0.0 hi, 0.0 si, 0.0 st KiB Mem: 31371520 total, 30915136 used, 456384 free, 320 buffers KiB Swap: 6284224 total, 115712 used, 6168512 free. 48192 cached Mem PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 76 root 20 0 0 0 0 R 100.0 0.000 217:17.29 kswapd3 At that time, there are no reclaimable pages left in the node, but as kswapd fails to restore the high watermarks it refuses to go to sleep. Kswapd needs to back away from nodes that fail to balance. Up until commit 1d82de618ddd ("mm, vmscan: make kswapd reclaim in terms of nodes") kswapd had such a mechanism. It considered zones whose theoretically reclaimable pages it had reclaimed six times over as unreclaimable and backed away from them. This guard was erroneously removed as the patch changed the definition of a balanced node. However, simply restoring this code wouldn't help in the case reported here: there *are* no reclaimable pages that could be scanned until the threshold is met. Kswapd would stay awake anyway. 
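For reference, the scan-based heuristic referred to above survives in the tree as pgdat_reclaimable(); its definition, quoted from its deletion later in this series, is:

	bool pgdat_reclaimable(struct pglist_data *pgdat)
	{
		return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) <
			pgdat_reclaimable_pages(pgdat) * 6;
	}

As the report above explains, with no reclaimable pages to scan the scan counter this test keys on never moves, so no guard of this form can help such a node.
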
Introduce a new and much simpler way of backing off. If kswapd runs through MAX_RECLAIM_RETRIES (16) cycles without reclaiming a single page, make it back off from the node. This is the same number of shots direct reclaim takes before declaring OOM. Kswapd will go to sleep on that node until a direct reclaimer manages to reclaim some pages, thus proving the node reclaimable again. [hannes@cmpxchg.org: check kswapd failure against the cumulative nr_reclaimed count] Link: http://lkml.kernel.org/r/20170306162410.GB2090@cmpxchg.org [shakeelb@google.com: fix condition for throttle_direct_reclaim] Link: http://lkml.kernel.org/r/20170314183228.20152-1-shakeelb@google.com Link: http://lkml.kernel.org/r/20170228214007.5621-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Shakeel Butt Reported-by: Jia He Tested-by: Jia He Acked-by: Michal Hocko Acked-by: Hillf Danton Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 ++ mm/internal.h | 6 ++++++ mm/page_alloc.c | 9 ++------- mm/vmscan.c | 47 ++++++++++++++++++++++++++++++++--------------- mm/vmstat.c | 2 +- 5 files changed, 43 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8e02b3750fe0..d2c50ab6ae40 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -630,6 +630,8 @@ typedef struct pglist_data { int kswapd_order; enum zone_type kswapd_classzone_idx; + int kswapd_failures; /* Number of 'reclaimed == 0' runs */ + #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_classzone_idx; diff --git a/mm/internal.h b/mm/internal.h index 266efaeaa370..e5a0e0ec2177 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -80,6 +80,12 @@ static inline void set_page_refcounted(struct page *page) extern unsigned long highest_memmap_pfn; +/* + * Maximum number of reclaim retries without progress before the OOM + * killer is consider the only way forward. + */ +#define MAX_RECLAIM_RETRIES 16 + /* * in mm/vmscan.c: */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bd01501efab9..42c0543e46c3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3521,12 +3521,6 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) return false; } -/* - * Maximum number of reclaim retries without any progress before OOM killer - * is consider as the only way to move forward. - */ -#define MAX_RECLAIM_RETRIES 16 - /* * Checks whether it makes sense to retry the reclaim to make a forward progress * for the given allocation request. @@ -4534,7 +4528,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), K(node_page_state(pgdat, NR_UNSTABLE_NFS)), node_page_state(pgdat, NR_PAGES_SCANNED), - !pgdat_reclaimable(pgdat) ? "yes" : "no"); + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? + "yes" : "no"); } for_each_populated_zone(zone) { diff --git a/mm/vmscan.c b/mm/vmscan.c index bc8031ef994d..667644e53b5c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2620,6 +2620,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + /* + * Kswapd gives up on balancing particular nodes after too + * many failures to reclaim anything from them and goes to + * sleep. On reclaim progress, reset the failure counter. A + * successful direct reclaim run will revive a dormant kswapd. 
+ */ + if (reclaimable) + pgdat->kswapd_failures = 0; + return reclaimable; } @@ -2694,10 +2703,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) GFP_KERNEL | __GFP_HARDWALL)) continue; - if (sc->priority != DEF_PRIORITY && - !pgdat_reclaimable(zone->zone_pgdat)) - continue; /* Let kswapd poll it */ - /* * If we already have plenty of memory free for * compaction in this zone, don't free any more. @@ -2817,7 +2822,7 @@ retry: return 0; } -static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) +static bool allow_direct_reclaim(pg_data_t *pgdat) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; @@ -2825,6 +2830,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) int i; bool wmark_ok; + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; if (!managed_zone(zone) || @@ -2905,7 +2913,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; - if (pfmemalloc_watermark_ok(pgdat)) + if (allow_direct_reclaim(pgdat)) goto out; break; } @@ -2927,14 +2935,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, */ if (!(gfp_mask & __GFP_FS)) { wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat), HZ); + allow_direct_reclaim(pgdat), HZ); goto check_pending; } /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat)); + allow_direct_reclaim(pgdat)); check_pending: if (fatal_signal_pending(current)) @@ -3114,7 +3122,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) /* * The throttled processes are normally woken up in balance_pgdat() as - * soon as pfmemalloc_watermark_ok() is true. But there is a potential + * soon as allow_direct_reclaim() is true. But there is a potential * race between when kswapd checks the watermarks and a process gets * throttled. There is also a potential race if processes get * throttled, kswapd wakes, a large process exits thereby balancing the @@ -3128,6 +3136,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (waitqueue_active(&pgdat->pfmemalloc_wait)) wake_up_all(&pgdat->pfmemalloc_wait); + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + for (i = 0; i <= classzone_idx; i++) { struct zone *zone = pgdat->node_zones + i; @@ -3214,9 +3226,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) count_vm_event(PAGEOUTRUN); do { + unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; - sc.nr_reclaimed = 0; sc.reclaim_idx = classzone_idx; /* @@ -3295,7 +3307,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * able to safely make forward progress. 
Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && - pfmemalloc_watermark_ok(pgdat)) + allow_direct_reclaim(pgdat)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ @@ -3306,10 +3318,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages */ - if (raise_priority || !sc.nr_reclaimed) + nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; + if (raise_priority || !nr_reclaimed) sc.priority--; } while (sc.priority >= 1); + if (!sc.nr_reclaimed) + pgdat->kswapd_failures++; + out: /* * Return the order kswapd stopped reclaiming at as @@ -3509,6 +3525,10 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (!waitqueue_active(&pgdat->kswapd_wait)) return; + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return; + /* Only wake kswapd if all zones are unbalanced */ for (z = 0; z <= classzone_idx; z++) { zone = pgdat->node_zones + z; @@ -3779,9 +3799,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) return NODE_RECLAIM_FULL; - if (!pgdat_reclaimable(pgdat)) - return NODE_RECLAIM_FULL; - /* * Do not scan if the allocation should not be delayed. */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 5a4f5c5a31e8..baee70dafba8 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1425,7 +1425,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n node_unreclaimable: %u" "\n start_pfn: %lu" "\n node_inactive_ratio: %u", - !pgdat_reclaimable(zone->zone_pgdat), + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, zone->zone_start_pfn, zone->zone_pgdat->inactive_ratio); seq_putc(m, '\n'); -- cgit v1.2.3 From c822f6223d03c2c5b026a21da09c6b6d523258cd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 May 2017 14:52:10 -0700 Subject: mm: delete NR_PAGES_SCANNED and pgdat_reclaimable() NR_PAGES_SCANNED counts number of pages scanned since the last page free event in the allocator. This was used primarily to measure the reclaimability of zones and nodes, and determine when reclaim should give up on them. In that role, it has been replaced in the preceding patches by a different mechanism. Being implemented as an efficient vmstat counter, it was automatically exported to userspace as well. It's however unlikely that anyone outside the kernel is using this counter in any meaningful way. Remove the counter and the unused pgdat_reclaimable(). 
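The role the counter played in reclaimability judgments is now covered by the kswapd failure counter introduced earlier in this series; the replacement test, open-coded at its remaining call sites in the diff below, amounts to the following (the helper name is hypothetical, for illustration only):

	static inline bool node_unreclaimable(struct pglist_data *pgdat)
	{
		return pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES;
	}
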
Link: http://lkml.kernel.org/r/20170228214007.5621-8-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Jia He Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 - mm/internal.h | 1 - mm/page_alloc.c | 11 ----------- mm/vmscan.c | 9 --------- mm/vmstat.c | 22 +++------------------- 5 files changed, 3 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d2c50ab6ae40..04e0969966f6 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -149,7 +149,6 @@ enum node_stat_item { NR_UNEVICTABLE, /* " " " " " */ NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ - NR_PAGES_SCANNED, /* pages scanned since last reclaim */ WORKINGSET_REFAULT, WORKINGSET_ACTIVATE, WORKINGSET_NODERECLAIM, diff --git a/mm/internal.h b/mm/internal.h index e5a0e0ec2177..a36719572eb9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -91,7 +91,6 @@ extern unsigned long highest_memmap_pfn; */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); -extern bool pgdat_reclaimable(struct pglist_data *pgdat); /* * in mm/rmap.c: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 42c0543e46c3..6994f28f769c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1090,14 +1090,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - unsigned long nr_scanned; bool isolated_pageblocks; spin_lock(&zone->lock); isolated_pageblocks = has_isolate_pageblock(zone); - nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); - if (nr_scanned) - __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); while (count) { struct page *page; @@ -1150,12 +1146,7 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype) { - unsigned long nr_scanned; spin_lock(&zone->lock); - nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); - if (nr_scanned) - __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); - if (unlikely(has_isolate_pageblock(zone) || is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); @@ -4504,7 +4495,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #endif " writeback_tmp:%lukB" " unstable:%lukB" - " pages_scanned:%lu" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -4527,7 +4517,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), K(node_page_state(pgdat, NR_UNSTABLE_NFS)), - node_page_state(pgdat, NR_PAGES_SCANNED), pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? "yes" : "no"); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 9117ae8d49ee..02f2eb51b33e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -230,12 +230,6 @@ unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat) return nr; } -bool pgdat_reclaimable(struct pglist_data *pgdat) -{ - return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) < - pgdat_reclaimable_pages(pgdat) * 6; -} - /** * lruvec_lru_size - Returns the number of pages on the given LRU list. 
 * @lruvec: lru vector
@@ -1750,7 +1744,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	reclaim_stat->recent_scanned[file] += nr_taken;

 	if (global_reclaim(sc)) {
-		__mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
 		if (current_is_kswapd())
 			__count_vm_events(PGSCAN_KSWAPD, nr_scanned);
 		else
@@ -1953,8 +1946,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
 	reclaim_stat->recent_scanned[file] += nr_taken;

-	if (global_reclaim(sc))
-		__mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
 	__count_vm_events(PGREFILL, nr_scanned);
 	spin_unlock_irq(&pgdat->lru_lock);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index baee70dafba8..c8d15051616b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -954,7 +954,6 @@ const char * const vmstat_text[] = {
 	"nr_unevictable",
 	"nr_isolated_anon",
 	"nr_isolated_file",
-	"nr_pages_scanned",
 	"workingset_refault",
 	"workingset_activate",
 	"workingset_nodereclaim",
@@ -1378,7 +1377,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n min %lu"
 		   "\n low %lu"
 		   "\n high %lu"
-		   "\n node_scanned %lu"
 		   "\n spanned %lu"
 		   "\n present %lu"
 		   "\n managed %lu",
@@ -1386,7 +1384,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   min_wmark_pages(zone),
 		   low_wmark_pages(zone),
 		   high_wmark_pages(zone),
-		   node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED),
 		   zone->spanned_pages,
 		   zone->present_pages,
 		   zone->managed_pages);
@@ -1586,22 +1583,9 @@ int vmstat_refresh(struct ctl_table *table, int write,
 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
 		val = atomic_long_read(&vm_zone_stat[i]);
 		if (val < 0) {
-			switch (i) {
-			case NR_PAGES_SCANNED:
-				/*
-				 * This is often seen to go negative in
-				 * recent kernels, but not to go permanently
-				 * negative. Whilst it would be nicer not to
-				 * have exceptions, rooting them out would be
-				 * another task, of rather low priority.
-				 */
-				break;
-			default:
-				pr_warn("%s: %s %ld\n",
-					__func__, vmstat_text[i], val);
-				err = -EINVAL;
-				break;
-			}
+			pr_warn("%s: %s %ld\n",
+				__func__, vmstat_text[i], val);
+			err = -EINVAL;
 		}
 	}
 	if (err)
-- cgit v1.2.3


From a128ca71fb29ed4444b80f38a0148b468826e19b Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Wed, 3 May 2017 14:52:22 -0700
Subject: mm: delete unnecessary TTU_* flags

Patch series "mm: fix some MADV_FREE issues", v5.

We are trying to use MADV_FREE in jemalloc, and several issues were found.
Without solving them, jemalloc can't use the MADV_FREE feature.

- Doesn't support systems without swap enabled. If swap is off, we can't
(or can't efficiently) age anonymous pages, and since MADV_FREE pages are
mixed with other anonymous pages, we can't reclaim MADV_FREE pages. In the
current implementation, MADV_FREE falls back to MADV_DONTNEED when swap is
not enabled. But in our environment a lot of machines don't enable swap,
which prevents our setup from using MADV_FREE.

- Increases memory pressure. Page reclaim is biased toward reclaiming file
pages rather than anonymous pages. This doesn't make sense for MADV_FREE
pages, because those pages can be freed easily and refilled with only a
slight penalty. Even if page reclaim weren't biased toward file pages,
there would still be an issue, because MADV_FREE pages and other anonymous
pages are mixed together: to reclaim a MADV_FREE page we probably must scan
a lot of other anonymous pages, which is inefficient. In our tests we
usually see OOMs with MADV_FREE enabled and none without it.

- Accounting. There are two accounting problems.
We don't have global accounting. If the system is behaving abnormally, we
don't know whether the problem comes from the MADV_FREE side. The other
problem is RSS accounting: MADV_FREE pages are accounted as normal anon
pages and reclaimed lazily, so the application's RSS becomes bigger. This
confuses our workloads. We have a monitoring daemon running, and if it
finds an application's RSS becoming abnormal, the daemon will kill the
application even though the kernel could reclaim the memory easily.

To address the first two issues, we can either put MADV_FREE pages into a
separate LRU list (Minchan's previous patches and the V1 patches), or put
them into the LRU_INACTIVE_FILE list (suggested by Johannes). The patchset
uses the second idea. The reason is that the LRU_INACTIVE_FILE list is tiny
nowadays and should be full of used-once file pages, so we can still
efficiently reclaim MADV_FREE pages there without interfering with other
anon and active file pages. Putting the pages into the inactive file list
also has the advantage of allowing page reclaim to prioritize MADV_FREE
pages and used-once file pages. MADV_FREE pages are put on the LRU list
with their SwapBacked flag cleared, so PageAnon(page) &&
!PageSwapBacked(page) indicates a MADV_FREE page. These pages are freed
directly, without pageout, if they are clean; otherwise normal swap
reclaims them.

For the third issue, the previous post added global accounting and a
separate RSS count for MADV_FREE pages. The problem is that we never get
accurate accounting for MADV_FREE pages: the pages are mapped to userspace
and can be dirtied without notice from the kernel side. To get accurate
accounting we could write-protect the page, but then there is extra page
fault overhead, which people don't want to pay. The jemalloc developers
have concerns about the inaccurate accounting, so this post drops the
accounting patches temporarily. The info exported to /proc/pid/smaps for
MADV_FREE pages is kept, which is the only place we can get accurate
accounting right now.

This patch (of 6):

Johannes pointed out that TTU_LZFREE is unnecessary. That is true because
we always have the flag set if we want to do an unmap; for the cases where
we don't do an unmap, the TTU_LZFREE part of the code should never run.
TTU_UNMAP is also unnecessary: if no other mode flag is set (for example,
TTU_MIGRATION), an unmap is implied.
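As a hedged illustration of the resulting convention (the flag names come
from this patch, but the helper below is ours and not part of the kernel):

	/*
	 * Representative sketch, not verbatim kernel code: after this patch
	 * a plain unmap passes only modifier bits such as TTU_BATCH_FLUSH;
	 * there is no TTU_UNMAP mode bit left to request it explicitly.
	 */
	static int reclaim_unmap_page(struct page *page, enum ttu_flags ttu_flags)
	{
		return try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH);
	}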
The patch includes Johannes's cleanup and dead TTU_ACTION macro removal code Link: http://lkml.kernel.org/r/4be3ea1bc56b26fd98a54d0a6f70bec63f6d8980.1487965799.git.shli@fb.com Signed-off-by: Shaohua Li Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Acked-by: Minchan Kim Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 22 +++++++++------------- mm/memory-failure.c | 2 +- mm/rmap.c | 2 +- mm/vmscan.c | 11 ++++------- 4 files changed, 15 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 8c89e902df3e..7a3941492856 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -83,19 +83,17 @@ struct anon_vma_chain { }; enum ttu_flags { - TTU_UNMAP = 1, /* unmap mode */ - TTU_MIGRATION = 2, /* migration mode */ - TTU_MUNLOCK = 4, /* munlock mode */ - TTU_LZFREE = 8, /* lazy free mode */ - TTU_SPLIT_HUGE_PMD = 16, /* split huge PMD if any */ - - TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ - TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ - TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */ - TTU_BATCH_FLUSH = (1 << 11), /* Batch TLB flushes where possible + TTU_MIGRATION = 0x1, /* migration mode */ + TTU_MUNLOCK = 0x2, /* munlock mode */ + + TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */ + TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */ + TTU_IGNORE_ACCESS = 0x10, /* don't age */ + TTU_IGNORE_HWPOISON = 0x20, /* corrupted page is recoverable */ + TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible * and caller guarantees they will * do a final flush if necessary */ - TTU_RMAP_LOCKED = (1 << 12) /* do not grab rmap lock: + TTU_RMAP_LOCKED = 0x80 /* do not grab rmap lock: * caller holds it */ }; @@ -193,8 +191,6 @@ static inline void page_dup_rmap(struct page *page, bool compound) int page_referenced(struct page *, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags); -#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) - int try_to_unmap(struct page *, enum ttu_flags flags); /* Avoid racy checks */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 27f7210e7fab..f85adfe57484 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -907,7 +907,7 @@ EXPORT_SYMBOL_GPL(get_hwpoison_page); static int hwpoison_user_mappings(struct page *p, unsigned long pfn, int trapno, int flags, struct page **hpagep) { - enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; struct address_space *mapping; LIST_HEAD(tokill); int ret; diff --git a/mm/rmap.c b/mm/rmap.c index f6838015810f..d7b6d780764b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1426,7 +1426,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ VM_BUG_ON_PAGE(!PageSwapCache(page), page); - if (!PageDirty(page) && (flags & TTU_LZFREE)) { + if (!PageDirty(page)) { /* It's a freeable page by MADV_FREE */ dec_mm_counter(mm, MM_ANONPAGES); rp->lazyfreed++; diff --git a/mm/vmscan.c b/mm/vmscan.c index 77832f0dbe0d..e5c00f2b98ab 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -966,7 +966,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; bool dirty, writeback; - bool lazyfree = false; int ret = SWAP_SUCCESS; cond_resched(); @@ -1120,7 +1119,6 @@ static unsigned long shrink_page_list(struct list_head 
*page_list,
 			goto keep_locked;
 		if (!add_to_swap(page, page_list))
 			goto activate_locked;
-		lazyfree = true;
 		may_enter_fs = 1;

 		/* Adding to swap updated mapping */
@@ -1138,9 +1136,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 	 * processes. Try to unmap it here.
 	 */
 	if (page_mapped(page) && mapping) {
-		switch (ret = try_to_unmap(page, lazyfree ?
-			(ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
-			(ttu_flags | TTU_BATCH_FLUSH))) {
+		switch (ret = try_to_unmap(page,
+				ttu_flags | TTU_BATCH_FLUSH)) {
 		case SWAP_FAIL:
 			nr_unmap_fail++;
 			goto activate_locked;
@@ -1348,7 +1345,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 	}

 	ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
-			TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true);
+			TTU_IGNORE_ACCESS, NULL, true);
 	list_splice(&clean_pages, page_list);
 	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
 	return ret;
@@ -1740,7 +1737,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	if (nr_taken == 0)
 		return 0;

-	nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP,
+	nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
 				&stat, false);

 	spin_lock_irq(&pgdat->lru_lock);
-- cgit v1.2.3


From f7ad2a6cb9f7c4040004bedee84a70a9b985583e Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Wed, 3 May 2017 14:52:29 -0700
Subject: mm: move MADV_FREE pages into LRU_INACTIVE_FILE list

madvise()'s MADV_FREE marks pages as 'lazyfree'. They are still anonymous
pages, but they can be freed without pageout. To distinguish them from
normal anonymous pages, we clear their SwapBacked flag.

Since MADV_FREE pages can be freed without pageout, they behave much like
used-once file pages, and we'd like to reclaim them once there is memory
pressure. It might be unfair to always reclaim MADV_FREE pages before
used-once file pages, but we definitely want to reclaim them before other
anonymous and file pages.

To speed up reclaim of MADV_FREE pages, we put them on the
LRU_INACTIVE_FILE list. The rationale is that the LRU_INACTIVE_FILE list
is tiny nowadays and should be full of used-once file pages, so reclaiming
MADV_FREE pages will not interfere much with anonymous and active file
pages. And since inactive file pages and MADV_FREE pages are reclaimed
according to their age, we don't reclaim too many MADV_FREE pages either.
Putting the MADV_FREE pages on the LRU_INACTIVE_FILE list also means we
can reclaim the pages without swap support. This idea was suggested by
Johannes.

This patch doesn't move MADV_FREE pages to the LRU_INACTIVE_FILE list yet,
to avoid bisect failures; the next patch will do it. The patch is based on
Minchan's original patch.
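The distinguishing test described above reduces to a one-line predicate; a
minimal sketch (the helper name is ours; the kernel open-codes the check):

	/*
	 * Lazyfree pages are clean anonymous pages whose SwapBacked flag has
	 * been cleared by MADV_FREE; after this series they live on the
	 * inactive file LRU.
	 */
	static inline bool page_is_lazyfree(struct page *page)
	{
		return PageAnon(page) && !PageSwapBacked(page);
	}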
[akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/2f87063c1e9354677b7618c647abde77b07561e5.1487965799.git.shli@fb.com Signed-off-by: Shaohua Li Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Acked-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Hillf Danton Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- include/linux/vm_event_item.h | 2 +- mm/huge_memory.c | 3 --- mm/madvise.c | 2 -- mm/swap.c | 49 ++++++++++++++++++++++++------------------- mm/vmstat.c | 1 + 6 files changed, 31 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 45e91dd6716d..486494e6b2fc 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -279,7 +279,7 @@ extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); -extern void deactivate_page(struct page *page); +extern void mark_page_lazyfree(struct page *page); extern void swap_setup(void); extern void add_page_to_unevictable_list(struct page *page); diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index a80b7b59cf33..d84ae90ccd5c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -25,7 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGALLOC), FOR_ALL_ZONES(ALLOCSTALL), FOR_ALL_ZONES(PGSCAN_SKIP), - PGFREE, PGACTIVATE, PGDEACTIVATE, + PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE, PGFAULT, PGMAJFAULT, PGLAZYFREED, PGREFILL, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 17f6008f2827..7309a716b7fc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1564,9 +1564,6 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, ClearPageDirty(page); unlock_page(page); - if (PageActive(page)) - deactivate_page(page); - if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); diff --git a/mm/madvise.c b/mm/madvise.c index 7a2abf0127ae..cf3021b05b32 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -411,8 +411,6 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, ptent = pte_mkold(ptent); ptent = pte_mkclean(ptent); set_pte_at(mm, addr, pte, ptent); - if (PageActive(page)) - deactivate_page(page); tlb_remove_tlb_entry(tlb, pte, addr); } } diff --git a/mm/swap.c b/mm/swap.c index d8d9ee9e311a..98d08b4579fa 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -46,7 +46,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs); #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); #endif @@ -571,20 +571,27 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, } -static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, +static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, void *arg) { - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_cache(page); - int lru = page_lru_base_type(page); + if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && + !PageUnevictable(page)) { + bool 
active = PageActive(page);
-		del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+		del_page_from_lru_list(page, lruvec,
+				       LRU_INACTIVE_ANON + active);
 		ClearPageActive(page);
 		ClearPageReferenced(page);
-		add_page_to_lru_list(page, lruvec, lru);
+		/*
+		 * lazyfree pages are clean anonymous pages. They have
+		 * SwapBacked flag cleared to distinguish normal anonymous
+		 * pages
+		 */
+		ClearPageSwapBacked(page);
+		add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);

-		__count_vm_event(PGDEACTIVATE);
-		update_page_reclaim_stat(lruvec, file, 0);
+		__count_vm_events(PGLAZYFREE, hpage_nr_pages(page));
+		update_page_reclaim_stat(lruvec, 1, 0);
 	}
 }

@@ -614,9 +621,9 @@ void lru_add_drain_cpu(int cpu)
 	if (pagevec_count(pvec))
 		pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);

-	pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+	pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
 	if (pagevec_count(pvec))
-		pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+		pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);

 	activate_page_drain(cpu);
 }
@@ -648,22 +655,22 @@ void deactivate_file_page(struct page *page)
 }

 /**
- * deactivate_page - deactivate a page
+ * mark_page_lazyfree - make an anon page lazyfree
  * @page: page to deactivate
  *
- * deactivate_page() moves @page to the inactive list if @page was on the active
- * list and was not an unevictable page.  This is done to accelerate the reclaim
- * of @page.
+ * mark_page_lazyfree() moves @page to the inactive file list.
+ * This is done to accelerate the reclaim of @page.
 */
-void deactivate_page(struct page *page)
+void mark_page_lazyfree(struct page *page)
 {
-	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
-		struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+	if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
+	    !PageUnevictable(page)) {
+		struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);

 		get_page(page);
 		if (!pagevec_add(pvec, page) || PageCompound(page))
-			pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
-		put_cpu_var(lru_deactivate_pvecs);
+			pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
+		put_cpu_var(lru_lazyfree_pvecs);
 	}
 }

@@ -703,7 +710,7 @@ void lru_add_drain_all(void)
 		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
 		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
 		    pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
-		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+		    pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
 		    need_activate_page_drain(cpu)) {
 			INIT_WORK(work, lru_add_drain_per_cpu);
 			queue_work_on(cpu, mm_percpu_wq, work);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c8d15051616b..828a36ea0584 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -991,6 +991,7 @@ const char * const vmstat_text[] = {
 	"pgfree",
 	"pgactivate",
 	"pgdeactivate",
+	"pglazyfree",

 	"pgfault",
 	"pgmajfault",
-- cgit v1.2.3


From 802a3a92ad7ac0b9be9df229dee530a1f0a8039b Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Wed, 3 May 2017 14:52:32 -0700
Subject: mm: reclaim MADV_FREE pages

When memory pressure is high, we free MADV_FREE pages. If the pages are
not dirty in the pte, they can be freed immediately; otherwise we can't
reclaim them. In that case we put the pages back on the anonymous LRU list
(by setting the SwapBacked flag), and the pages will be reclaimed in the
normal swapout way.

We use the normal page reclaim policy. Since MADV_FREE pages are put on
the inactive file list, such pages and inactive file pages are reclaimed
according to their age.
This is expected, because we don't want to reclaim too many MADV_FREE pages before used once pages. Based on Minchan's original patch [minchan@kernel.org: clean up lazyfree page handling] Link: http://lkml.kernel.org/r/20170303025237.GB3503@bbox Link: http://lkml.kernel.org/r/14b8eb1d3f6bf6cc492833f183ac8c304e560484.1487965799.git.shli@fb.com Signed-off-by: Shaohua Li Signed-off-by: Minchan Kim Acked-by: Minchan Kim Acked-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: Hillf Danton Cc: Hugh Dickins Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 2 +- mm/huge_memory.c | 2 ++ mm/madvise.c | 1 + mm/rmap.c | 46 ++++++++++++++++++++-------------------------- mm/vmscan.c | 34 ++++++++++++++++++++++------------ 5 files changed, 46 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 7a3941492856..fee10d744ebd 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -298,6 +298,6 @@ static inline int page_mkclean(struct page *page) #define SWAP_AGAIN 1 #define SWAP_FAIL 2 #define SWAP_MLOCK 3 -#define SWAP_LZFREE 4 +#define SWAP_DIRTY 4 #endif /* _LINUX_RMAP_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7309a716b7fc..08501a607b00 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1572,6 +1572,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, set_pmd_at(mm, addr, pmd, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } + + mark_page_lazyfree(page); ret = true; out: spin_unlock(ptl); diff --git a/mm/madvise.c b/mm/madvise.c index cf3021b05b32..d3a6712c3e14 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -413,6 +413,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(mm, addr, pte, ptent); tlb_remove_tlb_entry(tlb, pte, addr); } + mark_page_lazyfree(page); } out: if (nr_swap) { diff --git a/mm/rmap.c b/mm/rmap.c index b4084d09dbe8..519b7eb723d1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1288,11 +1288,6 @@ void page_remove_rmap(struct page *page, bool compound) */ } -struct rmap_private { - enum ttu_flags flags; - int lazyfreed; -}; - /* * @arg: enum ttu_flags will be passed to this argument */ @@ -1308,8 +1303,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, pte_t pteval; struct page *subpage; int ret = SWAP_AGAIN; - struct rmap_private *rp = arg; - enum ttu_flags flags = rp->flags; + enum ttu_flags flags = (enum ttu_flags)arg; /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) @@ -1427,11 +1421,21 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageSwapCache(page) && PageSwapBacked(page), page); - if (!PageDirty(page)) { - /* It's a freeable page by MADV_FREE */ - dec_mm_counter(mm, MM_ANONPAGES); - rp->lazyfreed++; - goto discard; + /* MADV_FREE page check */ + if (!PageSwapBacked(page)) { + if (!PageDirty(page)) { + dec_mm_counter(mm, MM_ANONPAGES); + goto discard; + } + + /* + * If the page was redirtied, it cannot be + * discarded. Remap the page to page table. + */ + set_pte_at(mm, address, pvmw.pte, pteval); + ret = SWAP_DIRTY; + page_vma_mapped_walk_done(&pvmw); + break; } if (swap_duplicate(entry) < 0) { @@ -1499,18 +1503,15 @@ static int page_mapcount_is_zero(struct page *page) * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable * SWAP_MLOCK - page is mlocked. 
+ * SWAP_DIRTY - page is dirty MADV_FREE page */ int try_to_unmap(struct page *page, enum ttu_flags flags) { int ret; - struct rmap_private rp = { - .flags = flags, - .lazyfreed = 0, - }; struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = &rp, + .arg = (void *)flags, .done = page_mapcount_is_zero, .anon_lock = page_lock_anon_vma_read, }; @@ -1531,11 +1532,8 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) else ret = rmap_walk(page, &rwc); - if (ret != SWAP_MLOCK && !page_mapcount(page)) { + if (ret != SWAP_MLOCK && !page_mapcount(page)) ret = SWAP_SUCCESS; - if (rp.lazyfreed && !PageDirty(page)) - ret = SWAP_LZFREE; - } return ret; } @@ -1562,14 +1560,10 @@ static int page_not_mapped(struct page *page) int try_to_munlock(struct page *page) { int ret; - struct rmap_private rp = { - .flags = TTU_MUNLOCK, - .lazyfreed = 0, - }; struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = &rp, + .arg = (void *)TTU_MUNLOCK, .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, diff --git a/mm/vmscan.c b/mm/vmscan.c index e5c00f2b98ab..ec4555369e17 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -906,7 +906,8 @@ static void page_check_dirty_writeback(struct page *page, * Anonymous pages are not handled by flushers and must be written * from reclaim context. Do not stall reclaim based on them */ - if (!page_is_file_cache(page)) { + if (!page_is_file_cache(page) || + (PageAnon(page) && !PageSwapBacked(page))) { *dirty = false; *writeback = false; return; @@ -987,7 +988,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; /* Double the slab pressure for mapped and swapcache pages */ - if (page_mapped(page) || PageSwapCache(page)) + if ((page_mapped(page) || PageSwapCache(page)) && + !(PageAnon(page) && !PageSwapBacked(page))) sc->nr_scanned++; may_enter_fs = (sc->gfp_mask & __GFP_FS) || @@ -1113,8 +1115,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. + * Lazyfree page could be freed directly */ - if (PageAnon(page) && !PageSwapCache(page)) { + if (PageAnon(page) && PageSwapBacked(page) && + !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (!add_to_swap(page, page_list)) @@ -1135,9 +1139,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, * The page is mapped into the page tables of one or more * processes. Try to unmap it here. 
*/ - if (page_mapped(page) && mapping) { + if (page_mapped(page)) { switch (ret = try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { + case SWAP_DIRTY: + SetPageSwapBacked(page); + /* fall through */ case SWAP_FAIL: nr_unmap_fail++; goto activate_locked; @@ -1145,8 +1152,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; case SWAP_MLOCK: goto cull_mlocked; - case SWAP_LZFREE: - goto lazyfree; case SWAP_SUCCESS: ; /* try to free the page below */ } @@ -1258,10 +1263,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } -lazyfree: - if (!mapping || !__remove_mapping(mapping, page, true)) - goto keep_locked; + if (PageAnon(page) && !PageSwapBacked(page)) { + /* follow __remove_mapping for reference */ + if (!page_ref_freeze(page, 1)) + goto keep_locked; + if (PageDirty(page)) { + page_ref_unfreeze(page, 1); + goto keep_locked; + } + count_vm_event(PGLAZYFREED); + } else if (!mapping || !__remove_mapping(mapping, page, true)) + goto keep_locked; /* * At this point, we have no other references and there is * no way to pick any more up (removed from LRU, removed @@ -1271,9 +1284,6 @@ lazyfree: */ __ClearPageLocked(page); free_it: - if (ret == SWAP_LZFREE) - count_vm_event(PGLAZYFREED); - nr_reclaimed++; /* -- cgit v1.2.3 From 9a4caf1e9fa4864ce21ba9584a2c336bfbc72740 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 May 2017 14:52:45 -0700 Subject: mm: memcontrol: provide shmem statistics Cgroups currently don't report how much shmem they use, which can be useful data to have, in particular since shmem is included in the cache/file item while being reclaimed like anonymous memory. Add a counter to track shmem pages during charging and uncharging. Link: http://lkml.kernel.org/r/20170221164343.32252-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Chris Down Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroup-v2.txt | 5 +++++ include/linux/memcontrol.h | 1 + mm/memcontrol.c | 28 ++++++++++++++++++++-------- 3 files changed, 26 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 49d7c997fa1e..e50b95c25868 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -871,6 +871,11 @@ PAGE_SIZE multiple when read back. 
Amount of memory used in network transmission buffers + shmem + + Amount of cached filesystem data that is swap-backed, + such as tmpfs, shm segments, shared anonymous mmap()s + file_mapped Amount of cached filesystem data mapped with mmap() diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bb7250c45cb8..c5ebb32fef49 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -46,6 +46,7 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ + MEM_CGROUP_STAT_SHMEM, /* # of pages charged as shmem */ MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ MEM_CGROUP_STAT_DIRTY, /* # of dirty pages in page cache */ MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2bd7541d7c11..490d5b4676c1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -104,6 +104,7 @@ static const char * const mem_cgroup_stat_names[] = { "cache", "rss", "rss_huge", + "shmem", "mapped_file", "dirty", "writeback", @@ -608,9 +609,13 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, if (PageAnon(page)) __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); - else + else { __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); + if (PageSwapBacked(page)) + __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SHMEM], + nr_pages); + } if (compound) { VM_BUG_ON_PAGE(!PageTransHuge(page), page); @@ -5208,6 +5213,8 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "sock %llu\n", (u64)stat[MEMCG_SOCK] * PAGE_SIZE); + seq_printf(m, "shmem %llu\n", + (u64)stat[MEM_CGROUP_STAT_SHMEM] * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", @@ -5476,8 +5483,8 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long nr_anon, unsigned long nr_file, - unsigned long nr_huge, unsigned long nr_kmem, - struct page *dummy_page) + unsigned long nr_kmem, unsigned long nr_huge, + unsigned long nr_shmem, struct page *dummy_page) { unsigned long nr_pages = nr_anon + nr_file + nr_kmem; unsigned long flags; @@ -5495,6 +5502,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_SHMEM], nr_shmem); __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); __this_cpu_add(memcg->stat->nr_page_events, nr_pages); memcg_check_events(memcg, dummy_page); @@ -5507,6 +5515,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, static void uncharge_list(struct list_head *page_list) { struct mem_cgroup *memcg = NULL; + unsigned long nr_shmem = 0; unsigned long nr_anon = 0; unsigned long nr_file = 0; unsigned long nr_huge = 0; @@ -5539,9 +5548,9 @@ static void uncharge_list(struct list_head *page_list) if (memcg != page->mem_cgroup) { if (memcg) { uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_huge, nr_kmem, page); - pgpgout = nr_anon = nr_file = - nr_huge = nr_kmem = 0; + nr_kmem, nr_huge, nr_shmem, page); + 
pgpgout = nr_anon = nr_file = nr_kmem = 0; + nr_huge = nr_shmem = 0; } memcg = page->mem_cgroup; } @@ -5555,8 +5564,11 @@ static void uncharge_list(struct list_head *page_list) } if (PageAnon(page)) nr_anon += nr_pages; - else + else { nr_file += nr_pages; + if (PageSwapBacked(page)) + nr_shmem += nr_pages; + } pgpgout++; } else { nr_kmem += 1 << compound_order(page); @@ -5568,7 +5580,7 @@ static void uncharge_list(struct list_head *page_list) if (memcg) uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_huge, nr_kmem, page); + nr_kmem, nr_huge, nr_shmem, page); } /** -- cgit v1.2.3 From a6ffdc07847e74cc244c02ab6d0351a4a5d77281 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 3 May 2017 14:52:52 -0700 Subject: mm: use is_migrate_highatomic() to simplify the code Introduce two helpers, is_migrate_highatomic() and is_migrate_highatomic_page(). Simplify the code, no functional changes. [akpm@linux-foundation.org: use static inlines rather than macros, per mhocko] Link: http://lkml.kernel.org/r/58B94F15.6060606@huawei.com Signed-off-by: Xishi Qiu Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Mel Gorman Cc: Minchan Kim Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- mm/internal.h | 10 ++++++++++ mm/page_alloc.c | 14 ++++++-------- 3 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 04e0969966f6..446cf68c1c09 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -35,7 +35,7 @@ */ #define PAGE_ALLOC_COSTLY_ORDER 3 -enum { +enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, diff --git a/mm/internal.h b/mm/internal.h index a36719572eb9..04d08ef91224 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -510,4 +510,14 @@ extern const struct trace_print_flags pageflag_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; +static inline bool is_migrate_highatomic(enum migratetype migratetype) +{ + return migratetype == MIGRATE_HIGHATOMIC; +} + +static inline bool is_migrate_highatomic_page(struct page *page) +{ + return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; +} + #endif /* __MM_INTERNAL_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f82beddbd96f..34ac32428de8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2036,8 +2036,8 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, /* Yoink! */ mt = get_pageblock_migratetype(page); - if (mt != MIGRATE_HIGHATOMIC && - !is_migrate_isolate(mt) && !is_migrate_cma(mt)) { + if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt) + && !is_migrate_cma(mt)) { zone->nr_reserved_highatomic += pageblock_nr_pages; set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); move_freepages_block(zone, page, MIGRATE_HIGHATOMIC); @@ -2094,8 +2094,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * from highatomic to ac->migratetype. So we should * adjust the count once. 
 */
-			if (get_pageblock_migratetype(page) ==
-							MIGRATE_HIGHATOMIC) {
+			if (is_migrate_highatomic_page(page)) {
 				/*
 				 * It should never happen but changes to
 				 * locking could inadvertently allow a per-cpu
@@ -2152,8 +2151,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
 	page = list_first_entry(&area->free_list[fallback_mt],
 						struct page, lru);
-	if (can_steal &&
-		get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC)
+	if (can_steal && !is_migrate_highatomic_page(page))
 		steal_suitable_fallback(zone, page, start_migratetype);

 	/* Remove the page from the freelists */
@@ -2493,7 +2491,7 @@ void free_hot_cold_page(struct page *page, bool cold)
 	/*
 	 * We only track unmovable, reclaimable and movable on pcp lists.
 	 * Free ISOLATE pages back to the allocator because they are being
-	 * offlined but treat RESERVE as movable pages so we can get those
+	 * offlined but treat HIGHATOMIC as movable pages so we can get those
 	 * areas back if necessary. Otherwise, we may have to free
 	 * excessively into the page allocator
 	 */
@@ -2603,7 +2601,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
 	for (; page < endpage; page += pageblock_nr_pages) {
 		int mt = get_pageblock_migratetype(page);
 		if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
-		    && mt != MIGRATE_HIGHATOMIC)
+		    && !is_migrate_highatomic(mt))
 			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
 	}
-- cgit v1.2.3


From 7e7844226f1053236b6f6d5d122a06509fb14fd9 Mon Sep 17 00:00:00 2001
From: Michal Hocko
Date: Wed, 3 May 2017 14:53:09 -0700
Subject: lockdep: allow to disable reclaim lockup detection

The current implementation of the reclaim lockup detection can lead to
false positives; these do happen, and they usually lead to tweaking the
code to silence lockdep by using GFP_NOFS even though the context could
use __GFP_FS just fine. See
http://lkml.kernel.org/r/20160512080321.GA18496@dastard as an example.

=================================
[ INFO: inconsistent lock state ]
4.5.0-rc2+ #4 Tainted: G O
---------------------------------
inconsistent {RECLAIM_FS-ON-R} -> {IN-RECLAIM_FS-W} usage.
kswapd0/543 [HC0[0]:SC0[0]:HE1:SE1] takes:
(&xfs_nondir_ilock_class){++++-+}, at: xfs_ilock+0x177/0x200 [xfs]
{RECLAIM_FS-ON-R} state was registered at:
  mark_held_locks+0x79/0xa0
  lockdep_trace_alloc+0xb3/0x100
  kmem_cache_alloc+0x33/0x230
  kmem_zone_alloc+0x81/0x120 [xfs]
  xfs_refcountbt_init_cursor+0x3e/0xa0 [xfs]
  __xfs_refcount_find_shared+0x75/0x580 [xfs]
  xfs_refcount_find_shared+0x84/0xb0 [xfs]
  xfs_getbmap+0x608/0x8c0 [xfs]
  xfs_vn_fiemap+0xab/0xc0 [xfs]
  do_vfs_ioctl+0x498/0x670
  SyS_ioctl+0x79/0x90
  entry_SYSCALL_64_fastpath+0x12/0x6f
       CPU0
       ----
  lock(&xfs_nondir_ilock_class);
  lock(&xfs_nondir_ilock_class);

 *** DEADLOCK ***

3 locks held by kswapd0/543:

stack backtrace:
CPU: 0 PID: 543 Comm: kswapd0 Tainted: G O 4.5.0-rc2+ #4
Call Trace:
 lock_acquire+0xd8/0x1e0
 down_write_nested+0x5e/0xc0
 xfs_ilock+0x177/0x200 [xfs]
 xfs_reflink_cancel_cow_range+0x150/0x300 [xfs]
 xfs_fs_evict_inode+0xdc/0x1e0 [xfs]
 evict+0xc5/0x190
 dispose_list+0x39/0x60
 prune_icache_sb+0x4b/0x60
 super_cache_scan+0x14f/0x1a0
 shrink_slab.part.63.constprop.79+0x1e9/0x4e0
 shrink_zone+0x15e/0x170
 kswapd+0x4f1/0xa80
 kthread+0xf2/0x110
 ret_from_fork+0x3f/0x70

To quote Dave:
 "Ignoring whether reflink should be doing anything or not, that's a
  "xfs_refcountbt_init_cursor() gets called both outside and inside
  transactions" lockdep false positive case.
  The problem here is lockdep has seen this allocation from within a
  transaction, hence a GFP_NOFS allocation, and now it's seeing it in a
  GFP_KERNEL context. Also note that we have an active reference to this
  inode.

  So, because the reclaim annotations overload the interrupt level
  detections and it's seen the inode ilock been taken in reclaim
  ("interrupt") context, this triggers a reclaim context warning where it
  thinks it is unsafe to do this allocation in GFP_KERNEL context holding
  the inode ilock..."

This sounds like a fundamental problem of the reclaim lock detection. It
is really impossible to annotate such a special use case IMHO unless the
reclaim lockup detection is reworked completely. Until then it is much
better to provide a way to add an "I know what I am doing" flag and mark
problematic places. This would prevent abuse of the GFP_NOFS flag, which
has a runtime effect even on configurations that have lockdep disabled.

Introduce the __GFP_NOLOCKDEP flag, which tells the lockdep gfp tracking
to skip the current allocation request.

While we are at it, also make sure that the radix tree doesn't
accidentally override tags stored in the upper part of the gfp_mask.

Link: http://lkml.kernel.org/r/20170306131408.9828-3-mhocko@kernel.org
Signed-off-by: Michal Hocko
Suggested-by: Peter Zijlstra
Acked-by: Peter Zijlstra (Intel)
Acked-by: Vlastimil Babka
Cc: Dave Chinner
Cc: Theodore Ts'o
Cc: Chris Mason
Cc: David Sterba
Cc: Jan Kara
Cc: Brian Foster
Cc: Darrick J. Wong
Cc: Nikolay Borisov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/gfp.h      | 10 +++++++++-
 kernel/locking/lockdep.c |  4 ++++
 lib/radix-tree.c         |  2 ++
 3 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index db373b9d3223..978232a3b4ae 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -40,6 +40,11 @@ struct vm_area_struct;
 #define ___GFP_DIRECT_RECLAIM	0x400000u
 #define ___GFP_WRITE		0x800000u
 #define ___GFP_KSWAPD_RECLAIM	0x1000000u
+#ifdef CONFIG_LOCKDEP
+#define ___GFP_NOLOCKDEP	0x4000000u
+#else
+#define ___GFP_NOLOCKDEP	0
+#endif
 /* If the above are modified, __GFP_BITS_SHIFT may need updating */

 /*
@@ -179,8 +184,11 @@ struct vm_area_struct;
 #define __GFP_NOTRACK	((__force gfp_t)___GFP_NOTRACK)
 #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)

+/* Disable lockdep for GFP context tracking */
+#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
+
 /* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT 25
+#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP))
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))

 /*
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 106f4dcf6679..f84294c9a018 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2897,6 +2897,10 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
 	if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
 		return;

+	/* Disable lockdep if explicitly requested */
+	if (gfp_mask & __GFP_NOLOCKDEP)
+		return;
+
 	mark_held_locks(curr, RECLAIM_FS);
 }

diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 691a9ad48497..898e87998417 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -2284,6 +2284,8 @@ static int radix_tree_cpu_dead(unsigned int cpu)
 void __init radix_tree_init(void)
 {
 	int ret;
+
+	BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32);
 	radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
 			sizeof(struct radix_tree_node), 0,
 			SLAB_PANIC |
SLAB_RECLAIM_ACCOUNT,
-- cgit v1.2.3


From 9070733b4efac4bf17f299a81b01c15e206f9ff5 Mon Sep 17 00:00:00 2001
From: Michal Hocko
Date: Wed, 3 May 2017 14:53:12 -0700
Subject: xfs: abstract PF_FSTRANS to PF_MEMALLOC_NOFS

xfs defined PF_FSTRANS to declare a scoped GFP_NOFS semantic quite some
time ago. We would like to make this concept more generic and use it for
other filesystems as well. Let's start by giving the flag a more generic
name, PF_MEMALLOC_NOFS, which is in line with the existing
PF_MEMALLOC_NOIO already used for the same purpose for GFP_NOIO contexts.
As a first step, replace all PF_FSTRANS usage in the xfs code before we
introduce a full API for it; xfs uses the flag directly anyway.

This patch doesn't introduce any functional change.

Link: http://lkml.kernel.org/r/20170306131408.9828-4-mhocko@kernel.org
Signed-off-by: Michal Hocko
Reviewed-by: Darrick J. Wong
Reviewed-by: Brian Foster
Acked-by: Vlastimil Babka
Cc: Dave Chinner
Cc: Theodore Ts'o
Cc: Chris Mason
Cc: David Sterba
Cc: Jan Kara
Cc: Nikolay Borisov
Cc: Peter Zijlstra
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/xfs/kmem.c             |  4 ++--
 fs/xfs/kmem.h             |  2 +-
 fs/xfs/libxfs/xfs_btree.c |  2 +-
 fs/xfs/xfs_aops.c         |  6 +++---
 fs/xfs/xfs_trans.c        | 12 ++++++------
 include/linux/sched.h     |  2 ++
 6 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 70a5b55e0870..d0ac1a065539 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -63,13 +63,13 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
 	 * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
 	 * the filesystem here and potentially deadlocking.
 	 */
-	if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+	if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS))
 		noio_flag = memalloc_noio_save();

 	lflags = kmem_flags_convert(flags);
 	ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);

-	if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+	if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS))
 		memalloc_noio_restore(noio_flag);

 	return ptr;
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index f0fc84fcaac2..a6c8da40c70d 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
 		lflags = GFP_ATOMIC | __GFP_NOWARN;
 	} else {
 		lflags = GFP_KERNEL | __GFP_NOWARN;
-		if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+		if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS))
 			lflags &= ~__GFP_FS;
 	}

diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index c3decedc9455..3059a3ec7ecb 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2886,7 +2886,7 @@ xfs_btree_split_worker(
 	struct xfs_btree_split_args *args = container_of(work,
 						struct xfs_btree_split_args, work);
 	unsigned long pflags;
-	unsigned long new_pflags = PF_FSTRANS;
+	unsigned long new_pflags = PF_MEMALLOC_NOFS;

 	/*
 	 * we are in a transaction context here, but may also be doing work
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 61494295d92f..05eca126c688 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -189,7 +189,7 @@ xfs_setfilesize_trans_alloc(
 	 * We hand off the transaction to the completion thread now, so
 	 * clear the flag here.
 	 */
-	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 	return 0;
 }

@@ -252,7 +252,7 @@ xfs_setfilesize_ioend(
 	 * thus we need to mark ourselves as being in a transaction manually.
* Similarly for freeze protection. */ - current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); /* we abort the update if there was an IO error */ @@ -1016,7 +1016,7 @@ xfs_do_writepage( * Given that we do not allow direct reclaim to call us, we should * never be called while in a filesystem transaction. */ - if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS)) goto redirty; /* diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 70f42ea86dfb..f5969c8274fc 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -134,7 +134,7 @@ xfs_trans_reserve( bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; /* Mark this thread as being in a transaction */ - current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); /* * Attempt to reserve the needed disk blocks by decrementing @@ -144,7 +144,7 @@ xfs_trans_reserve( if (blocks > 0) { error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); if (error != 0) { - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return -ENOSPC; } tp->t_blk_res += blocks; @@ -221,7 +221,7 @@ undo_blocks: tp->t_blk_res = 0; } - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return error; } @@ -914,7 +914,7 @@ __xfs_trans_commit( xfs_log_commit_cil(mp, tp, &commit_lsn, regrant); - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free(tp); /* @@ -944,7 +944,7 @@ out_unreserve: if (commit_lsn == -1 && !error) error = -EIO; } - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free_items(tp, NULLCOMMITLSN, !!error); xfs_trans_free(tp); @@ -998,7 +998,7 @@ xfs_trans_cancel( xfs_log_done(mp, tp->t_ticket, NULL, false); /* mark this thread as no longer being in a transaction */ - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free_items(tp, NULLCOMMITLSN, dirty); xfs_trans_free(tp); diff --git a/include/linux/sched.h b/include/linux/sched.h index 3d4fa448223f..8ac11465ac5b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1237,6 +1237,8 @@ extern struct pid *cad_pid; #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ +#define PF_MEMALLOC_NOFS PF_FSTRANS /* Transition to a more generic GFP_NOFS scope semantic */ + /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example -- cgit v1.2.3 From 7dea19f9ee636cb244109a4dba426bbb3e5304b7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 3 May 2017 14:53:15 -0700 Subject: mm: introduce memalloc_nofs_{save,restore} API GFP_NOFS context is used for the following 5 reasons currently: - to prevent from deadlocks when the lock held by the allocation context would be needed during the memory reclaim - to prevent from stack overflows during the reclaim because the allocation is performed from a deep context already - to prevent lockups when the allocation context depends on other reclaimers to make a 
forward progress indirectly
- just in case because this would be safe from the fs POV
- silence lockdep false positives

Unfortunately, overuse of this allocation context brings some problems to
the MM. Memory reclaim is much weaker (especially during heavy FS metadata
workloads), and the OOM killer cannot be invoked because the MM layer
doesn't have enough information about how much memory is freeable by the
FS layer. In many cases it is far from clear why the weaker context is
even used, and so it might be used unnecessarily. We would like to get rid
of those cases as much as possible. One way to do that is to use the flag
in scopes rather than for isolated cases. Such a scope is declared when
really necessary, tracked per task, and all the allocation requests from
within the context will simply inherit the GFP_NOFS semantic.

Not only is this easier to understand and maintain, because there are far
fewer problematic contexts than specific allocation requests, it also
helps code paths where the FS layer interacts with other layers (e.g.
crypto, security modules, MM etc...) and there is no easy way to convey
the allocation context between the layers.

Introduce the memalloc_nofs_{save,restore} API to control the scope of the
GFP_NOFS allocation context. This basically copies the
memalloc_noio_{save,restore} API we have for the other restricted
allocation context, GFP_NOIO. The PF_MEMALLOC_NOFS flag already exists and
is just an alias for PF_FSTRANS, which was xfs specific until recently.
There are no PF_FSTRANS users anymore, so let's just drop it.

PF_MEMALLOC_NOFS is now checked in the MM layer and drops __GFP_FS
implicitly, the same way PF_MEMALLOC_NOIO drops __GFP_IO.
memalloc_noio_flags is renamed to current_gfp_context because it now cares
about both the PF_MEMALLOC_NOFS and PF_MEMALLOC_NOIO contexts.

Xfs code paths preserve their semantics. kmem_flags_convert() doesn't need
to evaluate the flag anymore. This patch shouldn't introduce any
functional changes.

Let's hope that filesystems will drop direct GFP_NOFS (resp. ~__GFP_FS)
usage as much as possible and only use properly documented
memalloc_nofs_{save,restore} checkpoints where they are appropriate.

[akpm@linux-foundation.org: fix comment typo, reflow comment]
Link: http://lkml.kernel.org/r/20170306131408.9828-5-mhocko@kernel.org
Signed-off-by: Michal Hocko
Acked-by: Vlastimil Babka
Cc: Dave Chinner
Cc: Theodore Ts'o
Cc: Chris Mason
Cc: David Sterba
Cc: Jan Kara
Cc: Brian Foster
Cc: Darrick J.
Wong Cc: Nikolay Borisov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/xfs/kmem.h | 2 +- include/linux/gfp.h | 8 ++++++++ include/linux/sched.h | 8 +++----- include/linux/sched/mm.h | 26 +++++++++++++++++++++++--- kernel/locking/lockdep.c | 6 +++--- mm/page_alloc.c | 10 ++++++---- mm/vmscan.c | 6 +++--- 7 files changed, 47 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index a6c8da40c70d..d6ea520162b2 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags) lflags = GFP_ATOMIC | __GFP_NOWARN; } else { lflags = GFP_KERNEL | __GFP_NOWARN; - if ((current->flags & PF_MEMALLOC_NOFS) || (flags & KM_NOFS)) + if (flags & KM_NOFS) lflags &= ~__GFP_FS; } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 978232a3b4ae..2bfcfd33e476 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -210,8 +210,16 @@ struct vm_area_struct; * * GFP_NOIO will use direct reclaim to discard clean pages or slab pages * that do not require the starting of any physical IO. + * Please try to avoid using this flag directly and instead use + * memalloc_noio_{save,restore} to mark the whole scope which cannot + * perform any IO with a short explanation why. All allocation requests + * will inherit GFP_NOIO implicitly. * * GFP_NOFS will use direct reclaim but will not use any filesystem interfaces. + * Please try to avoid using this flag directly and instead use + * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't + * recurse into the FS layer with a short explanation why. All allocation + * requests will inherit GFP_NOFS implicitly. * * GFP_USER is for userspace allocations that also need to be directly * accessibly by the kernel or hardware. 
It is typically used by hardware diff --git a/include/linux/sched.h b/include/linux/sched.h index 8ac11465ac5b..993e7e25a3a5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1224,9 +1224,9 @@ extern struct pid *cad_pid; #define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_FROZEN 0x00010000 /* Frozen for system suspend */ -#define PF_FSTRANS 0x00020000 /* Inside a filesystem transaction */ -#define PF_KSWAPD 0x00040000 /* I am kswapd */ -#define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */ +#define PF_KSWAPD 0x00020000 /* I am kswapd */ +#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ +#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ @@ -1237,8 +1237,6 @@ extern struct pid *cad_pid; #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ -#define PF_MEMALLOC_NOFS PF_FSTRANS /* Transition to a more generic GFP_NOFS scope semantic */ - /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 830953ebb391..9daabe138c99 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -149,13 +149,21 @@ static inline bool in_vfork(struct task_struct *tsk) return ret; } -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags - * __GFP_FS is also cleared as it implies __GFP_IO. +/* + * Applies per-task gfp context to the given allocation flags. 
+ * PF_MEMALLOC_NOIO implies GFP_NOIO + * PF_MEMALLOC_NOFS implies GFP_NOFS */ -static inline gfp_t memalloc_noio_flags(gfp_t flags) +static inline gfp_t current_gfp_context(gfp_t flags) { + /* + * NOIO implies both NOIO and NOFS and it is a weaker context + * so always make sure it makes precendence + */ if (unlikely(current->flags & PF_MEMALLOC_NOIO)) flags &= ~(__GFP_IO | __GFP_FS); + else if (unlikely(current->flags & PF_MEMALLOC_NOFS)) + flags &= ~__GFP_FS; return flags; } @@ -171,4 +179,16 @@ static inline void memalloc_noio_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; } +static inline unsigned int memalloc_nofs_save(void) +{ + unsigned int flags = current->flags & PF_MEMALLOC_NOFS; + current->flags |= PF_MEMALLOC_NOFS; + return flags; +} + +static inline void memalloc_nofs_restore(unsigned int flags) +{ + current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; +} + #endif /* _LINUX_SCHED_MM_H */ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index f84294c9a018..fd440b5a3c75 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2877,7 +2877,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) if (unlikely(!debug_locks)) return; - gfp_mask = memalloc_noio_flags(gfp_mask); + gfp_mask = current_gfp_context(gfp_mask); /* no reclaim without waiting on it */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) @@ -2888,7 +2888,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) return; /* We're only interested __GFP_FS allocations for now */ - if (!(gfp_mask & __GFP_FS)) + if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS)) return; /* @@ -3954,7 +3954,7 @@ EXPORT_SYMBOL_GPL(lock_unpin_lock); void lockdep_set_current_reclaim_state(gfp_t gfp_mask) { - current->lockdep_reclaim_gfp = memalloc_noio_flags(gfp_mask); + current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask); } void lockdep_clear_current_reclaim_state(void) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 34ac32428de8..7a3751e53f91 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3951,10 +3951,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, goto out; /* - * Runtime PM, block IO and its error handling path can deadlock - * because I/O on the device might not complete. + * Apply scoped allocation constraints. This is mainly about GFP_NOFS + * resp. GFP_NOIO which has to be inherited for all allocation requests + * from a particular context which has been marked by + * memalloc_no{fs,io}_{save,restore}. 
*/ - alloc_mask = memalloc_noio_flags(gfp_mask); + alloc_mask = current_gfp_context(gfp_mask); ac.spread_dirty_pages = false; /* @@ -7408,7 +7410,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, - .gfp_mask = memalloc_noio_flags(gfp_mask), + .gfp_mask = current_gfp_context(gfp_mask), }; INIT_LIST_HEAD(&cc.migratepages); diff --git a/mm/vmscan.c b/mm/vmscan.c index ec4555369e17..3ad66580b8b4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2915,7 +2915,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long nr_reclaimed; struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, - .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, @@ -2995,7 +2995,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, int nid; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), .reclaim_idx = MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, @@ -3702,7 +3702,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in int classzone_idx = gfp_zone(gfp_mask); struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), .order = order, .priority = NODE_RECLAIM_PRIORITY, .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), -- cgit v1.2.3 From 81378da64de6d33d0c200885f1de431c9a3e5ccd Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 3 May 2017 14:53:22 -0700 Subject: jbd2: mark the transaction context with the scope GFP_NOFS context now that we have memalloc_nofs_{save,restore} api we can mark the whole transaction context as implicitly GFP_NOFS. All allocations will automatically inherit GFP_NOFS this way. This means that we do not have to mark any of those requests with GFP_NOFS and moreover all the ext4_kv[mz]alloc(GFP_NOFS) are also safe now because even the hardcoded GFP_KERNEL allocations deep inside the vmalloc will be NOFS now. [akpm@linux-foundation.org: tweak comments] Link: http://lkml.kernel.org/r/20170306131408.9828-7-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Jan Kara Cc: Dave Chinner Cc: Theodore Ts'o Cc: Chris Mason Cc: David Sterba Cc: Brian Foster Cc: Darrick J. Wong Cc: Nikolay Borisov Cc: Peter Zijlstra Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd2/transaction.c | 12 ++++++++++++ include/linux/jbd2.h | 2 ++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 5e659ee08d6a..9ee4832b6f8b 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -388,6 +389,11 @@ repeat: rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_); jbd2_journal_free_transaction(new_transaction); + /* + * Ensure that no allocations done while the transaction is open are + * going to recurse back to the fs layer. 
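+ *
+ * (Editor's note: the matching memalloc_nofs_restore() is issued from
+ * jbd2_journal_stop() further down in this patch, so the NOFS scope
+ * covers the whole lifetime of the handle and every allocation made
+ * under it, however deep in the call chain.)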
+ */ + handle->saved_alloc_context = memalloc_nofs_save(); return 0; } @@ -466,6 +472,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, handle->h_transaction->t_tid, type, line_no, nblocks); + return handle; } EXPORT_SYMBOL(jbd2__journal_start); @@ -1760,6 +1767,11 @@ int jbd2_journal_stop(handle_t *handle) if (handle->h_rsv_handle) jbd2_journal_free_reserved(handle->h_rsv_handle); free_and_exit: + /* + * Scope of the GFP_NOFS context is over here and so we can restore the + * original alloc context. + */ + memalloc_nofs_restore(handle->saved_alloc_context); jbd2_free_handle(handle); return err; } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index dfaa1f4dcb0c..606b6bce3a5b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -491,6 +491,8 @@ struct jbd2_journal_handle unsigned long h_start_jiffies; unsigned int h_requested_credits; + + unsigned int saved_alloc_context; }; -- cgit v1.2.3 From 056b9d8a76924df02011f3941c4f53ace8d6c32a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 May 2017 14:53:32 -0700 Subject: mm: remove rodata_test_data export, add pr_fmt Since commit 3ad38ceb2769 ("x86/mm: Remove CONFIG_DEBUG_NX_TEST"), nothing is using the exported rodata_test_data variable, so drop the export. This additionally updates the pr_fmt to avoid redundant strings and adjusts some whitespace. Link: http://lkml.kernel.org/r/20170307005313.GA85809@beast Signed-off-by: Kees Cook Cc: Jinbum Park Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rodata_test.h | 1 - mm/rodata_test.c | 17 +++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rodata_test.h b/include/linux/rodata_test.h index ea05f6c51413..84766bcdd01f 100644 --- a/include/linux/rodata_test.h +++ b/include/linux/rodata_test.h @@ -14,7 +14,6 @@ #define _RODATA_TEST_H #ifdef CONFIG_DEBUG_RODATA_TEST -extern const int rodata_test_data; void rodata_test(void); #else static inline void rodata_test(void) {} diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 0fd21670b513..6bb4deb12e78 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c @@ -9,11 +9,12 @@ * as published by the Free Software Foundation; version 2 * of the License. 
*/ +#define pr_fmt(fmt) "rodata_test: " fmt + #include #include const int rodata_test_data = 0xC3; -EXPORT_SYMBOL_GPL(rodata_test_data); void rodata_test(void) { @@ -23,20 +24,20 @@ void rodata_test(void) /* test 1: read the value */ /* If this test fails, some previous testrun has clobbered the state */ if (!rodata_test_data) { - pr_err("rodata_test: test 1 fails (start data)\n"); + pr_err("test 1 fails (start data)\n"); return; } /* test 2: write to the variable; this should fault */ if (!probe_kernel_write((void *)&rodata_test_data, - (void *)&zero, sizeof(zero))) { - pr_err("rodata_test: test data was not read only\n"); + (void *)&zero, sizeof(zero))) { + pr_err("test data was not read only\n"); return; } /* test 3: check the value hasn't changed */ if (rodata_test_data == zero) { - pr_err("rodata_test: test data was changed\n"); + pr_err("test data was changed\n"); return; } @@ -44,13 +45,13 @@ void rodata_test(void) start = (unsigned long)__start_rodata; end = (unsigned long)__end_rodata; if (start & (PAGE_SIZE - 1)) { - pr_err("rodata_test: start of .rodata is not page size aligned\n"); + pr_err("start of .rodata is not page size aligned\n"); return; } if (end & (PAGE_SIZE - 1)) { - pr_err("rodata_test: end of .rodata is not page size aligned\n"); + pr_err("end of .rodata is not page size aligned\n"); return; } - pr_info("rodata_test: all tests were successful\n"); + pr_info("all tests were successful\n"); } -- cgit v1.2.3 From 18863d3a3f593f47b075b9f53ebf9228dc76cf72 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 3 May 2017 14:54:04 -0700 Subject: mm: remove SWAP_DIRTY in ttu If we find a lazyfree page is dirty, try_to_unmap_one can just SetPageSwapBacked there, as is done for a PG_mlocked page, and return SWAP_FAIL, which is natural because the page is not swappable right now, so vmscan can activate it. There is no point in introducing the new return value SWAP_DIRTY in try_to_unmap at the moment. Link: http://lkml.kernel.org/r/1489555493-14659-3-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Hillf Danton Acked-by: Kirill A. Shutemov Cc: Anshuman Khandual Cc: Johannes Weiner Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 1 - mm/rmap.c | 4 ++-- mm/vmscan.c | 3 --- 3 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index fee10d744ebd..b556eefa62bc 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -298,6 +298,5 @@ static inline int page_mkclean(struct page *page) #define SWAP_AGAIN 1 #define SWAP_FAIL 2 #define SWAP_MLOCK 3 -#define SWAP_DIRTY 4 #endif /* _LINUX_RMAP_H */ diff --git a/mm/rmap.c b/mm/rmap.c index 4baf504e4213..f6aa18d8a420 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1436,7 +1436,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * discarded. Remap the page to page table. */ set_pte_at(mm, address, pvmw.pte, pteval); - ret = SWAP_DIRTY; + SetPageSwapBacked(page); + ret = SWAP_FAIL; page_vma_mapped_walk_done(&pvmw); break; } @@ -1506,7 +1507,6 @@ static int page_mapcount_is_zero(struct page *page) * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable * SWAP_MLOCK - page is mlocked.
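 *
 * (Editor's note: after this patch a dirty MADV_FREE page is marked
 * SwapBacked again directly in try_to_unmap_one() and reported as plain
 * SWAP_FAIL, so vmscan reactivates it like any other page it cannot
 * swap; the dedicated SWAP_DIRTY value below goes away.)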
- * SWAP_DIRTY - page is dirty MADV_FREE page */ int try_to_unmap(struct page *page, enum ttu_flags flags) { diff --git a/mm/vmscan.c b/mm/vmscan.c index e54c882d6789..f1fd388454bd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1147,9 +1147,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (page_mapped(page)) { switch (ret = try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { - case SWAP_DIRTY: - SetPageSwapBacked(page); - /* fall through */ case SWAP_FAIL: nr_unmap_fail++; goto activate_locked; -- cgit v1.2.3 From 192d7232569ab61ded40c8be691b12832bc6bcd1 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 3 May 2017 14:54:10 -0700 Subject: mm: make try_to_munlock() return void try_to_munlock returns SWAP_MLOCK if one of the VMAs mapping the page has the VM_LOCKED flag. In that case, the VM sets PG_mlocked on the page, provided the page is not a pte-mapped THP, which cannot be mlocked. With that, __munlock_isolated_page can use PageMlocked to check whether try_to_munlock succeeded, without relying on try_to_munlock's retval. It helps to make try_to_unmap/try_to_unmap_one simple with upcoming patches. [minchan@kernel.org: remove PG_Mlocked VM_BUG_ON check] Link: http://lkml.kernel.org/r/20170411025615.GA6545@bbox Link: http://lkml.kernel.org/r/1489555493-14659-5-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 2 +- mm/mlock.c | 6 ++---- mm/rmap.c | 16 ++++------------ 3 files changed, 7 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b556eefa62bc..1b0cd4cf68e3 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -235,7 +235,7 @@ int page_mkclean(struct page *); * called in munlock()/munmap() path to check for other vmas holding * the page mlocked. */ -int try_to_munlock(struct page *); +void try_to_munlock(struct page *); void remove_migration_ptes(struct page *old, struct page *new, bool locked); diff --git a/mm/mlock.c b/mm/mlock.c index 0dd9ca18e19e..c483c5c20b4b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -123,17 +123,15 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage) */ static void __munlock_isolated_page(struct page *page) { - int ret = SWAP_AGAIN; - /* * Optimization: if the page was mapped just once, that's our mapping * and we don't need to check all the other vmas. */ if (page_mapcount(page) > 1) - ret = try_to_munlock(page); + try_to_munlock(page); /* Did try_to_munlock() succeed or punt? */ - if (ret != SWAP_MLOCK) + if (!PageMlocked(page)) count_vm_event(UNEVICTABLE_PGMUNLOCKED); putback_lru_page(page); diff --git a/mm/rmap.c b/mm/rmap.c index dfe40557ea29..171db511aa8c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1552,18 +1552,10 @@ static int page_not_mapped(struct page *page) * Called from munlock code. Checks all of the VMAs mapping the page * to make sure nobody else has this page mlocked. The page will be * returned with PG_mlocked cleared if no other vmas have it mlocked. - * - * Return values are: - * - * SWAP_AGAIN - no vma is holding page mlocked, or, - * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem - * SWAP_FAIL - page cannot be located at present - * SWAP_MLOCK - page is now mlocked.
*/ -int try_to_munlock(struct page *page) -{ - int ret; +void try_to_munlock(struct page *page) +{ struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = (void *)TTU_MUNLOCK, @@ -1573,9 +1565,9 @@ int try_to_munlock(struct page *page) }; VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); + VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); - ret = rmap_walk(page, &rwc); - return ret; + rmap_walk(page, &rwc); } void __put_anon_vma(struct anon_vma *anon_vma) -- cgit v1.2.3 From ad6b67041a45497261617d7a28b15159b202cb5a Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 3 May 2017 14:54:13 -0700 Subject: mm: remove SWAP_MLOCK in ttu ttu doesn't need to return SWAP_MLOCK. Instead, just return SWAP_FAIL because it means the page is not swappable, so it should move to another LRU list (active or unevictable). The putback friends will move it to the right list depending on the page's LRU flag. Link: http://lkml.kernel.org/r/1489555493-14659-6-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 1 - mm/rmap.c | 3 +-- mm/vmscan.c | 20 +++++++------------- 3 files changed, 8 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 1b0cd4cf68e3..3630d4dcee13 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -297,6 +297,5 @@ static inline int page_mkclean(struct page *page) #define SWAP_SUCCESS 0 #define SWAP_AGAIN 1 #define SWAP_FAIL 2 -#define SWAP_MLOCK 3 #endif /* _LINUX_RMAP_H */ diff --git a/mm/rmap.c b/mm/rmap.c index 171db511aa8c..45f41d4fa670 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1329,7 +1329,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ mlock_vma_page(page); } - ret = SWAP_MLOCK; + ret = SWAP_FAIL; page_vma_mapped_walk_done(&pvmw); break; } @@ -1506,7 +1506,6 @@ static int page_mapcount_is_zero(struct page *page) * SWAP_SUCCESS - we succeeded in removing all mappings * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable - * SWAP_MLOCK - page is mlocked. */ int try_to_unmap(struct page *page, enum ttu_flags flags) { diff --git a/mm/vmscan.c b/mm/vmscan.c index f1fd388454bd..cfd2651966a8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -987,7 +987,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, sc->nr_scanned++; if (unlikely(!page_evictable(page))) - goto cull_mlocked; + goto activate_locked; if (!sc->may_unmap && page_mapped(page)) goto keep_locked; @@ -1152,8 +1152,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto activate_locked; case SWAP_AGAIN: goto keep_locked; - case SWAP_MLOCK: - goto cull_mlocked; case SWAP_SUCCESS: ; /* try to free the page below */ } @@ -1295,20 +1293,16 @@ free_it: list_add(&page->lru, &free_pages); continue; -cull_mlocked: - if (PageSwapCache(page)) - try_to_free_swap(page); - unlock_page(page); - list_add(&page->lru, &ret_pages); - continue; - activate_locked: /* Not a candidate for swapping, so reclaim swap space.
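 *
 * (Editor's note: with cull_mlocked gone, mlocked pages reach this
 * activate_locked path too; the PageMlocked() checks added below keep
 * them off the active list and release their swap slot, and
 * putback_lru_page() will still route them to the unevictable LRU.)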
*/ - if (PageSwapCache(page) && mem_cgroup_swap_full(page)) + if (PageSwapCache(page) && (mem_cgroup_swap_full(page) || + PageMlocked(page))) try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); - SetPageActive(page); - pgactivate++; + if (!PageMlocked(page)) { + SetPageActive(page); + pgactivate++; + } keep_locked: unlock_page(page); keep: -- cgit v1.2.3 From 666e5a406c3ed562e7b3ceff8b631b6067bdaead Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 3 May 2017 14:54:20 -0700 Subject: mm: make ttu's return boolean try_to_unmap() returns SWAP_SUCCESS or SWAP_FAIL so it's suitable for boolean return. This patch changes it. Link: http://lkml.kernel.org/r/1489555493-14659-8-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Naoya Horiguchi Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 4 ++-- mm/huge_memory.c | 6 +++--- mm/memory-failure.c | 26 ++++++++++++-------------- mm/rmap.c | 8 +++----- mm/vmscan.c | 7 +------ 5 files changed, 21 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3630d4dcee13..6028c38d3cac 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -191,7 +191,7 @@ static inline void page_dup_rmap(struct page *page, bool compound) int page_referenced(struct page *, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags); -int try_to_unmap(struct page *, enum ttu_flags flags); +bool try_to_unmap(struct page *, enum ttu_flags flags); /* Avoid racy checks */ #define PVMW_SYNC (1 << 0) @@ -281,7 +281,7 @@ static inline int page_referenced(struct page *page, int is_locked, return 0; } -#define try_to_unmap(page, refs) SWAP_FAIL +#define try_to_unmap(page, refs) false static inline int page_mkclean(struct page *page) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 08501a607b00..b787c4cfda0e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2144,15 +2144,15 @@ static void freeze_page(struct page *page) { enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; - int ret; + bool unmap_success; VM_BUG_ON_PAGE(!PageHead(page), page); if (PageAnon(page)) ttu_flags |= TTU_MIGRATION; - ret = try_to_unmap(page, ttu_flags); - VM_BUG_ON_PAGE(ret, page); + unmap_success = try_to_unmap(page, ttu_flags); + VM_BUG_ON_PAGE(!unmap_success, page); } static void unfreeze_page(struct page *page) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f85adfe57484..3d3cf6add4c1 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -322,7 +322,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * wrong earlier. */ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, - int fail, struct page *page, unsigned long pfn, + bool fail, struct page *page, unsigned long pfn, int flags) { struct to_kill *tk, *next; @@ -904,13 +904,13 @@ EXPORT_SYMBOL_GPL(get_hwpoison_page); * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. 
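 *
 * (Editor's note: as converted below, the function returns true when all
 * user mappings were removed, or none needed to be, and false on
 * failure; callers feed !unmap_success into kill_procs() as its "fail"
 * argument.)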
*/ -static int hwpoison_user_mappings(struct page *p, unsigned long pfn, +static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, int trapno, int flags, struct page **hpagep) { enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; struct address_space *mapping; LIST_HEAD(tokill); - int ret; + bool unmap_success; int kill = 1, forcekill; struct page *hpage = *hpagep; @@ -919,20 +919,20 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * other types of pages. */ if (PageReserved(p) || PageSlab(p)) - return SWAP_SUCCESS; + return true; if (!(PageLRU(hpage) || PageHuge(p))) - return SWAP_SUCCESS; + return true; /* * This check implies we don't kill processes if their pages * are in the swap cache early. Those are always late kills. */ if (!page_mapped(hpage)) - return SWAP_SUCCESS; + return true; if (PageKsm(p)) { pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn); - return SWAP_FAIL; + return false; } if (PageSwapCache(p)) { @@ -971,8 +971,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - ret = try_to_unmap(hpage, ttu); - if (ret != SWAP_SUCCESS) + unmap_success = try_to_unmap(hpage, ttu); + if (!unmap_success) pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); @@ -987,10 +987,9 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * any accesses to the poisoned memory. */ forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); - kill_procs(&tokill, forcekill, trapno, - ret != SWAP_SUCCESS, p, pfn, flags); + kill_procs(&tokill, forcekill, trapno, !unmap_success, p, pfn, flags); - return ret; + return unmap_success; } static void set_page_hwpoison_huge_page(struct page *hpage) @@ -1230,8 +1229,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * When the raw error page is thp tail page, hpage points to the raw * page after thp split. */ - if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) - != SWAP_SUCCESS) { + if (!hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; goto out; diff --git a/mm/rmap.c b/mm/rmap.c index a3645d029400..928bdfe2be30 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1501,12 +1501,10 @@ static int page_mapcount_is_zero(struct page *page) * * Tries to remove all the page table entries which are mapping this * page, used in the pageout path. Caller must hold the page lock. - * Return values are: * - * SWAP_SUCCESS - we succeeded in removing all mappings - * SWAP_FAIL - the page is unswappable + * If unmap is successful, return true. Otherwise, false. */ -int try_to_unmap(struct page *page, enum ttu_flags flags) +bool try_to_unmap(struct page *page, enum ttu_flags flags) { struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, @@ -1531,7 +1529,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) else rmap_walk(page, &rwc); - return !page_mapcount(page) ? SWAP_SUCCESS : SWAP_FAIL; + return !page_mapcount(page) ? 
true : false; } static int page_not_mapped(struct page *page) diff --git a/mm/vmscan.c b/mm/vmscan.c index f80a54da5f7f..7a30150b4dee 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -972,7 +972,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; bool dirty, writeback; - int ret = SWAP_SUCCESS; cond_resched(); @@ -1145,13 +1144,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page)) { - switch (ret = try_to_unmap(page, - ttu_flags | TTU_BATCH_FLUSH)) { - case SWAP_FAIL: + if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { nr_unmap_fail++; goto activate_locked; - case SWAP_SUCCESS: - ; /* try to free the page below */ } } -- cgit v1.2.3 From 1df631ae19819cff343d316eda42eca32d3de7fc Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 3 May 2017 14:54:23 -0700 Subject: mm: make rmap_walk() return void There is no user of the return value from rmap_walk() and friends so this patch makes them void-returning functions. Link: http://lkml.kernel.org/r/1489555493-14659-9-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ksm.h | 5 ++--- include/linux/rmap.h | 4 ++-- mm/ksm.c | 16 ++++++---------- mm/rmap.c | 32 +++++++++++++------------------- 4 files changed, 23 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index e1cfda4bee58..78b44a024eaa 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -61,7 +61,7 @@ static inline void set_page_stable_node(struct page *page, struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address); -int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); #else /* !CONFIG_KSM */ @@ -94,10 +94,9 @@ static inline int page_referenced_ksm(struct page *page, return 0; } -static inline int rmap_walk_ksm(struct page *page, +static inline void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { - return 0; } static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 6028c38d3cac..1d7d457ca0dc 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -264,8 +264,8 @@ struct rmap_walk_control { bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; -int rmap_walk(struct page *page, struct rmap_walk_control *rwc); -int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc); #else /* !CONFIG_MMU */ diff --git a/mm/ksm.c b/mm/ksm.c index 19b4f2dea7a5..6edffb9a795b 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1933,11 +1933,10 @@ struct page *ksm_might_need_to_copy(struct page *page, return new_page; } -int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; struct rmap_item *rmap_item; - int ret = SWAP_AGAIN; int search_new_forks = 0; 
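	/*
	 * Editor's note: with the conversion to void, the "out:" label and
	 * the ret bookkeeping disappear below; the walk simply returns as
	 * soon as rmap_one returns something other than SWAP_AGAIN or
	 * rwc->done() reports completion.
	 */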
VM_BUG_ON_PAGE(!PageKsm(page), page); @@ -1950,7 +1949,7 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) stable_node = page_stable_node(page); if (!stable_node) - return ret; + return; again: hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; @@ -1978,23 +1977,20 @@ again: if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rwc->rmap_one(page, vma, - rmap_item->address, rwc->arg); - if (ret != SWAP_AGAIN) { + if (SWAP_AGAIN != rwc->rmap_one(page, vma, + rmap_item->address, rwc->arg)) { anon_vma_unlock_read(anon_vma); - goto out; + return; } if (rwc->done && rwc->done(page)) { anon_vma_unlock_read(anon_vma); - goto out; + return; } } anon_vma_unlock_read(anon_vma); } if (!search_new_forks++) goto again; -out: - return ret; } #ifdef CONFIG_MIGRATION diff --git a/mm/rmap.c b/mm/rmap.c index 928bdfe2be30..3b40d47e3300 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1607,13 +1607,12 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. */ -static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, +static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; - int ret = SWAP_AGAIN; if (locked) { anon_vma = page_anon_vma(page); @@ -1623,7 +1622,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, anon_vma = rmap_walk_anon_lock(page, rwc); } if (!anon_vma) - return ret; + return; pgoff_start = page_to_pgoff(page); pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; @@ -1637,8 +1636,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rwc->rmap_one(page, vma, address, rwc->arg); - if (ret != SWAP_AGAIN) + if (SWAP_AGAIN != rwc->rmap_one(page, vma, address, rwc->arg)) break; if (rwc->done && rwc->done(page)) break; @@ -1646,7 +1644,6 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (!locked) anon_vma_unlock_read(anon_vma); - return ret; } /* @@ -1662,13 +1659,12 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. 
*/ -static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, +static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, bool locked) { struct address_space *mapping = page_mapping(page); pgoff_t pgoff_start, pgoff_end; struct vm_area_struct *vma; - int ret = SWAP_AGAIN; /* * The page lock not only makes sure that page->mapping cannot @@ -1679,7 +1675,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, VM_BUG_ON_PAGE(!PageLocked(page), page); if (!mapping) - return ret; + return; pgoff_start = page_to_pgoff(page); pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; @@ -1694,8 +1690,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rwc->rmap_one(page, vma, address, rwc->arg); - if (ret != SWAP_AGAIN) + if (SWAP_AGAIN != rwc->rmap_one(page, vma, address, rwc->arg)) goto done; if (rwc->done && rwc->done(page)) goto done; @@ -1704,28 +1699,27 @@ done: if (!locked) i_mmap_unlock_read(mapping); - return ret; } -int rmap_walk(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk(struct page *page, struct rmap_walk_control *rwc) { if (unlikely(PageKsm(page))) - return rmap_walk_ksm(page, rwc); + rmap_walk_ksm(page, rwc); else if (PageAnon(page)) - return rmap_walk_anon(page, rwc, false); + rmap_walk_anon(page, rwc, false); else - return rmap_walk_file(page, rwc, false); + rmap_walk_file(page, rwc, false); } /* Like rmap_walk, but caller holds relevant rmap lock */ -int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) { /* no ksm support for now */ VM_BUG_ON_PAGE(PageKsm(page), page); if (PageAnon(page)) - return rmap_walk_anon(page, rwc, true); + rmap_walk_anon(page, rwc, true); else - return rmap_walk_file(page, rwc, true); + rmap_walk_file(page, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.2.3 From e4b82222712ed15813d35204c91429883d27d1d9 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 3 May 2017 14:54:27 -0700 Subject: mm: make rmap_one boolean function rmap_one's return value controls whether rmap_walk should continue to scan other ptes or not, so it's a natural target for conversion to boolean. Return true if the scan should be continued. Otherwise, return false to stop the scanning. This patch makes rmap_one's return value boolean. Link: http://lkml.kernel.org/r/1489555493-14659-10-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 6 +++++- mm/ksm.c | 2 +- mm/migrate.c | 4 ++-- mm/page_idle.c | 4 ++-- mm/rmap.c | 30 +++++++++++++++--------------- 5 files changed, 25 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 1d7d457ca0dc..13ed232cbb29 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -257,7 +257,11 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); */ struct rmap_walk_control { void *arg; - int (*rmap_one)(struct page *page, struct vm_area_struct *vma, + /* + * Return false if page table scanning in rmap_walk should be stopped. + * Otherwise, return true.
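+ *
+ * Editor's sketch (hypothetical walker, not part of this patch): a
+ * callback that counts mappings and never stops early would be
+ *
+ *	static bool count_one(struct page *page, struct vm_area_struct *vma,
+ *			      unsigned long addr, void *arg)
+ *	{
+ *		(*(int *)arg)++;
+ *		return true;
+ *	}
+ *
+ * wired up via
+ *
+ *	int nr = 0;
+ *	struct rmap_walk_control rwc = {
+ *		.rmap_one = count_one,
+ *		.arg = &nr,
+ *	};
+ *	rmap_walk(page, &rwc);
+ *
+ * while a callback that returns false aborts the walk at that mapping.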
+ */ + bool (*rmap_one)(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct page *page); struct anon_vma *(*anon_lock)(struct page *page); diff --git a/mm/ksm.c b/mm/ksm.c index 6edffb9a795b..d9fc0e456128 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1977,7 +1977,7 @@ again: if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - if (SWAP_AGAIN != rwc->rmap_one(page, vma, + if (!rwc->rmap_one(page, vma, rmap_item->address, rwc->arg)) { anon_vma_unlock_read(anon_vma); return; diff --git a/mm/migrate.c b/mm/migrate.c index b32630d10329..89a0a1707f4c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -194,7 +194,7 @@ void putback_movable_pages(struct list_head *l) /* * Restore a potential migration pte to a working pte entry */ -static int remove_migration_pte(struct page *page, struct vm_area_struct *vma, +static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *old) { struct page_vma_mapped_walk pvmw = { @@ -253,7 +253,7 @@ static int remove_migration_pte(struct page *page, struct vm_area_struct *vma, update_mmu_cache(vma, pvmw.address, pvmw.pte); } - return SWAP_AGAIN; + return true; } /* diff --git a/mm/page_idle.c b/mm/page_idle.c index b0ee56c56b58..1b0f48c62316 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -50,7 +50,7 @@ static struct page *page_idle_get_page(unsigned long pfn) return page; } -static int page_idle_clear_pte_refs_one(struct page *page, +static bool page_idle_clear_pte_refs_one(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg) { @@ -84,7 +84,7 @@ static int page_idle_clear_pte_refs_one(struct page *page, */ set_page_young(page); } - return SWAP_AGAIN; + return true; } static void page_idle_clear_pte_refs(struct page *page) diff --git a/mm/rmap.c b/mm/rmap.c index 3b40d47e3300..47e8dafc83c8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -724,7 +724,7 @@ struct page_referenced_arg { /* * arg: page_referenced_arg will be passed */ -static int page_referenced_one(struct page *page, struct vm_area_struct *vma, +static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct page_referenced_arg *pra = arg; @@ -741,7 +741,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED) { page_vma_mapped_walk_done(&pvmw); pra->vm_flags |= VM_LOCKED; - return SWAP_FAIL; /* To break the loop */ + return false; /* To break the loop */ } if (pvmw.pte) { @@ -781,9 +781,9 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, } if (!pra->mapcount) - return SWAP_SUCCESS; /* To break the loop */ + return false; /* To break the loop */ - return SWAP_AGAIN; + return true; } static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) @@ -854,7 +854,7 @@ int page_referenced(struct page *page, return pra.referenced; } -static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, +static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct page_vma_mapped_walk pvmw = { @@ -907,7 +907,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, } } - return SWAP_AGAIN; + return true; } static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) @@ -1290,7 +1290,7 @@ void page_remove_rmap(struct page *page, bool compound) /* * @arg: enum ttu_flags will be passed to this argument */ -static int try_to_unmap_one(struct 
page *page, struct vm_area_struct *vma, +static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; @@ -1301,12 +1301,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, }; pte_t pteval; struct page *subpage; - int ret = SWAP_AGAIN; + bool ret = true; enum ttu_flags flags = (enum ttu_flags)arg; /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) - return SWAP_AGAIN; + return true; if (flags & TTU_SPLIT_HUGE_PMD) { split_huge_pmd_address(vma, address, @@ -1329,7 +1329,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ mlock_vma_page(page); } - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1347,7 +1347,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) { - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1437,14 +1437,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ set_pte_at(mm, address, pvmw.pte, pteval); SetPageSwapBacked(page); - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } if (swap_duplicate(entry) < 0) { set_pte_at(mm, address, pvmw.pte, pteval); - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1636,7 +1636,7 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - if (SWAP_AGAIN != rwc->rmap_one(page, vma, address, rwc->arg)) + if (!rwc->rmap_one(page, vma, address, rwc->arg)) break; if (rwc->done && rwc->done(page)) break; @@ -1690,7 +1690,7 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - if (SWAP_AGAIN != rwc->rmap_one(page, vma, address, rwc->arg)) + if (!rwc->rmap_one(page, vma, address, rwc->arg)) goto done; if (rwc->done && rwc->done(page)) goto done; -- cgit v1.2.3 From 83612a948d3bd2e71b110d7e8735661621bd23d9 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 3 May 2017 14:54:30 -0700 Subject: mm: remove SWAP_[SUCCESS|AGAIN|FAIL] There is no user for it. Remove it. [minchan@kernel.org: use false instead of SWAP_FAIL] Link: http://lkml.kernel.org/r/20170316053313.GA19241@bbox Link: http://lkml.kernel.org/r/1489555493-14659-11-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Anshuman Khandual Cc: Hillf Danton Cc: Johannes Weiner Cc: Kirill A. 
Shutemov Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Vlastimil Babka Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 7 ------- mm/rmap.c | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 13ed232cbb29..43ef2c30cb0f 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -295,11 +295,4 @@ static inline int page_mkclean(struct page *page) #endif /* CONFIG_MMU */ -/* - * Return values of try_to_unmap - */ -#define SWAP_SUCCESS 0 -#define SWAP_AGAIN 1 -#define SWAP_FAIL 2 - #endif /* _LINUX_RMAP_H */ diff --git a/mm/rmap.c b/mm/rmap.c index 47e8dafc83c8..e303fdbee561 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1419,7 +1419,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) { WARN_ON_ONCE(1); - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } -- cgit v1.2.3 From bd33ef3681359343863f2290aded182b0441edee Mon Sep 17 00:00:00 2001 From: Vinayak Menon Date: Wed, 3 May 2017 14:54:42 -0700 Subject: mm: enable page poisoning early at boot On SPARSEMEM systems page poisoning is enabled after buddy is up, because of the dependency on page extension init. This causes the pages released by free_all_bootmem not to be poisoned. This either delays or misses the identification of some issues because the pages have to undergo another cycle of alloc-free-alloc for any corruption to be detected. Enable page poisoning early by getting rid of the PAGE_EXT_DEBUG_POISON flag. Since all the free pages will now be poisoned, the flag need not be verified before checking the poison during an alloc. [vinmenon@codeaurora.org: fix Kconfig] Link: http://lkml.kernel.org/r/1490878002-14423-1-git-send-email-vinmenon@codeaurora.org Link: http://lkml.kernel.org/r/1490358246-11001-1-git-send-email-vinmenon@codeaurora.org Signed-off-by: Vinayak Menon Acked-by: Laura Abbott Tested-by: Laura Abbott Cc: Joonsoo Kim Cc: Michal Hocko Cc: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/Kconfig.debug | 1 - mm/page_alloc.c | 13 +++------ mm/page_ext.c | 13 ++------- mm/page_poison.c | 77 +++++++++--------------------------------------------- 5 files changed, 17 insertions(+), 88 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 695da2a19b4c..5d22e69f51ea 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2487,7 +2487,6 @@ extern long copy_huge_page_from_user(struct page *dst_page, #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ extern struct page_ext_operations debug_guardpage_ops; -extern struct page_ext_operations page_poisoning_ops; #ifdef CONFIG_DEBUG_PAGEALLOC extern unsigned int _debug_guardpage_minorder; diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 79d0fd13b5b3..5b0adf1435de 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -42,7 +42,6 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT config PAGE_POISONING bool "Poison pages after freeing" - select PAGE_EXTENSION select PAGE_POISONING_NO_SANITY if HIBERNATION ---help--- Fill the pages with poison patterns after free_pages() and verify diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 465391811c2e..f1f225608413 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1689,10 +1689,10 @@ static inline int check_new_page(struct page *page) return 1; } -static inline bool 
free_pages_prezeroed(bool poisoned) +static inline bool free_pages_prezeroed(void) { return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && - page_poisoning_enabled() && poisoned; + page_poisoning_enabled(); } #ifdef CONFIG_DEBUG_VM @@ -1746,17 +1746,10 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags unsigned int alloc_flags) { int i; - bool poisoned = true; - - for (i = 0; i < (1 << order); i++) { - struct page *p = page + i; - if (poisoned) - poisoned &= page_is_poisoned(p); - } post_alloc_hook(page, order, gfp_flags); - if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) + if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO)) for (i = 0; i < (1 << order); i++) clear_highpage(page + i); diff --git a/mm/page_ext.c b/mm/page_ext.c index 121dcffc4ec1..88ccc044b09a 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -59,9 +59,6 @@ static struct page_ext_operations *page_ext_ops[] = { &debug_guardpage_ops, -#ifdef CONFIG_PAGE_POISONING - &page_poisoning_ops, -#endif #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif @@ -127,15 +124,12 @@ struct page_ext *lookup_page_ext(struct page *page) struct page_ext *base; base = NODE_DATA(page_to_nid(page))->node_page_ext; -#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) +#if defined(CONFIG_DEBUG_VM) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. - * - * This check is also necessary for ensuring page poisoning - * works as expected when enabled */ if (unlikely(!base)) return NULL; @@ -204,15 +198,12 @@ struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) +#if defined(CONFIG_DEBUG_VM) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. - * - * This check is also necessary for ensuring page poisoning - * works as expected when enabled */ if (!section->page_ext) return NULL; diff --git a/mm/page_poison.c b/mm/page_poison.c index 2e647c65916b..be19e989ccff 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -6,7 +6,6 @@ #include #include -static bool __page_poisoning_enabled __read_mostly; static bool want_page_poisoning __read_mostly; static int early_page_poison_param(char *buf) @@ -18,75 +17,22 @@ static int early_page_poison_param(char *buf) early_param("page_poison", early_page_poison_param); bool page_poisoning_enabled(void) -{ - return __page_poisoning_enabled; -} - -static bool need_page_poisoning(void) -{ - return want_page_poisoning; -} - -static void init_page_poisoning(void) { /* - * page poisoning is debug page alloc for some arches. If either - * of those options are enabled, enable poisoning + * Assumes that debug_pagealloc_enabled is set before + * free_all_bootmem. + * Page poisoning is debug page alloc for some arches. If + * either of those options are enabled, enable poisoning. 
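+ *
+ * Editor's note: distilled, the new predicate below is
+ *
+ *	want_page_poisoning ||
+ *		(!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
+ *		 debug_pagealloc_enabled())
+ *
+ * i.e. an explicit page_poison=on always wins, and on architectures
+ * without a native debug_pagealloc implementation the generic
+ * debug_pagealloc path falls back to poisoning.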
*/ - if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) { - if (!want_page_poisoning && !debug_pagealloc_enabled()) - return; - } else { - if (!want_page_poisoning) - return; - } - - __page_poisoning_enabled = true; -} - -struct page_ext_operations page_poisoning_ops = { - .need = need_page_poisoning, - .init = init_page_poisoning, -}; - -static inline void set_page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return; - - __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -static inline void clear_page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return; - - __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -bool page_is_poisoned(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return false; - - return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); + return (want_page_poisoning || + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && + debug_pagealloc_enabled())); } static void poison_page(struct page *page) { void *addr = kmap_atomic(page); - set_page_poison(page); memset(addr, PAGE_POISON, PAGE_SIZE); kunmap_atomic(addr); } @@ -140,12 +86,13 @@ static void unpoison_page(struct page *page) { void *addr; - if (!page_is_poisoned(page)) - return; - addr = kmap_atomic(page); + /* + * Page poisoning when enabled poisons each and every page + * that is freed to buddy. Thus no extra check is done to + * see if a page was poisoned. + */ check_poison_mem(addr, PAGE_SIZE); - clear_page_poison(page); kunmap_atomic(addr); } -- cgit v1.2.3 From 9927e3887642b976d9b391cd77d71388aa521e54 Mon Sep 17 00:00:00 2001 From: Pushkar Jambhlekar Date: Wed, 3 May 2017 14:54:45 -0700 Subject: include/linux/migrate.h: add arg names to prototype It is preferred, and the rest of migrate.h gets it right. Link: http://lkml.kernel.org/r/1490336009-8024-1-git-send-email-pushkar.iit@gmail.com Signed-off-by: Pushkar Jambhlekar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index fa76b516fa47..48e24844b3c5 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -33,8 +33,9 @@ extern char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION extern void putback_movable_pages(struct list_head *l); -extern int migrate_page(struct address_space *, - struct page *, struct page *, enum migrate_mode); +extern int migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason); extern int isolate_movable_page(struct page *page, isolate_mode_t mode); -- cgit v1.2.3 From ac2e8e40acf4c73e0ad1addca34b186d855565d7 Mon Sep 17 00:00:00 2001 From: Hao Lee Date: Wed, 3 May 2017 14:54:51 -0700 Subject: mm: fix spelling error Fix variable name error in comments. No code changes.
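[Editor's context for the comment fixed below: GFP_ZONE_TABLE packs one GFP_ZONES_SHIFT-bit zone number per combination of the four low gfp bits, and gfp_zone() extracts it with, roughly (kernel code of this era, quoted for illustration rather than taken from this patch):

	static inline enum zone_type gfp_zone(gfp_t flags)
	{
		enum zone_type z;
		int bit = (__force int) (flags & GFP_ZONEMASK);

		z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) &
					 ((1 << GFP_ZONES_SHIFT) - 1);
		VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
		return z;
	}

so the entries are GFP_ZONES_SHIFT bits wide, not ZONE_SHIFT, which is exactly what the comment correction below says.]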
Link: http://lkml.kernel.org/r/20170403161655.5081-1-haolee.swjtu@gmail.com Signed-off-by: Hao Lee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 2bfcfd33e476..2b1a44f5bdb6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -313,8 +313,8 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) /* * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the - * zone to use given the lowest 4 bits of gfp_t. Entries are ZONE_SHIFT long - * and there are 16 of them to cover all possible combinations of + * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT + * bits long and there are 16 of them to cover all possible combinations of * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM. * * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA. -- cgit v1.2.3 From 2a2e48854d704214dac7546e87ae0e4daa0e61a0 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 May 2017 14:55:03 -0700 Subject: mm: vmscan: fix IO/refault regression in cache workingset transition Since commit 59dc76b0d4df ("mm: vmscan: reduce size of inactive file list") we noticed bigger IO spikes during changes in cache access patterns. The patch in question shrunk the inactive list size to leave more room for the current workingset in the presence of streaming IO. However, workingset transitions that previously happened on the inactive list are now pushed out of memory and incur more refaults to complete. This patch disables active list protection when refaults are being observed. This accelerates workingset transitions, and allows more of the new set to establish itself from memory, without eating into the ability to protect the established workingset during stable periods. The workloads that were measurably affected for us were hit pretty bad by it, with refault/majfault rates doubling and tripling during cache transitions, and the machines sustaining half-hour periods of 100% IO utilization, where they'd previously have sub-minute peaks at 60-90%. Stateful services that handle user data tend to be more conservative with kernel upgrades. As a result we hit most page cache issues with some delay, as was the case here. The severity seemed to warrant a stable tag. 
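[Editor's sketch: distilled, the heuristic this patch adds to inactive_list_is_low() amounts to the following model; the helper name is illustrative, not the patch itself:

	/* returning 0 disables active list protection entirely */
	static unsigned long inactive_ratio_model(unsigned long inactive,
						  unsigned long active,
						  unsigned long refaults,
						  unsigned long snapshot,
						  bool file, bool actual_reclaim)
	{
		unsigned long gb;

		/*
		 * Refaults observed since the last reclaim cycle: a new
		 * workingset is forming, so deactivate aggressively.
		 */
		if (file && actual_reclaim && refaults != snapshot)
			return 0;

		gb = (inactive + active) >> (30 - PAGE_SHIFT);
		return gb ? int_sqrt(10 * gb) : 1;
	}

with the inactive list considered low, and hence the active list shrinkable, whenever inactive * ratio < active; the snapshot is refreshed by snapshot_refaults() at the end of each reclaim cycle.]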
Fixes: 59dc76b0d4df ("mm: vmscan: reduce size of inactive file list") Link: http://lkml.kernel.org/r/20170404220052.27593-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Rik van Riel Cc: Mel Gorman Cc: Michal Hocko Cc: Vladimir Davydov Cc: [4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 64 +++++++++++++++++++++++++++++-- include/linux/mmzone.h | 2 + mm/memcontrol.c | 24 ++++-------- mm/vmscan.c | 94 ++++++++++++++++++++++++++++++++++++---------- mm/workingset.c | 7 +++- 5 files changed, 150 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c5ebb32fef49..cfa91a3ca0ca 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -57,6 +57,9 @@ enum mem_cgroup_stat_index { MEMCG_SLAB_RECLAIMABLE, MEMCG_SLAB_UNRECLAIMABLE, MEMCG_SOCK, + MEMCG_WORKINGSET_REFAULT, + MEMCG_WORKINGSET_ACTIVATE, + MEMCG_WORKINGSET_NODERECLAIM, MEMCG_NR_STAT, }; @@ -495,6 +498,40 @@ extern int do_swap_account; void lock_page_memcg(struct page *page); void unlock_page_memcg(struct page *page); +static inline unsigned long mem_cgroup_read_stat(struct mem_cgroup *memcg, + enum mem_cgroup_stat_index idx) +{ + long val = 0; + int cpu; + + for_each_possible_cpu(cpu) + val += per_cpu(memcg->stat->count[idx], cpu); + + if (val < 0) + val = 0; + + return val; +} + +static inline void mem_cgroup_update_stat(struct mem_cgroup *memcg, + enum mem_cgroup_stat_index idx, int val) +{ + if (!mem_cgroup_disabled()) + this_cpu_add(memcg->stat->count[idx], val); +} + +static inline void mem_cgroup_inc_stat(struct mem_cgroup *memcg, + enum mem_cgroup_stat_index idx) +{ + mem_cgroup_update_stat(memcg, idx, 1); +} + +static inline void mem_cgroup_dec_stat(struct mem_cgroup *memcg, + enum mem_cgroup_stat_index idx) +{ + mem_cgroup_update_stat(memcg, idx, -1); +} + /** * mem_cgroup_update_page_stat - update page state statistics * @page: the page @@ -509,14 +546,14 @@ void unlock_page_memcg(struct page *page); * if (TestClearPageState(page)) * mem_cgroup_update_page_stat(page, state, -1); * unlock_page(page) or unlock_page_memcg(page) + * + * Kernel pages are an exception to this, since they'll never move. 
*/ static inline void mem_cgroup_update_page_stat(struct page *page, enum mem_cgroup_stat_index idx, int val) { - VM_BUG_ON(!(rcu_read_lock_held() || PageLocked(page))); - if (page->mem_cgroup) - this_cpu_add(page->mem_cgroup->stat->count[idx], val); + mem_cgroup_update_stat(page->mem_cgroup, idx, val); } static inline void mem_cgroup_inc_page_stat(struct page *page, @@ -741,6 +778,27 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) return false; } +static inline unsigned long mem_cgroup_read_stat(struct mem_cgroup *memcg, + enum mem_cgroup_stat_index idx) +{ + return 0; +} + +static inline void mem_cgroup_update_stat(struct mem_cgroup *memcg, + enum mem_cgroup_stat_index idx, int val) +{ +} + +static inline void mem_cgroup_inc_stat(struct mem_cgroup *memcg, + enum mem_cgroup_stat_index idx) +{ +} + +static inline void mem_cgroup_dec_stat(struct mem_cgroup *memcg, + enum mem_cgroup_stat_index idx) +{ +} + static inline void mem_cgroup_update_page_stat(struct page *page, enum mem_cgroup_stat_index idx, int nr) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 446cf68c1c09..e0c3c5e3d8a0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -225,6 +225,8 @@ struct lruvec { struct zone_reclaim_stat reclaim_stat; /* Evictions & activations on the inactive file list */ atomic_long_t inactive_age; + /* Refaults at the time of last reclaim cycle */ + unsigned long refaults; #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 490d5b4676c1..108d5b097db1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -569,23 +569,6 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) * common workload, threshold and synchronization as vmstat[] should be * implemented. */ -static unsigned long -mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) -{ - long val = 0; - int cpu; - - /* Per-cpu values can be negative, use a signed accumulator */ - for_each_possible_cpu(cpu) - val += per_cpu(memcg->stat->count[idx], cpu); - /* - * Summing races with updates, so val may be negative. Avoid exposing - * transient negative values. - */ - if (val < 0) - val = 0; - return val; -} static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx) @@ -5244,6 +5227,13 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "pgmajfault %lu\n", events[MEM_CGROUP_EVENTS_PGMAJFAULT]); + seq_printf(m, "workingset_refault %lu\n", + stat[MEMCG_WORKINGSET_REFAULT]); + seq_printf(m, "workingset_activate %lu\n", + stat[MEMCG_WORKINGSET_ACTIVATE]); + seq_printf(m, "workingset_nodereclaim %lu\n", + stat[MEMCG_WORKINGSET_NODERECLAIM]); + return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 7a30150b4dee..e8cb983a8c84 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2006,6 +2006,8 @@ static void shrink_active_list(unsigned long nr_to_scan, * Both inactive lists should also be large enough that each inactive * page has a chance to be referenced again before it is reclaimed. * + * If that fails and refaulting is observed, the inactive list grows. + * * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages * on this LRU, maintained by the pageout code. A zone->inactive_ratio * of 3 means 3:1 or 25% of the pages are kept on the inactive list. 
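 *
 * Editor's worked example: on a node with 100GB of file pages,
 * gb = 100, so inactive_ratio = int_sqrt(10 * 100) = 31, and the
 * inactive list is considered low once inactive * 31 < active,
 * i.e. once it falls below roughly 1/32 of the file pages (~3GB),
 * consistent with the table row visible below.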
@@ -2022,12 +2024,15 @@ static void shrink_active_list(unsigned long nr_to_scan, * 10TB 320 32GB */ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, - struct scan_control *sc, bool trace) + struct mem_cgroup *memcg, + struct scan_control *sc, bool actual_reclaim) { - unsigned long inactive_ratio; - unsigned long inactive, active; - enum lru_list inactive_lru = file * LRU_FILE; enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + enum lru_list inactive_lru = file * LRU_FILE; + unsigned long inactive, active; + unsigned long inactive_ratio; + unsigned long refaults; unsigned long gb; /* @@ -2040,27 +2045,43 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); - gb = (inactive + active) >> (30 - PAGE_SHIFT); - if (gb) - inactive_ratio = int_sqrt(10 * gb); + if (memcg) + refaults = mem_cgroup_read_stat(memcg, + MEMCG_WORKINGSET_ACTIVATE); else - inactive_ratio = 1; + refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); + + /* + * When refaults are being observed, it means a new workingset + * is being established. Disable active list protection to get + * rid of the stale workingset quickly. + */ + if (file && actual_reclaim && lruvec->refaults != refaults) { + inactive_ratio = 0; + } else { + gb = (inactive + active) >> (30 - PAGE_SHIFT); + if (gb) + inactive_ratio = int_sqrt(10 * gb); + else + inactive_ratio = 1; + } - if (trace) - trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id, - sc->reclaim_idx, - lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, - lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, - inactive_ratio, file); + if (actual_reclaim) + trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx, + lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, + lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, + inactive_ratio, file); return inactive * inactive_ratio < active; } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, - struct lruvec *lruvec, struct scan_control *sc) + struct lruvec *lruvec, struct mem_cgroup *memcg, + struct scan_control *sc) { if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true)) + if (inactive_list_is_low(lruvec, is_file_lru(lru), + memcg, sc, true)) shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } @@ -2169,7 +2190,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * lruvec even if it has plenty of old anonymous pages unless the * system is under heavy pressure. */ - if (!inactive_list_is_low(lruvec, true, sc, false) && + if (!inactive_list_is_low(lruvec, true, memcg, sc, false) && lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { scan_balance = SCAN_FILE; goto out; @@ -2320,7 +2341,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc nr[lru] -= nr_to_scan; nr_reclaimed += shrink_list(lru, nr_to_scan, - lruvec, sc); + lruvec, memcg, sc); } } @@ -2387,7 +2408,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. 
*/ - if (inactive_list_is_low(lruvec, false, sc, true)) + if (inactive_list_is_low(lruvec, false, memcg, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } @@ -2703,6 +2724,26 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) sc->gfp_mask = orig_mask; } +static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) +{ + struct mem_cgroup *memcg; + + memcg = mem_cgroup_iter(root_memcg, NULL, NULL); + do { + unsigned long refaults; + struct lruvec *lruvec; + + if (memcg) + refaults = mem_cgroup_read_stat(memcg, + MEMCG_WORKINGSET_ACTIVATE); + else + refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); + + lruvec = mem_cgroup_lruvec(pgdat, memcg); + lruvec->refaults = refaults; + } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL))); +} + /* * This is the main entry point to direct page reclaim. * @@ -2723,6 +2764,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { int initial_priority = sc->priority; + pg_data_t *last_pgdat; + struct zoneref *z; + struct zone *zone; retry: delayacct_freepages_start(); @@ -2749,6 +2793,15 @@ retry: sc->may_writepage = 1; } while (--sc->priority >= 0); + last_pgdat = NULL; + for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, + sc->nodemask) { + if (zone->zone_pgdat == last_pgdat) + continue; + last_pgdat = zone->zone_pgdat; + snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); + } + delayacct_freepages_end(); if (sc->nr_reclaimed) @@ -3033,7 +3086,7 @@ static void age_active_anon(struct pglist_data *pgdat, do { struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); - if (inactive_list_is_low(lruvec, false, sc, true)) + if (inactive_list_is_low(lruvec, false, memcg, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -3280,6 +3333,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) pgdat->kswapd_failures++; out: + snapshot_refaults(NULL, pgdat); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. 
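The hunks above hinge on one comparison: the lruvec's saved refault snapshot against the current WORKINGSET_ACTIVATE count. A stripped-down model of that decision, with invented toy types (the real logic lives in inactive_list_is_low() above; this is a sketch, not the kernel code):

	#include <stdbool.h>

	/* Invented stand-in for struct lruvec; only the snapshot field matters. */
	struct toy_lruvec {
		unsigned long refaults;	/* WORKINGSET_ACTIVATE at the last cycle */
	};

	/*
	 * Keep protecting the active list? If file refaults were activated
	 * since the last snapshot, a new workingset is forming, so protection
	 * is dropped and the stale workingset can be reclaimed quickly.
	 */
	static bool keep_active_protection(const struct toy_lruvec *lruvec,
					   unsigned long cur_refaults,
					   bool file, bool actual_reclaim)
	{
		if (file && actual_reclaim && lruvec->refaults != cur_refaults)
			return false;
		return true;
	}

	int main(void)
	{
		struct toy_lruvec v = { .refaults = 100 };

		/* refaults moved 100 -> 150: returns false (0), drop protection */
		return keep_active_protection(&v, 150, true, true);
	}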
If another caller diff --git a/mm/workingset.c b/mm/workingset.c index eda05c71fa49..51c6f61d4cea 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -269,7 +269,6 @@ bool workingset_refault(void *shadow) lruvec = mem_cgroup_lruvec(pgdat, memcg); refault = atomic_long_read(&lruvec->inactive_age); active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); - rcu_read_unlock(); /* * The unsigned subtraction here gives an accurate distance @@ -290,11 +289,15 @@ bool workingset_refault(void *shadow) refault_distance = (refault - eviction) & EVICTION_MASK; inc_node_state(pgdat, WORKINGSET_REFAULT); + mem_cgroup_inc_stat(memcg, MEMCG_WORKINGSET_REFAULT); if (refault_distance <= active_file) { inc_node_state(pgdat, WORKINGSET_ACTIVATE); + mem_cgroup_inc_stat(memcg, MEMCG_WORKINGSET_ACTIVATE); + rcu_read_unlock(); return true; } + rcu_read_unlock(); return false; } @@ -472,6 +475,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); + mem_cgroup_inc_page_stat(virt_to_page(node), + MEMCG_WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->page_tree, node, workingset_update_node, mapping); -- cgit v1.2.3 From 31176c781508e4e35b1cc4ae2f0a5abd1f4ea689 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 May 2017 14:55:07 -0700 Subject: mm: memcontrol: clean up memory.events counting function We only ever count single events, drop the @nr parameter. Rename the function accordingly. Remove low-information kerneldoc. Link: http://lkml.kernel.org/r/20170404220148.28338-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 18 +++++------------- mm/memcontrol.c | 8 ++++---- mm/vmscan.c | 2 +- 3 files changed, 10 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index cfa91a3ca0ca..bc0c16e284c0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -287,17 +287,10 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -/** - * mem_cgroup_events - count memory events against a cgroup - * @memcg: the memory cgroup - * @idx: the event index - * @nr: the number of events to account for - */ -static inline void mem_cgroup_events(struct mem_cgroup *memcg, - enum mem_cgroup_events_index idx, - unsigned int nr) +static inline void mem_cgroup_event(struct mem_cgroup *memcg, + enum mem_cgroup_events_index idx) { - this_cpu_add(memcg->stat->events[idx], nr); + this_cpu_inc(memcg->stat->events[idx]); cgroup_file_notify(&memcg->events_file); } @@ -614,9 +607,8 @@ static inline bool mem_cgroup_disabled(void) return true; } -static inline void mem_cgroup_events(struct mem_cgroup *memcg, - enum mem_cgroup_events_index idx, - unsigned int nr) +static inline void mem_cgroup_event(struct mem_cgroup *memcg, + enum mem_cgroup_events_index idx) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 108d5b097db1..1ffa3ad201ea 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1825,7 +1825,7 @@ static void reclaim_high(struct mem_cgroup *memcg, do { if (page_counter_read(&memcg->memory) <= memcg->high) continue; - mem_cgroup_events(memcg, MEMCG_HIGH, 1); + mem_cgroup_event(memcg, MEMCG_HIGH); try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); } while ((memcg = 
parent_mem_cgroup(memcg))); } @@ -1916,7 +1916,7 @@ retry: if (!gfpflags_allow_blocking(gfp_mask)) goto nomem; - mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); + mem_cgroup_event(mem_over_limit, MEMCG_MAX); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, may_swap); @@ -1959,7 +1959,7 @@ retry: if (fatal_signal_pending(current)) goto force; - mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); + mem_cgroup_event(mem_over_limit, MEMCG_OOM); mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE)); @@ -5142,7 +5142,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, continue; } - mem_cgroup_events(memcg, MEMCG_OOM, 1); + mem_cgroup_event(memcg, MEMCG_OOM); if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) break; } diff --git a/mm/vmscan.c b/mm/vmscan.c index e8cb983a8c84..fbec74af2b69 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2526,7 +2526,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) sc->memcg_low_skipped = 1; continue; } - mem_cgroup_events(memcg, MEMCG_LOW, 1); + mem_cgroup_event(memcg, MEMCG_LOW); } reclaimed = sc->nr_reclaimed; -- cgit v1.2.3 From df0e53d0619e83b465e363c088bf4eeb2848273b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 May 2017 14:55:10 -0700 Subject: mm: memcontrol: re-use global VM event enum The current duplication is a high-maintenance mess, and it's painful to add new items. This increases the size of the event array, but we'll eventually want most of the VM events tracked on a per-cgroup basis anyway. Link: http://lkml.kernel.org/r/20170404220148.28338-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 45 ++++++++++++--------------------------- mm/memcontrol.c | 53 ++++++++++++++++++++++++---------------------- 2 files changed, 42 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bc0c16e284c0..0bb5f055bd26 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -69,20 +69,6 @@ struct mem_cgroup_reclaim_cookie { unsigned int generation; }; -enum mem_cgroup_events_index { - MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ - MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ - MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ - MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ - MEM_CGROUP_EVENTS_NSTATS, - /* default hierarchy events */ - MEMCG_LOW = MEM_CGROUP_EVENTS_NSTATS, - MEMCG_HIGH, - MEMCG_MAX, - MEMCG_OOM, - MEMCG_NR_EVENTS, -}; - /* * Per memcg event counter is incremented at every pagein/pageout. With THP, * it will be incremated by the number of pages. 
This counter is used for @@ -106,6 +92,15 @@ struct mem_cgroup_id { atomic_t ref; }; +/* Cgroup-specific events, on top of universal VM events */ +enum memcg_event_item { + MEMCG_LOW = NR_VM_EVENT_ITEMS, + MEMCG_HIGH, + MEMCG_MAX, + MEMCG_OOM, + MEMCG_NR_EVENTS, +}; + struct mem_cgroup_stat_cpu { long count[MEMCG_NR_STAT]; unsigned long events[MEMCG_NR_EVENTS]; @@ -288,9 +283,9 @@ static inline bool mem_cgroup_disabled(void) } static inline void mem_cgroup_event(struct mem_cgroup *memcg, - enum mem_cgroup_events_index idx) + enum memcg_event_item event) { - this_cpu_inc(memcg->stat->events[idx]); + this_cpu_inc(memcg->stat->events[event]); cgroup_file_notify(&memcg->events_file); } @@ -575,20 +570,8 @@ static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (unlikely(!memcg)) - goto out; - - switch (idx) { - case PGFAULT: - this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); - break; - case PGMAJFAULT: - this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); - break; - default: - BUG(); - } -out: + if (likely(memcg)) + this_cpu_inc(memcg->stat->events[idx]); rcu_read_unlock(); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -608,7 +591,7 @@ static inline bool mem_cgroup_disabled(void) } static inline void mem_cgroup_event(struct mem_cgroup *memcg, - enum mem_cgroup_events_index idx) + enum memcg_event_item event) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1ffa3ad201ea..6b42887e5f14 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -111,13 +111,6 @@ static const char * const mem_cgroup_stat_names[] = { "swap", }; -static const char * const mem_cgroup_events_names[] = { - "pgpgin", - "pgpgout", - "pgfault", - "pgmajfault", -}; - static const char * const mem_cgroup_lru_names[] = { "inactive_anon", "active_anon", @@ -571,13 +564,13 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) */ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, - enum mem_cgroup_events_index idx) + enum memcg_event_item event) { unsigned long val = 0; int cpu; for_each_possible_cpu(cpu) - val += per_cpu(memcg->stat->events[idx], cpu); + val += per_cpu(memcg->stat->events[event], cpu); return val; } @@ -608,9 +601,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, /* pagein of a big page is an event. 
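The enum trick in the hunk above is worth spelling out: cgroup-only event items are appended after the shared VM event items, so a single counter array can be indexed by either enum. A cut-down, compilable sketch (the kernel's vm_event_item has many more entries than shown here):

	#include <stdio.h>

	/* Abbreviated stand-in for the kernel's vm_event_item enum. */
	enum vm_event_item { PGPGIN, PGPGOUT, PGFAULT, PGMAJFAULT,
			     NR_VM_EVENT_ITEMS };

	/* Cgroup-specific events appended after the shared items. */
	enum memcg_event_item {
		MEMCG_LOW = NR_VM_EVENT_ITEMS,
		MEMCG_HIGH,
		MEMCG_MAX,
		MEMCG_OOM,
		MEMCG_NR_EVENTS,
	};

	int main(void)
	{
		unsigned long events[MEMCG_NR_EVENTS] = { 0 };

		events[PGFAULT]++;	/* shared VM event index */
		events[MEMCG_OOM]++;	/* cgroup-only index, same array */
		printf("%lu %lu\n", events[PGFAULT], events[MEMCG_OOM]);
		return 0;
	}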
So, ignore page size */ if (nr_pages > 0) - __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); + __this_cpu_inc(memcg->stat->events[PGPGIN]); else { - __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); + __this_cpu_inc(memcg->stat->events[PGPGOUT]); nr_pages = -nr_pages; /* for event */ } @@ -3119,6 +3112,21 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) } #endif /* CONFIG_NUMA */ +/* Universal VM events cgroup1 shows, original sort order */ +unsigned int memcg1_events[] = { + PGPGIN, + PGPGOUT, + PGFAULT, + PGMAJFAULT, +}; + +static const char *const memcg1_event_names[] = { + "pgpgin", + "pgpgout", + "pgfault", + "pgmajfault", +}; + static int memcg_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); @@ -3128,8 +3136,6 @@ static int memcg_stat_show(struct seq_file *m, void *v) BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) != MEM_CGROUP_STAT_NSTATS); - BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) != - MEM_CGROUP_EVENTS_NSTATS); BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { @@ -3139,9 +3145,9 @@ static int memcg_stat_show(struct seq_file *m, void *v) mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); } - for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) - seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], - mem_cgroup_read_events(memcg, i)); + for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) + seq_printf(m, "%s %lu\n", memcg1_event_names[i], + mem_cgroup_read_events(memcg, memcg1_events[i])); for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], @@ -3169,13 +3175,12 @@ static int memcg_stat_show(struct seq_file *m, void *v) seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val); } - for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { + for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) { unsigned long long val = 0; for_each_mem_cgroup_tree(mi, memcg) - val += mem_cgroup_read_events(mi, i); - seq_printf(m, "total_%s %llu\n", - mem_cgroup_events_names[i], val); + val += mem_cgroup_read_events(mi, memcg1_events[i]); + seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val); } for (i = 0; i < NR_LRU_LISTS; i++) { @@ -5222,10 +5227,8 @@ static int memory_stat_show(struct seq_file *m, void *v) /* Accumulated memory events */ - seq_printf(m, "pgfault %lu\n", - events[MEM_CGROUP_EVENTS_PGFAULT]); - seq_printf(m, "pgmajfault %lu\n", - events[MEM_CGROUP_EVENTS_PGMAJFAULT]); + seq_printf(m, "pgfault %lu\n", events[PGFAULT]); + seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]); seq_printf(m, "workingset_refault %lu\n", stat[MEMCG_WORKINGSET_REFAULT]); @@ -5493,7 +5496,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_SHMEM], nr_shmem); - __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); + __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout); __this_cpu_add(memcg->stat->nr_page_events, nr_pages); memcg_check_events(memcg, dummy_page); local_irq_restore(flags); -- cgit v1.2.3 From 71cd31135d4cf030a057ed7079a75a40c0a4a796 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 May 2017 14:55:13 -0700 Subject: mm: memcontrol: re-use node VM page state enum The current duplication is a high-maintenance mess, and it's painful to add new items or query memcg state from 
the rest of the VM. This increases the size of the stat array marginally, but we should aim to track all these stats on a per-cgroup level anyway. Link: http://lkml.kernel.org/r/20170404220148.28338-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 100 +++++++++++++++------------------ mm/memcontrol.c | 135 +++++++++++++++++++++++---------------------- mm/page-writeback.c | 10 ++-- mm/rmap.c | 4 +- mm/vmscan.c | 5 +- mm/workingset.c | 7 +-- 6 files changed, 123 insertions(+), 138 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0bb5f055bd26..0fa1f5de6841 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -35,40 +35,45 @@ struct page; struct mm_struct; struct kmem_cache; -/* - * The corresponding mem_cgroup_stat_names is defined in mm/memcontrol.c, - * These two lists should keep in accord with each other. - */ -enum mem_cgroup_stat_index { - /* - * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. - */ - MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ - MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ - MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ - MEM_CGROUP_STAT_SHMEM, /* # of pages charged as shmem */ - MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ - MEM_CGROUP_STAT_DIRTY, /* # of dirty pages in page cache */ - MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ - MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ - MEM_CGROUP_STAT_NSTATS, - /* default hierarchy stats */ - MEMCG_KERNEL_STACK_KB = MEM_CGROUP_STAT_NSTATS, +/* Cgroup-specific page state, on top of universal node page state */ +enum memcg_stat_item { + MEMCG_CACHE = NR_VM_NODE_STAT_ITEMS, + MEMCG_RSS, + MEMCG_RSS_HUGE, + MEMCG_SWAP, + MEMCG_SOCK, + /* XXX: why are these zone and not node counters? */ + MEMCG_KERNEL_STACK_KB, MEMCG_SLAB_RECLAIMABLE, MEMCG_SLAB_UNRECLAIMABLE, - MEMCG_SOCK, - MEMCG_WORKINGSET_REFAULT, - MEMCG_WORKINGSET_ACTIVATE, - MEMCG_WORKINGSET_NODERECLAIM, MEMCG_NR_STAT, }; +/* Cgroup-specific events, on top of universal VM events */ +enum memcg_event_item { + MEMCG_LOW = NR_VM_EVENT_ITEMS, + MEMCG_HIGH, + MEMCG_MAX, + MEMCG_OOM, + MEMCG_NR_EVENTS, +}; + struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; int priority; unsigned int generation; }; +#ifdef CONFIG_MEMCG + +#define MEM_CGROUP_ID_SHIFT 16 +#define MEM_CGROUP_ID_MAX USHRT_MAX + +struct mem_cgroup_id { + int id; + atomic_t ref; +}; + /* * Per memcg event counter is incremented at every pagein/pageout. With THP, * it will be incremated by the number of pages. 
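This commit applies the same layout to page state that the previous one applied to events: cgroup-only stat items start at NR_VM_NODE_STAT_ITEMS, so shared node items like NR_FILE_DIRTY index the memcg counter array directly. An abbreviated sketch; the real enums are far larger:

	/* Abbreviated stand-ins for the kernel enums. */
	enum node_stat_item { NR_FILE_MAPPED, NR_FILE_DIRTY, NR_WRITEBACK,
			      NR_SHMEM, NR_VM_NODE_STAT_ITEMS };

	enum memcg_stat_item {
		MEMCG_CACHE = NR_VM_NODE_STAT_ITEMS,
		MEMCG_RSS,
		MEMCG_RSS_HUGE,
		MEMCG_SWAP,
		MEMCG_SOCK,
		MEMCG_NR_STAT,
	};

	int main(void)
	{
		long count[MEMCG_NR_STAT] = { 0 };

		count[NR_FILE_DIRTY]++;	/* shared node item */
		count[MEMCG_RSS]++;	/* cgroup-only item, same array */
		return 0;
	}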
This counter is used for @@ -82,25 +87,6 @@ enum mem_cgroup_events_target { MEM_CGROUP_NTARGETS, }; -#ifdef CONFIG_MEMCG - -#define MEM_CGROUP_ID_SHIFT 16 -#define MEM_CGROUP_ID_MAX USHRT_MAX - -struct mem_cgroup_id { - int id; - atomic_t ref; -}; - -/* Cgroup-specific events, on top of universal VM events */ -enum memcg_event_item { - MEMCG_LOW = NR_VM_EVENT_ITEMS, - MEMCG_HIGH, - MEMCG_MAX, - MEMCG_OOM, - MEMCG_NR_EVENTS, -}; - struct mem_cgroup_stat_cpu { long count[MEMCG_NR_STAT]; unsigned long events[MEMCG_NR_EVENTS]; @@ -487,7 +473,7 @@ void lock_page_memcg(struct page *page); void unlock_page_memcg(struct page *page); static inline unsigned long mem_cgroup_read_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) + enum memcg_stat_item idx) { long val = 0; int cpu; @@ -502,20 +488,20 @@ static inline unsigned long mem_cgroup_read_stat(struct mem_cgroup *memcg, } static inline void mem_cgroup_update_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx, int val) + enum memcg_stat_item idx, int val) { if (!mem_cgroup_disabled()) this_cpu_add(memcg->stat->count[idx], val); } static inline void mem_cgroup_inc_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) + enum memcg_stat_item idx) { mem_cgroup_update_stat(memcg, idx, 1); } static inline void mem_cgroup_dec_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) + enum memcg_stat_item idx) { mem_cgroup_update_stat(memcg, idx, -1); } @@ -538,20 +524,20 @@ static inline void mem_cgroup_dec_stat(struct mem_cgroup *memcg, * Kernel pages are an exception to this, since they'll never move. */ static inline void mem_cgroup_update_page_stat(struct page *page, - enum mem_cgroup_stat_index idx, int val) + enum memcg_stat_item idx, int val) { if (page->mem_cgroup) mem_cgroup_update_stat(page->mem_cgroup, idx, val); } static inline void mem_cgroup_inc_page_stat(struct page *page, - enum mem_cgroup_stat_index idx) + enum memcg_stat_item idx) { mem_cgroup_update_page_stat(page, idx, 1); } static inline void mem_cgroup_dec_page_stat(struct page *page, - enum mem_cgroup_stat_index idx) + enum memcg_stat_item idx) { mem_cgroup_update_page_stat(page, idx, -1); } @@ -760,33 +746,33 @@ static inline unsigned long mem_cgroup_read_stat(struct mem_cgroup *memcg, } static inline void mem_cgroup_update_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx, int val) + enum memcg_stat_item idx, int val) { } static inline void mem_cgroup_inc_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) + enum memcg_stat_item idx) { } static inline void mem_cgroup_dec_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) + enum memcg_stat_item idx) { } static inline void mem_cgroup_update_page_stat(struct page *page, - enum mem_cgroup_stat_index idx, + enum memcg_stat_item idx, int nr) { } static inline void mem_cgroup_inc_page_stat(struct page *page, - enum mem_cgroup_stat_index idx) + enum memcg_stat_item idx) { } static inline void mem_cgroup_dec_page_stat(struct page *page, - enum mem_cgroup_stat_index idx) + enum memcg_stat_item idx) { } @@ -906,7 +892,7 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) * @val: number of pages (positive or negative) */ static inline void memcg_kmem_update_page_stat(struct page *page, - enum mem_cgroup_stat_index idx, int val) + enum memcg_stat_item idx, int val) { if (memcg_kmem_enabled() && page->mem_cgroup) this_cpu_add(page->mem_cgroup->stat->count[idx], val); @@ -935,7 +921,7 @@ static inline void memcg_put_cache_ids(void) } static inline 
void memcg_kmem_update_page_stat(struct page *page, - enum mem_cgroup_stat_index idx, int val) + enum memcg_stat_item idx, int val) { } #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6b42887e5f14..6fe4c7fafbfc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -100,18 +100,7 @@ static bool do_memsw_account(void) return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account; } -static const char * const mem_cgroup_stat_names[] = { - "cache", - "rss", - "rss_huge", - "shmem", - "mapped_file", - "dirty", - "writeback", - "swap", -}; - -static const char * const mem_cgroup_lru_names[] = { +static const char *const mem_cgroup_lru_names[] = { "inactive_anon", "active_anon", "inactive_file", @@ -583,20 +572,16 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, * counted as CACHE even if it's on ANON LRU. */ if (PageAnon(page)) - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], - nr_pages); + __this_cpu_add(memcg->stat->count[MEMCG_RSS], nr_pages); else { - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], - nr_pages); + __this_cpu_add(memcg->stat->count[MEMCG_CACHE], nr_pages); if (PageSwapBacked(page)) - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SHMEM], - nr_pages); + __this_cpu_add(memcg->stat->count[NR_SHMEM], nr_pages); } if (compound) { VM_BUG_ON_PAGE(!PageTransHuge(page), page); - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], - nr_pages); + __this_cpu_add(memcg->stat->count[MEMCG_RSS_HUGE], nr_pages); } /* pagein of a big page is an event. So, ignore page size */ @@ -1125,6 +1110,28 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) return false; } +unsigned int memcg1_stats[] = { + MEMCG_CACHE, + MEMCG_RSS, + MEMCG_RSS_HUGE, + NR_SHMEM, + NR_FILE_MAPPED, + NR_FILE_DIRTY, + NR_WRITEBACK, + MEMCG_SWAP, +}; + +static const char *const memcg1_stat_names[] = { + "cache", + "rss", + "rss_huge", + "shmem", + "mapped_file", + "dirty", + "writeback", + "swap", +}; + #define K(x) ((x) << (PAGE_SHIFT-10)) /** * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. @@ -1169,11 +1176,11 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) pr_cont_cgroup_path(iter->css.cgroup); pr_cont(":"); - for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { + if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account) continue; - pr_cont(" %s:%luKB", mem_cgroup_stat_names[i], - K(mem_cgroup_read_stat(iter, i))); + pr_cont(" %s:%luKB", memcg1_stat_names[i], + K(mem_cgroup_read_stat(iter, memcg1_stats[i]))); } for (i = 0; i < NR_LRU_LISTS; i++) @@ -2362,7 +2369,7 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) head[i].mem_cgroup = head->mem_cgroup; - __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], + __this_cpu_sub(head->mem_cgroup->stat->count[MEMCG_RSS_HUGE], HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -2372,7 +2379,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, bool charge) { int val = (charge) ? 
1 : -1; - this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); + this_cpu_add(memcg->stat->count[MEMCG_SWAP], val); } /** @@ -2731,13 +2738,10 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) struct mem_cgroup *iter; for_each_mem_cgroup_tree(iter, memcg) { - val += mem_cgroup_read_stat(iter, - MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_read_stat(iter, - MEM_CGROUP_STAT_RSS); + val += mem_cgroup_read_stat(iter, MEMCG_CACHE); + val += mem_cgroup_read_stat(iter, MEMCG_RSS); if (swap) - val += mem_cgroup_read_stat(iter, - MEM_CGROUP_STAT_SWAP); + val += mem_cgroup_read_stat(iter, MEMCG_SWAP); } } else { if (!swap) @@ -3134,15 +3138,15 @@ static int memcg_stat_show(struct seq_file *m, void *v) struct mem_cgroup *mi; unsigned int i; - BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) != - MEM_CGROUP_STAT_NSTATS); + BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); - for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { + if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; - seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i], - mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); + seq_printf(m, "%s %lu\n", memcg1_stat_names[i], + mem_cgroup_read_stat(memcg, memcg1_stats[i]) * + PAGE_SIZE); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) @@ -3165,14 +3169,15 @@ static int memcg_stat_show(struct seq_file *m, void *v) seq_printf(m, "hierarchical_memsw_limit %llu\n", (u64)memsw * PAGE_SIZE); - for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long long val = 0; - if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) + if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; for_each_mem_cgroup_tree(mi, memcg) - val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; - seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val); + val += mem_cgroup_read_stat(mi, memcg1_stats[i]) * + PAGE_SIZE; + seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) { @@ -3645,10 +3650,10 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY); + *pdirty = mem_cgroup_read_stat(memcg, NR_FILE_DIRTY); /* this should eventually include NR_UNSTABLE_NFS */ - *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); + *pwriteback = mem_cgroup_read_stat(memcg, NR_WRITEBACK); *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | (1 << LRU_ACTIVE_FILE)); *pheadroom = PAGE_COUNTER_MAX; @@ -4504,10 +4509,8 @@ static int mem_cgroup_move_account(struct page *page, spin_lock_irqsave(&from->move_lock, flags); if (!anon && page_mapped(page)) { - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], - nr_pages); - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], - nr_pages); + __this_cpu_sub(from->stat->count[NR_FILE_MAPPED], nr_pages); + __this_cpu_add(to->stat->count[NR_FILE_MAPPED], nr_pages); } /* @@ -4519,18 +4522,16 @@ static int mem_cgroup_move_account(struct page *page, struct address_space *mapping = page_mapping(page); if (mapping_cap_account_dirty(mapping)) { - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY], + 
__this_cpu_sub(from->stat->count[NR_FILE_DIRTY], nr_pages); - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY], + __this_cpu_add(to->stat->count[NR_FILE_DIRTY], nr_pages); } } if (PageWriteback(page)) { - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], - nr_pages); - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], - nr_pages); + __this_cpu_sub(from->stat->count[NR_WRITEBACK], nr_pages); + __this_cpu_add(to->stat->count[NR_WRITEBACK], nr_pages); } /* @@ -5190,9 +5191,9 @@ static int memory_stat_show(struct seq_file *m, void *v) tree_events(memcg, events); seq_printf(m, "anon %llu\n", - (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); + (u64)stat[MEMCG_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", - (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); + (u64)stat[MEMCG_CACHE] * PAGE_SIZE); seq_printf(m, "kernel_stack %llu\n", (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); seq_printf(m, "slab %llu\n", @@ -5202,13 +5203,13 @@ static int memory_stat_show(struct seq_file *m, void *v) (u64)stat[MEMCG_SOCK] * PAGE_SIZE); seq_printf(m, "shmem %llu\n", - (u64)stat[MEM_CGROUP_STAT_SHMEM] * PAGE_SIZE); + (u64)stat[NR_SHMEM] * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", - (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE); + (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", - (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE); + (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE); seq_printf(m, "file_writeback %llu\n", - (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE); + (u64)stat[NR_WRITEBACK] * PAGE_SIZE); for (i = 0; i < NR_LRU_LISTS; i++) { struct mem_cgroup *mi; @@ -5231,11 +5232,11 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]); seq_printf(m, "workingset_refault %lu\n", - stat[MEMCG_WORKINGSET_REFAULT]); + stat[WORKINGSET_REFAULT]); seq_printf(m, "workingset_activate %lu\n", - stat[MEMCG_WORKINGSET_ACTIVATE]); + stat[WORKINGSET_ACTIVATE]); seq_printf(m, "workingset_nodereclaim %lu\n", - stat[MEMCG_WORKINGSET_NODERECLAIM]); + stat[WORKINGSET_NODERECLAIM]); return 0; } @@ -5492,10 +5493,10 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, } local_irq_save(flags); - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_SHMEM], nr_shmem); + __this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon); + __this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file); + __this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge); + __this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem); __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout); __this_cpu_add(memcg->stat->nr_page_events, nr_pages); memcg_check_events(memcg, dummy_page); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 33df0583edb9..777711203809 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2427,7 +2427,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) inode_attach_wb(inode, page); wb = inode_to_wb(inode); - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY); + mem_cgroup_inc_page_stat(page, NR_FILE_DIRTY); __inc_node_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); __inc_node_page_state(page, NR_DIRTIED); @@ -2449,7 +2449,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, struct bdi_writeback *wb) { 
if (mapping_cap_account_dirty(mapping)) { - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); + mem_cgroup_dec_page_stat(page, NR_FILE_DIRTY); dec_node_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); @@ -2706,7 +2706,7 @@ int clear_page_dirty_for_io(struct page *page) */ wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); + mem_cgroup_dec_page_stat(page, NR_FILE_DIRTY); dec_node_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); @@ -2753,7 +2753,7 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_dec_page_stat(page, NR_WRITEBACK); dec_node_page_state(page, NR_WRITEBACK); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); inc_node_page_state(page, NR_WRITTEN); @@ -2808,7 +2808,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) ret = TestSetPageWriteback(page); } if (!ret) { - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_inc_page_stat(page, NR_WRITEBACK); inc_node_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); } diff --git a/mm/rmap.c b/mm/rmap.c index e303fdbee561..a6d018c4a13a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1158,7 +1158,7 @@ void page_add_file_rmap(struct page *page, bool compound) goto out; } __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); - mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, nr); + mem_cgroup_update_page_stat(page, NR_FILE_MAPPED, nr); out: unlock_page_memcg(page); } @@ -1198,7 +1198,7 @@ static void page_remove_file_rmap(struct page *page, bool compound) * pte lock(a spinlock) is held, which implies preemption disabled. 
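A side effect of sharing the node enum, visible in the page-writeback.c and rmap.c hunks above, is that each call site now feeds the same index to both accounting layers. A mocked, compilable illustration of that shape (the kernel calls are named in comments; everything else is invented):

	#include <stdio.h>

	enum { NR_FILE_DIRTY, NR_ITEMS };

	static long node_state[NR_ITEMS];	/* stand-in for node vmstat */
	static long memcg_state[NR_ITEMS];	/* stand-in for memcg counters */

	/* Shape of the dirtying call site: one index, two counter layers. */
	static void account_page_dirtied(void)
	{
		memcg_state[NR_FILE_DIRTY]++;	/* mem_cgroup_inc_page_stat(page, NR_FILE_DIRTY) */
		node_state[NR_FILE_DIRTY]++;	/* __inc_node_page_state(page, NR_FILE_DIRTY) */
	}

	int main(void)
	{
		account_page_dirtied();
		printf("node %ld memcg %ld\n",
		       node_state[NR_FILE_DIRTY], memcg_state[NR_FILE_DIRTY]);
		return 0;
	}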
*/ __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); - mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, -nr); + mem_cgroup_update_page_stat(page, NR_FILE_MAPPED, -nr); if (unlikely(PageMlocked(page))) clear_page_mlock(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index fbec74af2b69..417b6657e994 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2046,8 +2046,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); if (memcg) - refaults = mem_cgroup_read_stat(memcg, - MEMCG_WORKINGSET_ACTIVATE); + refaults = mem_cgroup_read_stat(memcg, WORKINGSET_ACTIVATE); else refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); @@ -2735,7 +2734,7 @@ static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) if (memcg) refaults = mem_cgroup_read_stat(memcg, - MEMCG_WORKINGSET_ACTIVATE); + WORKINGSET_ACTIVATE); else refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); diff --git a/mm/workingset.c b/mm/workingset.c index 51c6f61d4cea..37fc1057cd86 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -289,11 +289,11 @@ bool workingset_refault(void *shadow) refault_distance = (refault - eviction) & EVICTION_MASK; inc_node_state(pgdat, WORKINGSET_REFAULT); - mem_cgroup_inc_stat(memcg, MEMCG_WORKINGSET_REFAULT); + mem_cgroup_inc_stat(memcg, WORKINGSET_REFAULT); if (refault_distance <= active_file) { inc_node_state(pgdat, WORKINGSET_ACTIVATE); - mem_cgroup_inc_stat(memcg, MEMCG_WORKINGSET_ACTIVATE); + mem_cgroup_inc_stat(memcg, WORKINGSET_ACTIVATE); rcu_read_unlock(); return true; } @@ -475,8 +475,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); - mem_cgroup_inc_page_stat(virt_to_page(node), - MEMCG_WORKINGSET_NODERECLAIM); + mem_cgroup_inc_page_stat(virt_to_page(node), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->page_tree, node, workingset_update_node, mapping); -- cgit v1.2.3 From ccda7f4360be86b87497c50d1f58aab3fd85a9a5 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 May 2017 14:55:16 -0700 Subject: mm: memcontrol: use node page state naming scheme for memcg The memory controller's stat function names are awkwardly long and arbitrarily different from the zone and node stat functions.
The current interface is named: mem_cgroup_read_stat() mem_cgroup_update_stat() mem_cgroup_inc_stat() mem_cgroup_dec_stat() mem_cgroup_update_page_stat() mem_cgroup_inc_page_stat() mem_cgroup_dec_page_stat() This patch renames it to match the corresponding node stat functions: memcg_page_state() [node_page_state()] mod_memcg_state() [mod_node_state()] inc_memcg_state() [inc_node_state()] dec_memcg_state() [dec_node_state()] mod_memcg_page_state() [mod_node_page_state()] inc_memcg_page_state() [inc_node_page_state()] dec_memcg_page_state() [dec_node_page_state()] Link: http://lkml.kernel.org/r/20170404220148.28338-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 73 +++++++++++++++++++++++----------------------- mm/memcontrol.c | 38 ++++++++++++------------ mm/page-writeback.c | 10 +++---- mm/rmap.c | 4 +-- mm/vmscan.c | 5 ++-- mm/workingset.c | 6 ++-- 6 files changed, 68 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0fa1f5de6841..899949bbb2f9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -472,8 +472,8 @@ extern int do_swap_account; void lock_page_memcg(struct page *page); void unlock_page_memcg(struct page *page); -static inline unsigned long mem_cgroup_read_stat(struct mem_cgroup *memcg, - enum memcg_stat_item idx) +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) { long val = 0; int cpu; @@ -487,27 +487,27 @@ static inline unsigned long mem_cgroup_read_stat(struct mem_cgroup *memcg, return val; } -static inline void mem_cgroup_update_stat(struct mem_cgroup *memcg, - enum memcg_stat_item idx, int val) +static inline void mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, int val) { if (!mem_cgroup_disabled()) this_cpu_add(memcg->stat->count[idx], val); } -static inline void mem_cgroup_inc_stat(struct mem_cgroup *memcg, - enum memcg_stat_item idx) +static inline void inc_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) { - mem_cgroup_update_stat(memcg, idx, 1); + mod_memcg_state(memcg, idx, 1); } -static inline void mem_cgroup_dec_stat(struct mem_cgroup *memcg, - enum memcg_stat_item idx) +static inline void dec_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) { - mem_cgroup_update_stat(memcg, idx, -1); + mod_memcg_state(memcg, idx, -1); } /** - * mem_cgroup_update_page_stat - update page state statistics + * mod_memcg_page_state - update page state statistics * @page: the page * @idx: page state item to account * @val: number of pages (positive or negative) @@ -518,28 +518,28 @@ static inline void mem_cgroup_dec_stat(struct mem_cgroup *memcg, * * lock_page(page) or lock_page_memcg(page) * if (TestClearPageState(page)) - * mem_cgroup_update_page_stat(page, state, -1); + * mod_memcg_page_state(page, state, -1); * unlock_page(page) or unlock_page_memcg(page) * * Kernel pages are an exception to this, since they'll never move. 
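The renamed helpers keep the layering of their node-side counterparts: inc/dec are one-line wrappers over the mod function. A trivial mocked sketch of that shape (plain C, counters reduced to an array; not the kernel implementation):

	static long counters[16];	/* stand-in for memcg->stat->count[] */

	static void mod_memcg_state(int idx, int val) { counters[idx] += val; }
	static void inc_memcg_state(int idx) { mod_memcg_state(idx, 1); }
	static void dec_memcg_state(int idx) { mod_memcg_state(idx, -1); }

	int main(void)
	{
		inc_memcg_state(0);
		dec_memcg_state(0);
		return (int)counters[0];	/* balanced: returns 0 */
	}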
*/ -static inline void mem_cgroup_update_page_stat(struct page *page, - enum memcg_stat_item idx, int val) +static inline void mod_memcg_page_state(struct page *page, + enum memcg_stat_item idx, int val) { if (page->mem_cgroup) - mem_cgroup_update_stat(page->mem_cgroup, idx, val); + mod_memcg_state(page->mem_cgroup, idx, val); } -static inline void mem_cgroup_inc_page_stat(struct page *page, - enum memcg_stat_item idx) +static inline void inc_memcg_page_state(struct page *page, + enum memcg_stat_item idx) { - mem_cgroup_update_page_stat(page, idx, 1); + mod_memcg_page_state(page, idx, 1); } -static inline void mem_cgroup_dec_page_stat(struct page *page, - enum memcg_stat_item idx) +static inline void dec_memcg_page_state(struct page *page, + enum memcg_stat_item idx) { - mem_cgroup_update_page_stat(page, idx, -1); + mod_memcg_page_state(page, idx, -1); } unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, @@ -739,40 +739,41 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) return false; } -static inline unsigned long mem_cgroup_read_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) { return 0; } -static inline void mem_cgroup_update_stat(struct mem_cgroup *memcg, - enum memcg_stat_item idx, int val) +static inline void mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, + int nr) { } -static inline void mem_cgroup_inc_stat(struct mem_cgroup *memcg, - enum memcg_stat_item idx) +static inline void inc_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) { } -static inline void mem_cgroup_dec_stat(struct mem_cgroup *memcg, - enum memcg_stat_item idx) +static inline void dec_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) { } -static inline void mem_cgroup_update_page_stat(struct page *page, - enum memcg_stat_item idx, - int nr) +static inline void mod_memcg_page_state(struct page *page, + enum memcg_stat_item idx, + int nr) { } -static inline void mem_cgroup_inc_page_stat(struct page *page, - enum memcg_stat_item idx) +static inline void inc_memcg_page_state(struct page *page, + enum memcg_stat_item idx) { } -static inline void mem_cgroup_dec_page_stat(struct page *page, - enum memcg_stat_item idx) +static inline void dec_memcg_page_state(struct page *page, + enum memcg_stat_item idx) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6fe4c7fafbfc..ff73899af61a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -552,8 +552,8 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) * implemented. 
*/ -static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, - enum memcg_event_item event) +static unsigned long memcg_sum_events(struct mem_cgroup *memcg, + enum memcg_event_item event) { unsigned long val = 0; int cpu; @@ -1180,7 +1180,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account) continue; pr_cont(" %s:%luKB", memcg1_stat_names[i], - K(mem_cgroup_read_stat(iter, memcg1_stats[i]))); + K(memcg_page_state(iter, memcg1_stats[i]))); } for (i = 0; i < NR_LRU_LISTS; i++) @@ -2713,7 +2713,7 @@ static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat) for_each_mem_cgroup_tree(iter, memcg) { for (i = 0; i < MEMCG_NR_STAT; i++) - stat[i] += mem_cgroup_read_stat(iter, i); + stat[i] += memcg_page_state(iter, i); } } @@ -2726,7 +2726,7 @@ static void tree_events(struct mem_cgroup *memcg, unsigned long *events) for_each_mem_cgroup_tree(iter, memcg) { for (i = 0; i < MEMCG_NR_EVENTS; i++) - events[i] += mem_cgroup_read_events(iter, i); + events[i] += memcg_sum_events(iter, i); } } @@ -2738,10 +2738,10 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) struct mem_cgroup *iter; for_each_mem_cgroup_tree(iter, memcg) { - val += mem_cgroup_read_stat(iter, MEMCG_CACHE); - val += mem_cgroup_read_stat(iter, MEMCG_RSS); + val += memcg_page_state(iter, MEMCG_CACHE); + val += memcg_page_state(iter, MEMCG_RSS); if (swap) - val += mem_cgroup_read_stat(iter, MEMCG_SWAP); + val += memcg_page_state(iter, MEMCG_SWAP); } } else { if (!swap) @@ -3145,13 +3145,13 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; seq_printf(m, "%s %lu\n", memcg1_stat_names[i], - mem_cgroup_read_stat(memcg, memcg1_stats[i]) * + memcg_page_state(memcg, memcg1_stats[i]) * PAGE_SIZE); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) seq_printf(m, "%s %lu\n", memcg1_event_names[i], - mem_cgroup_read_events(memcg, memcg1_events[i])); + memcg_sum_events(memcg, memcg1_events[i])); for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], @@ -3175,7 +3175,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; for_each_mem_cgroup_tree(mi, memcg) - val += mem_cgroup_read_stat(mi, memcg1_stats[i]) * + val += memcg_page_state(mi, memcg1_stats[i]) * PAGE_SIZE; seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val); } @@ -3184,7 +3184,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) unsigned long long val = 0; for_each_mem_cgroup_tree(mi, memcg) - val += mem_cgroup_read_events(mi, memcg1_events[i]); + val += memcg_sum_events(mi, memcg1_events[i]); seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val); } @@ -3650,10 +3650,10 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - *pdirty = mem_cgroup_read_stat(memcg, NR_FILE_DIRTY); + *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); /* this should eventually include NR_UNSTABLE_NFS */ - *pwriteback = mem_cgroup_read_stat(memcg, NR_WRITEBACK); + *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | (1 << LRU_ACTIVE_FILE)); *pheadroom = PAGE_COUNTER_MAX; @@ -4515,7 +4515,7 @@ static int mem_cgroup_move_account(struct page *page, /* * move_lock grabbed above and caller set 
from->moving_account, so - * mem_cgroup_update_page_stat() will serialize updates to PageDirty. + * mod_memcg_page_state will serialize updates to PageDirty. * So mapping should be stable for dirty pages. */ if (!anon && PageDirty(page)) { @@ -5161,10 +5161,10 @@ static int memory_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW)); - seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH)); - seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX)); - seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM)); + seq_printf(m, "low %lu\n", memcg_sum_events(memcg, MEMCG_LOW)); + seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH)); + seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX)); + seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM)); return 0; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 777711203809..2359608d2568 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2427,7 +2427,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) inode_attach_wb(inode, page); wb = inode_to_wb(inode); - mem_cgroup_inc_page_stat(page, NR_FILE_DIRTY); + inc_memcg_page_state(page, NR_FILE_DIRTY); __inc_node_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); __inc_node_page_state(page, NR_DIRTIED); @@ -2449,7 +2449,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, struct bdi_writeback *wb) { if (mapping_cap_account_dirty(mapping)) { - mem_cgroup_dec_page_stat(page, NR_FILE_DIRTY); + dec_memcg_page_state(page, NR_FILE_DIRTY); dec_node_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); @@ -2706,7 +2706,7 @@ int clear_page_dirty_for_io(struct page *page) */ wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { - mem_cgroup_dec_page_stat(page, NR_FILE_DIRTY); + dec_memcg_page_state(page, NR_FILE_DIRTY); dec_node_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); @@ -2753,7 +2753,7 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - mem_cgroup_dec_page_stat(page, NR_WRITEBACK); + dec_memcg_page_state(page, NR_WRITEBACK); dec_node_page_state(page, NR_WRITEBACK); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); inc_node_page_state(page, NR_WRITTEN); @@ -2808,7 +2808,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) ret = TestSetPageWriteback(page); } if (!ret) { - mem_cgroup_inc_page_stat(page, NR_WRITEBACK); + inc_memcg_page_state(page, NR_WRITEBACK); inc_node_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); } diff --git a/mm/rmap.c b/mm/rmap.c index a6d018c4a13a..3ff241f714eb 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1158,7 +1158,7 @@ void page_add_file_rmap(struct page *page, bool compound) goto out; } __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); - mem_cgroup_update_page_stat(page, NR_FILE_MAPPED, nr); + mod_memcg_page_state(page, NR_FILE_MAPPED, nr); out: unlock_page_memcg(page); } @@ -1198,7 +1198,7 @@ static void page_remove_file_rmap(struct page *page, bool compound) * pte lock(a spinlock) is held, which implies preemption disabled. 
*/ __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); - mem_cgroup_update_page_stat(page, NR_FILE_MAPPED, -nr); + mod_memcg_page_state(page, NR_FILE_MAPPED, -nr); if (unlikely(PageMlocked(page))) clear_page_mlock(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index 417b6657e994..4e7ed65842af 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2046,7 +2046,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); if (memcg) - refaults = mem_cgroup_read_stat(memcg, WORKINGSET_ACTIVATE); + refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE); else refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); @@ -2733,8 +2733,7 @@ static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) struct lruvec *lruvec; if (memcg) - refaults = mem_cgroup_read_stat(memcg, - WORKINGSET_ACTIVATE); + refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE); else refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); diff --git a/mm/workingset.c b/mm/workingset.c index 37fc1057cd86..b8c9ab678479 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -289,11 +289,11 @@ bool workingset_refault(void *shadow) refault_distance = (refault - eviction) & EVICTION_MASK; inc_node_state(pgdat, WORKINGSET_REFAULT); - mem_cgroup_inc_stat(memcg, WORKINGSET_REFAULT); + inc_memcg_state(memcg, WORKINGSET_REFAULT); if (refault_distance <= active_file) { inc_node_state(pgdat, WORKINGSET_ACTIVATE); - mem_cgroup_inc_stat(memcg, WORKINGSET_ACTIVATE); + inc_memcg_state(memcg, WORKINGSET_ACTIVATE); rcu_read_unlock(); return true; } @@ -475,7 +475,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); - mem_cgroup_inc_page_stat(virt_to_page(node), WORKINGSET_NODERECLAIM); + inc_memcg_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->page_tree, node, workingset_update_node, mapping); -- cgit v1.2.3 From df6b7499806bffd233e6dd0465901827b0b385b8 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 3 May 2017 14:55:19 -0700 Subject: mm, swap: remove unused function prototype This is a code cleanup patch with no functionality changes. There are two unused function prototypes in swap.h; they are removed. Link: http://lkml.kernel.org/r/20170405071017.23677-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Tim Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 486494e6b2fc..ba5882419a7d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -411,9 +411,6 @@ struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); -extern int get_swap_slots(int n, swp_entry_t *slots); -extern void swapcache_free_batch(swp_entry_t *entries, int n); - #else /* CONFIG_SWAP */ #define swap_address_space(entry) (NULL) -- cgit v1.2.3 From 74da4a0f574d11ed60dbe50a1e5e942e20476590 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 3 Mar 2017 18:16:07 +0100 Subject: libceph, ceph: always advertise all supported features No reason to hide CephFS-specific features in the rbd case. Recent feature bits mix RADOS and CephFS-specific stuff together anyway.
Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 2 +- fs/ceph/super.c | 7 +------ include/linux/ceph/ceph_features.h | 4 ++++ include/linux/ceph/libceph.h | 5 +---- net/ceph/ceph_common.c | 16 ++++++---------- 5 files changed, 13 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 517838b65964..16010183b703 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -731,7 +731,7 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) kref_init(&rbdc->kref); INIT_LIST_HEAD(&rbdc->node); - rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); + rbdc->client = ceph_create_client(ceph_opts, rbdc); if (IS_ERR(rbdc->client)) goto out_rbdc; ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 0ec8d0114e57..8b9f645006da 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -544,10 +544,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; - const u64 supported_features = - CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH | - CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA; - const u64 required_features = 0; int page_count; size_t size; int err = -ENOMEM; @@ -556,8 +552,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, if (!fsc) return ERR_PTR(-ENOMEM); - fsc->client = ceph_create_client(opt, fsc, supported_features, - required_features); + fsc->client = ceph_create_client(opt, fsc); if (IS_ERR(fsc->client)) { err = PTR_ERR(fsc->client); goto fail; diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index ae2f66833762..fd8b2953c78f 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -105,8 +105,10 @@ static inline u64 ceph_sanitize_features(u64 features) */ #define CEPH_FEATURES_SUPPORTED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_FLOCK | \ CEPH_FEATURE_SUBSCRIBE2 | \ CEPH_FEATURE_RECONNECT_SEQ | \ + CEPH_FEATURE_DIRLAYOUTHASH | \ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ CEPH_FEATURE_OSDENC | \ @@ -114,11 +116,13 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_MSG_AUTH | \ CEPH_FEATURE_CRUSH_TUNABLES2 | \ CEPH_FEATURE_REPLY_CREATE_INODE | \ + CEPH_FEATURE_MDSENC | \ CEPH_FEATURE_OSDHASHPSPOOL | \ CEPH_FEATURE_OSD_CACHEPOOL | \ CEPH_FEATURE_CRUSH_V2 | \ CEPH_FEATURE_EXPORT_PEER | \ CEPH_FEATURE_OSDMAP_ENC | \ + CEPH_FEATURE_MDS_INLINE_DATA | \ CEPH_FEATURE_CRUSH_TUNABLES3 | \ CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ CEPH_FEATURE_MSGR_KEEPALIVE2 | \ diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 88cd5dc8e238..cecbf5a26e5a 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -262,10 +262,7 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client); extern void ceph_destroy_options(struct ceph_options *opt); extern int ceph_compare_options(struct ceph_options *new_opt, struct ceph_client *client); -extern struct ceph_client *ceph_create_client(struct ceph_options *opt, - void *private, - u64 supported_features, - u64 required_features); +struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private); struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client); u64 ceph_client_gid(struct ceph_client *client); extern void ceph_destroy_client(struct ceph_client *client); diff --git 
a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 108533859a53..4701d067169f 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -596,9 +596,7 @@ EXPORT_SYMBOL(ceph_client_gid); /* * create a fresh client instance */ -struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, - u64 supported_features, - u64 required_features) +struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private) { struct ceph_client *client; struct ceph_entity_addr *myaddr = NULL; @@ -615,14 +613,12 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, init_waitqueue_head(&client->auth_wq); client->auth_err = 0; - if (!ceph_test_opt(client, NOMSGAUTH)) - required_features |= CEPH_FEATURE_MSG_AUTH; - client->extra_mon_dispatch = NULL; - client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT | - supported_features; - client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT | - required_features; + client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT; + client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT; + + if (!ceph_test_opt(client, NOMSGAUTH)) + client->required_features |= CEPH_FEATURE_MSG_AUTH; /* msgr */ if (ceph_test_opt(client, MYIP)) -- cgit v1.2.3 From 06dfa96399a9a3280dd81e47f8696aa89f1783e7 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 17 Mar 2017 14:10:27 +0200 Subject: libceph: convert ceph_snap_context.nref from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This helps avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 3 ++- net/ceph/snapshot.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index cecbf5a26e5a..3229ae6c7846 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -161,7 +162,7 @@ struct ceph_client { * dirtied.
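The conversion in this and the following two patches is mechanical: refcount_set/refcount_inc/refcount_dec_and_test replace their atomic_t equivalents. What the diff cannot show is the behavioral gain, since refcount_t saturates instead of wrapping on overflow. A userspace approximation of the get/put pattern using C11 atomics (this models the old atomic_t semantics; the saturation checks are precisely what the kernel type adds on top):

	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct obj {
		atomic_int ref;
	};

	static struct obj *obj_get(struct obj *o)
	{
		atomic_fetch_add(&o->ref, 1);	/* kernel: refcount_inc() */
		return o;
	}

	static void obj_put(struct obj *o)
	{
		/* kernel: refcount_dec_and_test() frees on the last put */
		if (atomic_fetch_sub(&o->ref, 1) == 1) {
			printf("last ref dropped, freeing\n");
			free(o);
		}
	}

	int main(void)
	{
		struct obj *o = malloc(sizeof(*o));

		if (!o)
			return 1;
		atomic_init(&o->ref, 1);	/* kernel: refcount_set(&o->ref, 1) */
		obj_get(o);
		obj_put(o);
		obj_put(o);	/* last put frees */
		return 0;
	}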
*/ struct ceph_snap_context { - atomic_t nref; + refcount_t nref; u64 seq; u32 num_snaps; u64 snaps[]; diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c index 705414e78ae0..e14a5d038656 100644 --- a/net/ceph/snapshot.c +++ b/net/ceph/snapshot.c @@ -49,7 +49,7 @@ struct ceph_snap_context *ceph_create_snap_context(u32 snap_count, if (!snapc) return NULL; - atomic_set(&snapc->nref, 1); + refcount_set(&snapc->nref, 1); snapc->num_snaps = snap_count; return snapc; @@ -59,7 +59,7 @@ EXPORT_SYMBOL(ceph_create_snap_context); struct ceph_snap_context *ceph_get_snap_context(struct ceph_snap_context *sc) { if (sc) - atomic_inc(&sc->nref); + refcount_inc(&sc->nref); return sc; } EXPORT_SYMBOL(ceph_get_snap_context); @@ -68,7 +68,7 @@ void ceph_put_snap_context(struct ceph_snap_context *sc) { if (!sc) return; - if (atomic_dec_and_test(&sc->nref)) { + if (refcount_dec_and_test(&sc->nref)) { /*printk(" deleting snap_context %p\n", sc);*/ kfree(sc); } -- cgit v1.2.3 From 02113a0f14e20bd8e675d7cec16db6eaaf2b2380 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 17 Mar 2017 14:10:28 +0200 Subject: libceph: convert ceph_osd.o_ref from atomic_t to refcount_t The refcount_t type and its corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows us to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 3 ++- net/ceph/osd_client.c | 16 ++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index c125b5d9e13c..d6a625e75040 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -27,7 +28,7 @@ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *); /* a given osd we're communicating with */ struct ceph_osd { - atomic_t o_ref; + refcount_t o_ref; struct ceph_osd_client *o_osdc; int o_osd; int o_incarnation; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index e15ea9e4c495..b4500a8ab8b3 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1005,7 +1005,7 @@ static bool osd_registered(struct ceph_osd *osd) */ static void osd_init(struct ceph_osd *osd) { - atomic_set(&osd->o_ref, 1); + refcount_set(&osd->o_ref, 1); RB_CLEAR_NODE(&osd->o_node); osd->o_requests = RB_ROOT; osd->o_linger_requests = RB_ROOT; @@ -1050,9 +1050,9 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) static struct ceph_osd *get_osd(struct ceph_osd *osd) { - if (atomic_inc_not_zero(&osd->o_ref)) { - dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1, - atomic_read(&osd->o_ref)); + if (refcount_inc_not_zero(&osd->o_ref)) { + dout("get_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref)-1, + refcount_read(&osd->o_ref)); return osd; } else { dout("get_osd %p FAIL\n", osd); @@ -1062,9 +1062,9 @@ static struct ceph_osd *get_osd(struct ceph_osd *osd) static void put_osd(struct ceph_osd *osd) { - dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), - atomic_read(&osd->o_ref) - 1); - if (atomic_dec_and_test(&osd->o_ref)) { + dout("put_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref), + refcount_read(&osd->o_ref) - 1); + if (refcount_dec_and_test(&osd->o_ref)) { osd_cleanup(osd);
kfree(osd); } @@ -4126,7 +4126,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) close_osd(osd); } up_write(&osdc->lock); - WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1); + WARN_ON(refcount_read(&osdc->homeless_osd.o_ref) != 1); osd_cleanup(&osdc->homeless_osd); WARN_ON(!list_empty(&osdc->osd_lru)); -- cgit v1.2.3 From 0e1a5ee6577e43e5be55369d398107080b360941 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Fri, 17 Mar 2017 14:10:29 +0200 Subject: libceph: convert ceph_pagelist.refcnt from atomic_t to refcount_t The refcount_t type and its corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows us to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 2 +- include/linux/ceph/pagelist.h | 6 +++--- net/ceph/pagelist.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 074490542b4c..b16f1cf552a8 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1991,7 +1991,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_pagelist) { struct ceph_pagelist *pagelist = req->r_pagelist; - atomic_inc(&pagelist->refcnt); + refcount_inc(&pagelist->refcnt); ceph_msg_data_add_pagelist(msg, pagelist); msg->hdr.data_len = cpu_to_le32(pagelist->length); } else { diff --git a/include/linux/ceph/pagelist.h b/include/linux/ceph/pagelist.h index 13d71fe18b0c..75a7db21457d 100644 --- a/include/linux/ceph/pagelist.h +++ b/include/linux/ceph/pagelist.h @@ -2,7 +2,7 @@ #define __FS_CEPH_PAGELIST_H #include -#include +#include #include #include @@ -13,7 +13,7 @@ struct ceph_pagelist { size_t room; struct list_head free_list; size_t num_pages_free; - atomic_t refcnt; + refcount_t refcnt; }; struct ceph_pagelist_cursor { @@ -30,7 +30,7 @@ static inline void ceph_pagelist_init(struct ceph_pagelist *pl) pl->room = 0; INIT_LIST_HEAD(&pl->free_list); pl->num_pages_free = 0; - atomic_set(&pl->refcnt, 1); + refcount_set(&pl->refcnt, 1); } extern void ceph_pagelist_release(struct ceph_pagelist *pl); diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c index 6864007e64fc..ce09f73be759 100644 --- a/net/ceph/pagelist.c +++ b/net/ceph/pagelist.c @@ -16,7 +16,7 @@ static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl) void ceph_pagelist_release(struct ceph_pagelist *pl) { - if (!atomic_dec_and_test(&pl->refcnt)) + if (!refcount_dec_and_test(&pl->refcnt)) return; ceph_pagelist_unmap_tail(pl); while (!list_empty(&pl->head)) { -- cgit v1.2.3 From 76201b6354bb3aa31c7ba2bd42b9cbb8dda71c44 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 28 Mar 2017 17:04:13 +0800 Subject: ceph: allow connecting to mds whose rank >= mdsmap::m_max_mds mdsmap::m_max_mds is the expected count of active mds; it is not the maximum rank of an active mds. A user can decrease mdsmap::m_max_mds, but doing so does not stop the mds whose rank >= mdsmap::m_max_mds.
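To illustrate the sizing rule this implies, here is a minimal sketch (hypothetical helper, not part of the patch; it assumes the kernel's max() macro): per-rank state must be sized by the highest rank actually present in the map, not by the m_max_mds tunable.

static int mdsmap_slots_needed(int highest_rank, int max_mds)
{
	/* an mds whose rank >= max_mds can still be up, so the highest
	 * rank reported by the map wins over the tunable */
	return max(highest_rank + 1, max_mds);
}

This is the rule the krealloc()-based growth in ceph_mdsmap_decode() below implements.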
Signed-off-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/debugfs.c | 21 ++++++++++----------- fs/ceph/mds_client.c | 10 +++++----- fs/ceph/mdsmap.c | 44 +++++++++++++++++++++++++++++++++++++------- include/linux/ceph/mdsmap.h | 7 ++++--- 4 files changed, 56 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index f2ae393e2c31..4107a887ca34 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -22,20 +22,19 @@ static int mdsmap_show(struct seq_file *s, void *p) { int i; struct ceph_fs_client *fsc = s->private; + struct ceph_mdsmap *mdsmap; if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL) return 0; - seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch); - seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root); - seq_printf(s, "session_timeout %d\n", - fsc->mdsc->mdsmap->m_session_timeout); - seq_printf(s, "session_autoclose %d\n", - fsc->mdsc->mdsmap->m_session_autoclose); - for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) { - struct ceph_entity_addr *addr = - &fsc->mdsc->mdsmap->m_info[i].addr; - int state = fsc->mdsc->mdsmap->m_info[i].state; - + mdsmap = fsc->mdsc->mdsmap; + seq_printf(s, "epoch %d\n", mdsmap->m_epoch); + seq_printf(s, "root %d\n", mdsmap->m_root); + seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds); + seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout); + seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose); + for (i = 0; i < mdsmap->m_num_mds; i++) { + struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr; + int state = mdsmap->m_info[i].state; seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, ceph_pr_addr(&addr->in_addr), ceph_mds_state_name(state)); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index b16f1cf552a8..2733932bf192 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -441,7 +441,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, { struct ceph_mds_session *s; - if (mds >= mdsc->mdsmap->m_max_mds) + if (mds >= mdsc->mdsmap->m_num_mds) return ERR_PTR(-EINVAL); s = kzalloc(sizeof(*s), GFP_NOFS); @@ -1004,7 +1004,7 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *ts; int i, mds = session->s_mds; - if (mds >= mdsc->mdsmap->m_max_mds) + if (mds >= mdsc->mdsmap->m_num_mds) return; mi = &mdsc->mdsmap->m_info[mds]; @@ -3107,7 +3107,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, dout("check_new_map new %u old %u\n", newmap->m_epoch, oldmap->m_epoch); - for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) { + for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) { if (mdsc->sessions[i] == NULL) continue; s = mdsc->sessions[i]; @@ -3121,7 +3121,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, ceph_mdsmap_is_laggy(newmap, i) ? 
" (laggy)" : "", ceph_session_state_name(s->s_state)); - if (i >= newmap->m_max_mds || + if (i >= newmap->m_num_mds || memcmp(ceph_mdsmap_get_addr(oldmap, i), ceph_mdsmap_get_addr(newmap, i), sizeof(struct ceph_entity_addr))) { @@ -3167,7 +3167,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, } } - for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) { + for (i = 0; i < newmap->m_num_mds && i < mdsc->max_sessions; i++) { s = mdsc->sessions[i]; if (!s) continue; diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 5454e2327a5f..1a748cf88535 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -22,11 +22,11 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) int i; /* special case for one mds */ - if (1 == m->m_max_mds && m->m_info[0].state > 0) + if (1 == m->m_num_mds && m->m_info[0].state > 0) return 0; /* count */ - for (i = 0; i < m->m_max_mds; i++) + for (i = 0; i < m->m_num_mds; i++) if (m->m_info[i].state > 0) n++; if (n == 0) @@ -135,8 +135,9 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_session_autoclose = ceph_decode_32(p); m->m_max_file_size = ceph_decode_64(p); m->m_max_mds = ceph_decode_32(p); + m->m_num_mds = m->m_max_mds; - m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS); + m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS); if (m->m_info == NULL) goto nomem; @@ -207,9 +208,20 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ceph_pr_addr(&addr.in_addr), ceph_mds_state_name(state)); - if (mds < 0 || mds >= m->m_max_mds || state <= 0) + if (mds < 0 || state <= 0) continue; + if (mds >= m->m_num_mds) { + int new_num = max(mds + 1, m->m_num_mds * 2); + void *new_m_info = krealloc(m->m_info, + new_num * sizeof(*m->m_info), + GFP_NOFS | __GFP_ZERO); + if (!new_m_info) + goto nomem; + m->m_info = new_m_info; + m->m_num_mds = new_num; + } + info = &m->m_info[mds]; info->global_id = global_id; info->state = state; @@ -229,6 +241,14 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) info->export_targets = NULL; } } + if (m->m_num_mds > m->m_max_mds) { + /* find max up mds */ + for (i = m->m_num_mds; i >= m->m_max_mds; i--) { + if (i == 0 || m->m_info[i-1].state > 0) + break; + } + m->m_num_mds = i; + } /* pg_pools */ ceph_decode_32_safe(p, end, n, bad); @@ -270,12 +290,22 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) for (i = 0; i < n; i++) { s32 mds = ceph_decode_32(p); - if (mds >= 0 && mds < m->m_max_mds) { + if (mds >= 0 && mds < m->m_num_mds) { if (m->m_info[mds].laggy) num_laggy++; } } m->m_num_laggy = num_laggy; + + if (n > m->m_num_mds) { + void *new_m_info = krealloc(m->m_info, + n * sizeof(*m->m_info), + GFP_NOFS | __GFP_ZERO); + if (!new_m_info) + goto nomem; + m->m_info = new_m_info; + } + m->m_num_mds = n; } /* inc */ @@ -341,7 +371,7 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m) { int i; - for (i = 0; i < m->m_max_mds; i++) + for (i = 0; i < m->m_num_mds; i++) kfree(m->m_info[i].export_targets); kfree(m->m_info); kfree(m->m_data_pg_pools); @@ -357,7 +387,7 @@ bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m) return false; if (m->m_num_laggy > 0) return false; - for (i = 0; i < m->m_max_mds; i++) { + for (i = 0; i < m->m_num_mds; i++) { if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) nr_active++; } diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h index 8ed5dc505fbb..d5f783f3226a 100644 --- a/include/linux/ceph/mdsmap.h +++ b/include/linux/ceph/mdsmap.h @@ -25,6 +25,7 @@ struct ceph_mdsmap { u32 
m_session_autoclose; /* seconds */ u64 m_max_file_size; u32 m_max_mds; /* size of m_addr, m_state arrays */ + int m_num_mds; struct ceph_mds_info *m_info; /* which object pools file data can be stored in */ @@ -40,7 +41,7 @@ static inline struct ceph_entity_addr * ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) { - if (w >= m->m_max_mds) + if (w >= m->m_num_mds) return NULL; return &m->m_info[w].addr; } @@ -48,14 +49,14 @@ ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w) { BUG_ON(w < 0); - if (w >= m->m_max_mds) + if (w >= m->m_num_mds) return CEPH_MDS_STATE_DNE; return m->m_info[w].state; } static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w) { - if (w >= 0 && w < m->m_max_mds) + if (w >= 0 && w < m->m_num_mds) return m->m_info[w].laggy; return false; } -- cgit v1.2.3 From 79162547b76e4979b21ef80c9629ada94a51a59b Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 5 Apr 2017 12:54:05 -0400 Subject: ceph: make seeky readdir more efficient The current cephfs client uses a string to indicate the start position of readdir; the string is the last entry of the previous readdir reply. This approach does not work for seeky readdir because we cannot easily convert the new position to a string. For seeky readdir, the mds needs to return dentries from the beginning, and the client keeps retrying if the reply does not contain the dentry it wants. In the current version of ceph, the mds sorts CDentry objects in its cache in hash order, and the client also uses the dentry hash to compose the dir position. So for seeky readdir, if the client passes the hash part of the dir position to the mds, the mds can avoid replying with useless dentries. Signed-off-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 4 ++++ fs/ceph/inode.c | 17 ++++++++++++----- fs/ceph/mds_client.c | 1 + fs/ceph/mds_client.h | 3 ++- include/linux/ceph/ceph_fs.h | 2 ++ 5 files changed, 21 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 3e9ad501addf..ae61cdf7d489 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -378,7 +378,11 @@ more: ceph_mdsc_put_request(req); return -ENOMEM; } + } else if (is_hash_order(ctx->pos)) { + req->r_args.readdir.offset_hash = + cpu_to_le32(fpos_hash(ctx->pos)); } + req->r_dir_release_cnt = fi->dir_release_count; req->r_dir_ordered_cnt = fi->dir_ordered_count; req->r_readdir_cache_idx = fi->readdir_cache_idx; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index d3119fe3ab45..dcce79b84406 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1482,10 +1482,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) return readdir_prepopulate_inodes_only(req, session); - if (rinfo->hash_order && req->r_path2) { - last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, - req->r_path2, strlen(req->r_path2)); - last_hash = ceph_frag_value(last_hash); + if (rinfo->hash_order) { + if (req->r_path2) { + last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + req->r_path2, + strlen(req->r_path2)); + last_hash = ceph_frag_value(last_hash); + } else if (rinfo->offset_hash) { + /* mds understands offset_hash */ + WARN_ON_ONCE(req->r_readdir_offset != 2); + last_hash = le32_to_cpu(rhead->args.readdir.offset_hash); + } } if (rinfo->dir_dir && @@ -1510,7 +1517,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, } if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2 && - !(rinfo->hash_order && req->r_path2)) { + !(rinfo->hash_order &&
last_hash)) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a22688873ec3..8cc4d4e8b077 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -189,6 +189,7 @@ static int parse_reply_info_dir(void **p, void *end, info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER); + info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH); } if (num == 0) goto done; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index bbebcd55d79e..3e67dd2169fa 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -83,9 +83,10 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_dirfrag *dir_dir; size_t dir_buf_size; int dir_nr; - bool dir_complete; bool dir_end; + bool dir_complete; bool hash_order; + bool offset_hash; struct ceph_mds_reply_dir_entry *dir_entries; }; diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index f4b2ee18f38c..1787e4a8e251 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -365,6 +365,7 @@ extern const char *ceph_mds_op_name(int op); #define CEPH_READDIR_FRAG_END (1<<0) #define CEPH_READDIR_FRAG_COMPLETE (1<<8) #define CEPH_READDIR_HASH_ORDER (1<<9) +#define CEPH_READDIR_OFFSET_HASH (1<<10) union ceph_mds_request_args { struct { @@ -384,6 +385,7 @@ union ceph_mds_request_args { __le32 max_entries; /* how many dentries to grab */ __le32 max_bytes; __le16 flags; + __le32 offset_hash; } __attribute__ ((packed)) readdir; struct { __le32 mode; -- cgit v1.2.3 From aa26d662b9d44e7f5b0d109e892e537a23471863 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 4 Apr 2017 08:39:36 -0400 Subject: libceph: remove req->r_replay_version Nothing uses this anymore with the removal of the ack vs. commit code. Remove the field and just encode zeroes into place in the request encoding. 
Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 1 - net/ceph/debugfs.c | 4 +--- net/ceph/osd_client.c | 7 ++++--- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index d6a625e75040..3fc9e7754a9b 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -192,7 +192,6 @@ struct ceph_osd_request { unsigned long r_stamp; /* jiffies, send or check time */ unsigned long r_start_stamp; /* jiffies */ int r_attempts; - struct ceph_eversion r_replay_version; /* aka reassert_version */ u32 r_last_force_resend; u32 r_map_dne_bound; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index c62b2b029a6e..d7e63a4f5578 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -177,9 +177,7 @@ static void dump_request(struct seq_file *s, struct ceph_osd_request *req) seq_printf(s, "%llu\t", req->r_tid); dump_target(s, &req->r_t); - seq_printf(s, "\t%d\t%u'%llu", req->r_attempts, - le32_to_cpu(req->r_replay_version.epoch), - le64_to_cpu(req->r_replay_version.version)); + seq_printf(s, "\t%d", req->r_attempts); for (i = 0; i < req->r_num_ops; i++) { struct ceph_osd_req_op *op = &req->r_ops[i]; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index b4500a8ab8b3..feb666c22381 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1503,9 +1503,10 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) ceph_encode_32(&p, req->r_flags); ceph_encode_timespec(p, &req->r_mtime); p += sizeof(struct ceph_timespec); - /* aka reassert_version */ - memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version)); - p += sizeof(req->r_replay_version); + + /* reassert_version */ + memset(p, 0, sizeof(struct ceph_eversion)); + p += sizeof(struct ceph_eversion); /* oloc */ ceph_start_encoding(&p, 5, 4, -- cgit v1.2.3 From a1f4020aab10a6bddb2d061c960ebe52cdfa30b5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 4 Apr 2017 08:39:37 -0400 Subject: libceph: allow requests to return immediately on full conditions if caller wishes Usually, when the osd map is flagged as full or the pool is at quota, write requests just hang. This is not what we want for cephfs, where it would be better to simply report -ENOSPC back to userland instead of stalling. If the caller knows that it will want an immediate error return instead of blocking on a full or at-quota error condition then allow it to set a flag to request that behavior. Set that flag in ceph_osdc_new_request (since ceph.ko is the only caller), and on any other write request from ceph.ko. A later patch will deal with requests that were submitted before the new map showing the full condition came in. 
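To make the intended caller behavior concrete, a minimal sketch (hypothetical caller, not taken from this patch):

	/*
	 * Hypothetical submitter: with r_abort_on_full set, a write
	 * issued while the OSD map is flagged full (or the pool is at
	 * quota) completes with -ENOSPC instead of hanging.
	 */
	req->r_abort_on_full = true;
	ret = ceph_osdc_start_request(osdc, req, false);
	if (!ret)
		ret = ceph_osdc_wait_request(osdc, req);
	if (ret == -ENOSPC)
		return ret;	/* report to userland instead of stalling */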
Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 1 + fs/ceph/file.c | 1 + include/linux/ceph/osd_client.h | 1 + net/ceph/osd_client.c | 7 +++++++ 4 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 1a3e1b40799a..7e3fae334620 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1892,6 +1892,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); wr_req->r_mtime = ci->vfs_inode.i_mtime; + wr_req->r_abort_on_full = true; err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); if (!err) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ab4823e3e0d5..134c978141d0 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -781,6 +781,7 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_callback = ceph_aio_complete_req; req->r_inode = inode; req->r_priv = aio_req; + req->r_abort_on_full = true; ret = ceph_osdc_start_request(req->r_osdc, req, false); out: diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 3fc9e7754a9b..8cf644197b1a 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -187,6 +187,7 @@ struct ceph_osd_request { struct timespec r_mtime; /* ditto */ u64 r_data_offset; /* ditto */ bool r_linger; /* don't resend on failure */ + bool r_abort_on_full; /* return ENOSPC when full */ /* internal */ unsigned long r_stamp; /* jiffies, send or check time */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index feb666c22381..52a2019a2b64 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -961,6 +961,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, truncate_size, truncate_seq); } + req->r_abort_on_full = true; req->r_flags = flags; req->r_base_oloc.pool = layout->pool_id; req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns); @@ -1627,6 +1628,7 @@ static void maybe_request_map(struct ceph_osd_client *osdc) ceph_monc_renew_subs(&osdc->client->monc); } +static void complete_request(struct ceph_osd_request *req, int err); static void send_map_check(struct ceph_osd_request *req); static void __submit_request(struct ceph_osd_request *req, bool wrlocked) @@ -1636,6 +1638,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) enum calc_target_result ct_res; bool need_send = false; bool promoted = false; + bool need_abort = false; WARN_ON(req->r_tid); dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); @@ -1670,6 +1673,8 @@ again: pr_warn_ratelimited("FULL or reached pool quota\n"); req->r_t.paused = true; maybe_request_map(osdc); + if (req->r_abort_on_full) + need_abort = true; } else if (!osd_homeless(osd)) { need_send = true; } else { @@ -1686,6 +1691,8 @@ again: link_request(osd, req); if (need_send) send_request(req); + else if (need_abort) + complete_request(req, -ENOSPC); mutex_unlock(&osd->lock); if (ct_res == CALC_TARGET_POOL_DNE) -- cgit v1.2.3 From 58eb7932ae4d671d2a2377a1779eda96a2789b11 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 18 Apr 2017 09:21:16 -0400 Subject: libceph: add an epoch_barrier field to struct ceph_osd_client Cephfs can get cap update requests that contain a new epoch barrier in them. When that happens we want to pause all OSD traffic until the right map epoch arrives. Add an epoch_barrier field to ceph_osd_client that is protected by the osdc->lock rwsem. 
When the barrier is set, and the current OSD map epoch is below that, pause the request target when submitting the request or when revisiting it. Add a way for upper layers (cephfs) to update the epoch_barrier as well. If we get a new map, compare the new epoch against the barrier before kicking requests and request another map if the map epoch is still lower than the one we want. If we get a map with a full pool, or at quota condition, then set the barrier to the current epoch value. Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 2 ++ net/ceph/debugfs.c | 3 +- net/ceph/osd_client.c | 79 +++++++++++++++++++++++++++++++++++++---- 3 files changed, 76 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 8cf644197b1a..85650b415e73 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -267,6 +267,7 @@ struct ceph_osd_client { struct rb_root osds; /* osds */ struct list_head osd_lru; /* idle osds */ spinlock_t osd_lru_lock; + u32 epoch_barrier; struct ceph_osd homeless_osd; atomic64_t last_tid; /* tid of last request */ u64 last_linger_id; @@ -305,6 +306,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg); extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); +void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); extern void osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u32 flags); diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index d7e63a4f5578..71ba13927b3d 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -62,7 +62,8 @@ static int osdmap_show(struct seq_file *s, void *p) return 0; down_read(&osdc->lock); - seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags); + seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map->epoch, + osdc->epoch_barrier, map->flags); for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { struct ceph_pg_pool_info *pi = diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 55b7585ccefd..9b8f57fd5ee1 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1298,8 +1298,9 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, __pool_full(pi); WARN_ON(pi->id != t->base_oloc.pool); - return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || - (t->flags & CEPH_OSD_FLAG_WRITE && pausewr); + return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) || + ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) || + (osdc->osdmap->epoch < osdc->epoch_barrier); } enum calc_target_result { @@ -1654,8 +1655,13 @@ again: goto promote; } - if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && - ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { + if (osdc->osdmap->epoch < osdc->epoch_barrier) { + dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch, + osdc->epoch_barrier); + req->r_t.paused = true; + maybe_request_map(osdc); + } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { dout("req %p pausewr\n", req); req->r_t.paused = true; maybe_request_map(osdc); @@ -1807,19 +1813,77 @@ static void abort_request(struct ceph_osd_request *req, int err) complete_request(req, err); } +static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) +{ + if (likely(eb > osdc->epoch_barrier)) { + dout("updating epoch_barrier from %u to %u\n", + osdc->epoch_barrier, eb); + osdc->epoch_barrier 
= eb; + /* Request map if we're not to the barrier yet */ + if (eb > osdc->osdmap->epoch) + maybe_request_map(osdc); + } +} + +void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) +{ + down_read(&osdc->lock); + if (unlikely(eb > osdc->epoch_barrier)) { + up_read(&osdc->lock); + down_write(&osdc->lock); + update_epoch_barrier(osdc, eb); + up_write(&osdc->lock); + } else { + up_read(&osdc->lock); + } +} +EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier); + /* * Drop all pending requests that are stalled waiting on a full condition to - * clear, and complete them with ENOSPC as the return code. + * clear, and complete them with ENOSPC as the return code. Set the + * osdc->epoch_barrier to the latest map epoch that we've seen if any were + * cancelled. */ static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc) { struct rb_node *n; + bool victims = false; dout("enter abort_on_full\n"); if (!ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && !have_pool_full(osdc)) goto out; + /* Scan list and see if there is anything to abort */ + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + struct rb_node *m; + + m = rb_first(&osd->o_requests); + while (m) { + struct ceph_osd_request *req = rb_entry(m, + struct ceph_osd_request, r_node); + m = rb_next(m); + + if (req->r_abort_on_full) { + victims = true; + break; + } + } + if (victims) + break; + } + + if (!victims) + goto out; + + /* + * Update the barrier to current epoch if it's behind that point, + * since we know we have some calls to be aborted in the tree. + */ + update_epoch_barrier(osdc, osdc->osdmap->epoch); + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); struct rb_node *m; @@ -1837,7 +1901,7 @@ static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc) } } out: - dout("return abort_on_full\n"); + dout("return abort_on_full barrier=%u\n", osdc->epoch_barrier); } static void check_pool_dne(struct ceph_osd_request *req) @@ -3293,7 +3357,8 @@ done: pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc); - if (was_pauserd || was_pausewr || pauserd || pausewr) + if (was_pauserd || was_pausewr || pauserd || pausewr || + osdc->osdmap->epoch < osdc->epoch_barrier) maybe_request_map(osdc); kick_requests(osdc, &need_resend, &need_resend_linger); -- cgit v1.2.3 From 14bb211d324d6c8140167bd6b2b8a80757348a2f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 13 Apr 2017 12:17:38 +0200 Subject: rbd: support updating the lock cookie without releasing the lock As we no longer release the lock before potentially raising BLACKLISTED in rbd_reregister_watch(), the "either locked or blacklisted" assert in rbd_queue_workfn() needs to go: we can be both locked and blacklisted at that point now. 
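A minimal usage sketch of the new helper (hypothetical lock name, tag, and cookies; the real caller is rbd_reacquire_lock() below):

	ret = ceph_cls_set_cookie(osdc, &oid, &oloc, "mylock",
				  CEPH_CLS_LOCK_EXCLUSIVE, old_cookie,
				  "mytag", new_cookie);
	if (ret == -EOPNOTSUPP) {
		/*
		 * Older OSDs lack the set_cookie class method, so the
		 * caller must fall back to release + re-acquire.
		 */
	}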
Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- drivers/block/rbd.c | 66 ++++++++++++++++++++++-------------- include/linux/ceph/cls_lock_client.h | 5 +++ net/ceph/cls_lock_client.c | 51 ++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5f563db59820..063c8f06fb9c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3820,24 +3820,51 @@ static void rbd_unregister_watch(struct rbd_device *rbd_dev) ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); } +/* + * lock_rwsem must be held for write + */ +static void rbd_reacquire_lock(struct rbd_device *rbd_dev) +{ + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + char cookie[32]; + int ret; + + WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED); + + format_lock_cookie(rbd_dev, cookie); + ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid, + &rbd_dev->header_oloc, RBD_LOCK_NAME, + CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie, + RBD_LOCK_TAG, cookie); + if (ret) { + if (ret != -EOPNOTSUPP) + rbd_warn(rbd_dev, "failed to update lock cookie: %d", + ret); + + /* + * Lock cookie cannot be updated on older OSDs, so do + * a manual release and queue an acquire. + */ + if (rbd_release_lock(rbd_dev)) + queue_delayed_work(rbd_dev->task_wq, + &rbd_dev->lock_dwork, 0); + } else { + strcpy(rbd_dev->lock_cookie, cookie); + } +} + static void rbd_reregister_watch(struct work_struct *work) { struct rbd_device *rbd_dev = container_of(to_delayed_work(work), struct rbd_device, watch_dwork); - bool was_lock_owner = false; - bool need_to_wake = false; int ret; dout("%s rbd_dev %p\n", __func__, rbd_dev); - down_write(&rbd_dev->lock_rwsem); - if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) - was_lock_owner = rbd_release_lock(rbd_dev); - mutex_lock(&rbd_dev->watch_mutex); if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) { mutex_unlock(&rbd_dev->watch_mutex); - goto out; + return; } ret = __rbd_register_watch(rbd_dev); @@ -3845,36 +3872,28 @@ static void rbd_reregister_watch(struct work_struct *work) rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); if (ret == -EBLACKLISTED || ret == -ENOENT) { set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags); - need_to_wake = true; + wake_requests(rbd_dev, true); } else { queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, RBD_RETRY_DELAY); } mutex_unlock(&rbd_dev->watch_mutex); - goto out; + return; } - need_to_wake = true; rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; mutex_unlock(&rbd_dev->watch_mutex); + down_write(&rbd_dev->lock_rwsem); + if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) + rbd_reacquire_lock(rbd_dev); + up_write(&rbd_dev->lock_rwsem); + ret = rbd_dev_refresh(rbd_dev); if (ret) rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret); - - if (was_lock_owner) { - ret = rbd_try_lock(rbd_dev); - if (ret) - rbd_warn(rbd_dev, "reregisteration lock failed: %d", - ret); - } - -out: - up_write(&rbd_dev->lock_rwsem); - if (need_to_wake) - wake_requests(rbd_dev, true); } /* @@ -4052,9 +4071,6 @@ static void rbd_queue_workfn(struct work_struct *work) if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED && !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) rbd_wait_state_locked(rbd_dev); - - WARN_ON((rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) ^ - !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)); if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) { result 
= -EBLACKLISTED; goto err_unlock; diff --git a/include/linux/ceph/cls_lock_client.h b/include/linux/ceph/cls_lock_client.h index 84884d8d4710..0594d3bba774 100644 --- a/include/linux/ceph/cls_lock_client.h +++ b/include/linux/ceph/cls_lock_client.h @@ -37,6 +37,11 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc, struct ceph_object_locator *oloc, char *lock_name, char *cookie, struct ceph_entity_name *locker); +int ceph_cls_set_cookie(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + char *lock_name, u8 type, char *old_cookie, + char *tag, char *new_cookie); void ceph_free_lockers(struct ceph_locker *lockers, u32 num_lockers); diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c index b9233b990399..08ada893f01e 100644 --- a/net/ceph/cls_lock_client.c +++ b/net/ceph/cls_lock_client.c @@ -179,6 +179,57 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_cls_break_lock); +int ceph_cls_set_cookie(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + char *lock_name, u8 type, char *old_cookie, + char *tag, char *new_cookie) +{ + int cookie_op_buf_size; + int name_len = strlen(lock_name); + int old_cookie_len = strlen(old_cookie); + int tag_len = strlen(tag); + int new_cookie_len = strlen(new_cookie); + void *p, *end; + struct page *cookie_op_page; + int ret; + + cookie_op_buf_size = name_len + sizeof(__le32) + + old_cookie_len + sizeof(__le32) + + tag_len + sizeof(__le32) + + new_cookie_len + sizeof(__le32) + + sizeof(u8) + CEPH_ENCODING_START_BLK_LEN; + if (cookie_op_buf_size > PAGE_SIZE) + return -E2BIG; + + cookie_op_page = alloc_page(GFP_NOIO); + if (!cookie_op_page) + return -ENOMEM; + + p = page_address(cookie_op_page); + end = p + cookie_op_buf_size; + + /* encode cls_lock_set_cookie_op struct */ + ceph_start_encoding(&p, 1, 1, + cookie_op_buf_size - CEPH_ENCODING_START_BLK_LEN); + ceph_encode_string(&p, end, lock_name, name_len); + ceph_encode_8(&p, type); + ceph_encode_string(&p, end, old_cookie, old_cookie_len); + ceph_encode_string(&p, end, tag, tag_len); + ceph_encode_string(&p, end, new_cookie, new_cookie_len); + + dout("%s lock_name %s type %d old_cookie %s tag %s new_cookie %s\n", + __func__, lock_name, type, old_cookie, tag, new_cookie); + ret = ceph_osdc_call(osdc, oid, oloc, "lock", "set_cookie", + CEPH_OSD_FLAG_WRITE, cookie_op_page, + cookie_op_buf_size, NULL, NULL); + + dout("%s: status %d\n", __func__, ret); + __free_page(cookie_op_page); + return ret; +} +EXPORT_SYMBOL(ceph_cls_set_cookie); + void ceph_free_lockers(struct ceph_locker *lockers, u32 num_lockers) { int i; -- cgit v1.2.3 From f775ff7d89f33fc9ba63f6f70df3bcc98c2d9828 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Thu, 27 Apr 2017 18:34:00 +0200 Subject: ceph: fix file open flags on ppc64 The file open flags (O_foo) are platform-specific and should never go out to an interface that is not local to the system. Unfortunately these flags have leaked out onto the wire in the cephfs implementation. That led to bogus flags getting transmitted on ppc64. This patch converts the kernel view of flags to the ceph view of file open flags.
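As a concrete illustration of the breakage (flag values stated here as an assumption from the generic and powerpc UAPI fcntl.h definitions, not taken from the patch):

	/*
	 * The numeric value of O_DIRECTORY is not portable:
	 *
	 *	generic/x86:	O_DIRECTORY == 0200000
	 *	powerpc:	O_DIRECTORY == 0040000
	 *
	 * so a raw flags word put on the wire means different things
	 * depending on the client architecture; the wire format added
	 * below pins CEPH_O_DIRECTORY to 00200000 for everyone.
	 */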
Fixes: 124e68e74 ("ceph: file operations") Signed-off-by: Alexander Graf Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 34 +++++++++++++++++++++++++++++++++- include/linux/ceph/ceph_fs.h | 12 ++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 39866d6a34b6..9d2eeed9e323 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -13,6 +13,38 @@ #include "mds_client.h" #include "cache.h" +static __le32 ceph_flags_sys2wire(u32 flags) +{ + u32 wire_flags = 0; + + switch (flags & O_ACCMODE) { + case O_RDONLY: + wire_flags |= CEPH_O_RDONLY; + break; + case O_WRONLY: + wire_flags |= CEPH_O_WRONLY; + break; + case O_RDWR: + wire_flags |= CEPH_O_RDWR; + break; + } + +#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; } + + ceph_sys2wire(O_CREAT); + ceph_sys2wire(O_EXCL); + ceph_sys2wire(O_TRUNC); + ceph_sys2wire(O_DIRECTORY); + ceph_sys2wire(O_NOFOLLOW); + +#undef ceph_sys2wire + + if (flags) + dout("unused open flags: %x", flags); + + return cpu_to_le32(wire_flags); +} + /* * Ceph file operations * @@ -123,7 +155,7 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode) if (IS_ERR(req)) goto out; req->r_fmode = ceph_flags_to_mode(flags); - req->r_args.open.flags = cpu_to_le32(flags); + req->r_args.open.flags = ceph_flags_sys2wire(flags); req->r_args.open.mode = cpu_to_le32(create_mode); out: return req; diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 1787e4a8e251..ad078ebe25d6 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -367,6 +367,18 @@ extern const char *ceph_mds_op_name(int op); #define CEPH_READDIR_HASH_ORDER (1<<9) #define CEPH_READDIR_OFFSET_HASH (1<<10) +/* + * open request flags + */ +#define CEPH_O_RDONLY 00000000 +#define CEPH_O_WRONLY 00000001 +#define CEPH_O_RDWR 00000002 +#define CEPH_O_CREAT 00000100 +#define CEPH_O_EXCL 00000200 +#define CEPH_O_TRUNC 00001000 +#define CEPH_O_DIRECTORY 00200000 +#define CEPH_O_NOFOLLOW 00400000 + union ceph_mds_request_args { struct { __le32 mask; /* CEPH_CAP_* */ -- cgit v1.2.3 From 44f0aeec203738bf34f4b7e16b745c8c71fe0f06 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 2 Apr 2017 16:50:33 +0200 Subject: dmaengine: pl080: Cut some unused defines There is no in-kernel code using these indexed register defines, and their offsets are clearly defined right below. Cut them. 
Signed-off-by: Linus Walleij Signed-off-by: Vinod Koul --- include/linux/amba/pl080.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/amba/pl080.h b/include/linux/amba/pl080.h index 91b84a7f0539..800429f4b997 100644 --- a/include/linux/amba/pl080.h +++ b/include/linux/amba/pl080.h @@ -46,16 +46,8 @@ /* Per channel configuration registers */ -#define PL080_Cx_STRIDE (0x20) +/* Per channel configuration registers */ #define PL080_Cx_BASE(x) ((0x100 + (x * 0x20))) -#define PL080_Cx_SRC_ADDR(x) ((0x100 + (x * 0x20))) -#define PL080_Cx_DST_ADDR(x) ((0x104 + (x * 0x20))) -#define PL080_Cx_LLI(x) ((0x108 + (x * 0x20))) -#define PL080_Cx_CONTROL(x) ((0x10C + (x * 0x20))) -#define PL080_Cx_CONFIG(x) ((0x110 + (x * 0x20))) -#define PL080S_Cx_CONTROL2(x) ((0x110 + (x * 0x20))) -#define PL080S_Cx_CONFIG(x) ((0x114 + (x * 0x20))) - #define PL080_CH_SRC_ADDR (0x00) #define PL080_CH_DST_ADDR (0x04) #define PL080_CH_LLI (0x08) -- cgit v1.2.3 From ded091fee6806b7120f475d89c151d611758a395 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 2 Apr 2017 16:50:53 +0200 Subject: dmaengine: pl08x: Use the BIT() macro consistently This makes the driver shift bits with BIT(), which is used in other places in the driver. Signed-off-by: Linus Walleij Signed-off-by: Vinod Koul --- drivers/dma/amba-pl08x.c | 10 +++++----- include/linux/amba/pl080.h | 40 ++++++++++++++++++++-------------------- 2 files changed, 25 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/amba-pl08x.c b/drivers/dma/amba-pl08x.c index 8624598530f3..286114ebbbb9 100644 --- a/drivers/dma/amba-pl08x.c +++ b/drivers/dma/amba-pl08x.c @@ -420,7 +420,7 @@ static void pl08x_start_next_txd(struct pl08x_dma_chan *plchan) /* Enable the DMA channel */ /* Do not access config register until channel shows as disabled */ - while (readl(pl08x->base + PL080_EN_CHAN) & (1 << phychan->id)) + while (readl(pl08x->base + PL080_EN_CHAN) & BIT(phychan->id)) cpu_relax(); /* Do not access config register until channel shows as inactive */ @@ -487,8 +487,8 @@ static void pl08x_terminate_phy_chan(struct pl08x_driver_data *pl08x, writel(val, ch->reg_config); - writel(1 << ch->id, pl08x->base + PL080_ERR_CLEAR); - writel(1 << ch->id, pl08x->base + PL080_TC_CLEAR); + writel(BIT(ch->id), pl08x->base + PL080_ERR_CLEAR); + writel(BIT(ch->id), pl08x->base + PL080_TC_CLEAR); } static inline u32 get_bytes_in_cctl(u32 cctl) @@ -1837,7 +1837,7 @@ static irqreturn_t pl08x_irq(int irq, void *dev) return IRQ_NONE; for (i = 0; i < pl08x->vd->channels; i++) { - if (((1 << i) & err) || ((1 << i) & tc)) { + if ((BIT(i) & err) || (BIT(i) & tc)) { /* Locate physical channel */ struct pl08x_phy_chan *phychan = &pl08x->phy_chans[i]; struct pl08x_dma_chan *plchan = phychan->serving; @@ -1875,7 +1875,7 @@ static irqreturn_t pl08x_irq(int irq, void *dev) } spin_unlock(&plchan->vc.lock); - mask |= (1 << i); + mask |= BIT(i); } } diff --git a/include/linux/amba/pl080.h b/include/linux/amba/pl080.h index 800429f4b997..580b5323a717 100644 --- a/include/linux/amba/pl080.h +++ b/include/linux/amba/pl080.h @@ -38,9 +38,9 @@ #define PL080_SOFT_LSREQ (0x2C) #define PL080_CONFIG (0x30) -#define PL080_CONFIG_M2_BE (1 << 2) -#define PL080_CONFIG_M1_BE (1 << 1) -#define PL080_CONFIG_ENABLE (1 << 0) +#define PL080_CONFIG_M2_BE BIT(2) +#define PL080_CONFIG_M1_BE BIT(1) +#define PL080_CONFIG_ENABLE BIT(0) #define PL080_SYNC (0x34) @@ -58,18 +58,18 @@ #define PL080_LLI_ADDR_MASK (0x3fffffff << 2) #define
PL080_LLI_ADDR_SHIFT (2) -#define PL080_LLI_LM_AHB2 (1 << 0) +#define PL080_LLI_LM_AHB2 BIT(0) -#define PL080_CONTROL_TC_IRQ_EN (1 << 31) +#define PL080_CONTROL_TC_IRQ_EN BIT(31) #define PL080_CONTROL_PROT_MASK (0x7 << 28) #define PL080_CONTROL_PROT_SHIFT (28) -#define PL080_CONTROL_PROT_CACHE (1 << 30) -#define PL080_CONTROL_PROT_BUFF (1 << 29) -#define PL080_CONTROL_PROT_SYS (1 << 28) -#define PL080_CONTROL_DST_INCR (1 << 27) -#define PL080_CONTROL_SRC_INCR (1 << 26) -#define PL080_CONTROL_DST_AHB2 (1 << 25) -#define PL080_CONTROL_SRC_AHB2 (1 << 24) +#define PL080_CONTROL_PROT_CACHE BIT(30) +#define PL080_CONTROL_PROT_BUFF BIT(29) +#define PL080_CONTROL_PROT_SYS BIT(28) +#define PL080_CONTROL_DST_INCR BIT(27) +#define PL080_CONTROL_SRC_INCR BIT(26) +#define PL080_CONTROL_DST_AHB2 BIT(25) +#define PL080_CONTROL_SRC_AHB2 BIT(24) #define PL080_CONTROL_DWIDTH_MASK (0x7 << 21) #define PL080_CONTROL_DWIDTH_SHIFT (21) #define PL080_CONTROL_SWIDTH_MASK (0x7 << 18) @@ -95,20 +95,20 @@ #define PL080_WIDTH_16BIT (0x1) #define PL080_WIDTH_32BIT (0x2) -#define PL080N_CONFIG_ITPROT (1 << 20) -#define PL080N_CONFIG_SECPROT (1 << 19) -#define PL080_CONFIG_HALT (1 << 18) -#define PL080_CONFIG_ACTIVE (1 << 17) /* RO */ -#define PL080_CONFIG_LOCK (1 << 16) -#define PL080_CONFIG_TC_IRQ_MASK (1 << 15) -#define PL080_CONFIG_ERR_IRQ_MASK (1 << 14) +#define PL080N_CONFIG_ITPROT BIT(20) +#define PL080N_CONFIG_SECPROT BIT(19) +#define PL080_CONFIG_HALT BIT(18) +#define PL080_CONFIG_ACTIVE BIT(17) /* RO */ +#define PL080_CONFIG_LOCK BIT(16) +#define PL080_CONFIG_TC_IRQ_MASK BIT(15) +#define PL080_CONFIG_ERR_IRQ_MASK BIT(14) #define PL080_CONFIG_FLOW_CONTROL_MASK (0x7 << 11) #define PL080_CONFIG_FLOW_CONTROL_SHIFT (11) #define PL080_CONFIG_DST_SEL_MASK (0xf << 6) #define PL080_CONFIG_DST_SEL_SHIFT (6) #define PL080_CONFIG_SRC_SEL_MASK (0xf << 1) #define PL080_CONFIG_SRC_SEL_SHIFT (1) -#define PL080_CONFIG_ENABLE (1 << 0) +#define PL080_CONFIG_ENABLE BIT(0) #define PL080_FLOW_MEM2MEM (0x0) #define PL080_FLOW_MEM2PER (0x1) -- cgit v1.2.3 From 9c1051aacde828073dbbab5e8e59c0fc802efa9a Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 4 May 2017 08:17:21 -0600 Subject: blk-mq: untangle debugfs and sysfs Originally, I tied debugfs registration/unregistration together with sysfs. There's no reason to do this, and it's getting in the way of letting schedulers define their own debugfs attributes. Instead, tie the debugfs registration to the lifetime of the structures themselves. The saner lifetimes mean we can also get rid of the extra mq directory and move everything one level up. I.e., nvme0n1/mq/hctx0/tags is now just nvme0n1/hctx0/tags. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-core.c | 9 +--- block/blk-mq-debugfs.c | 110 ++++++++++++++++++++++++++++--------------------- block/blk-mq-debugfs.h | 21 ++++++++-- block/blk-mq-sysfs.c | 11 ----- block/blk-mq.c | 7 ++++ block/blk-sysfs.c | 2 + include/linux/blk-mq.h | 4 ++ include/linux/blkdev.h | 1 - 8 files changed, 94 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index acdca6536562..c580b0138a7f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -40,7 +40,6 @@ #include "blk.h" #include "blk-mq.h" -#include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-wbt.h" @@ -562,13 +561,9 @@ void blk_cleanup_queue(struct request_queue *q) * prevent that q->request_fn() gets invoked after draining finished. 
*/ blk_freeze_queue(q); - if (!q->mq_ops) { - spin_lock_irq(lock); + spin_lock_irq(lock); + if (!q->mq_ops) __blk_drain_queue(q, true); - } else { - blk_mq_debugfs_unregister_mq(q); - spin_lock_irq(lock); - } queue_flag_set(QUEUE_FLAG_DEAD, q); spin_unlock_irq(lock); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 1dc1847b5363..260cf76e0705 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -687,19 +687,46 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { {}, }; +static bool debugfs_create_files(struct dentry *parent, void *data, + const struct blk_mq_debugfs_attr *attr) +{ + d_inode(parent)->i_private = data; + + for (; attr->name; attr++) { + if (!debugfs_create_file(attr->name, attr->mode, parent, + (void *)attr, &blk_mq_debugfs_fops)) + return false; + } + return true; +} + int blk_mq_debugfs_register(struct request_queue *q) { + struct blk_mq_hw_ctx *hctx; + int i; + if (!blk_debugfs_root) return -ENOENT; q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), blk_debugfs_root); if (!q->debugfs_dir) - goto err; + return -ENOMEM; - if (blk_mq_debugfs_register_mq(q)) + if (!debugfs_create_files(q->debugfs_dir, q, + blk_mq_debugfs_queue_attrs)) goto err; + /* + * blk_mq_init_hctx() attempted to do this already, but q->debugfs_dir + * didn't exist yet (because we don't know what to name the directory + * until the queue is registered to a gendisk). + */ + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->debugfs_dir && blk_mq_debugfs_register_hctx(q, hctx)) + goto err; + } + return 0; err: @@ -710,32 +737,17 @@ err: void blk_mq_debugfs_unregister(struct request_queue *q) { debugfs_remove_recursive(q->debugfs_dir); - q->mq_debugfs_dir = NULL; q->debugfs_dir = NULL; } -static bool debugfs_create_files(struct dentry *parent, void *data, - const struct blk_mq_debugfs_attr *attr) -{ - d_inode(parent)->i_private = data; - - for (; attr->name; attr++) { - if (!debugfs_create_file(attr->name, attr->mode, parent, - (void *)attr, &blk_mq_debugfs_fops)) - return false; - } - return true; -} - -static int blk_mq_debugfs_register_ctx(struct request_queue *q, - struct blk_mq_ctx *ctx, - struct dentry *hctx_dir) +static int blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) { struct dentry *ctx_dir; char name[20]; snprintf(name, sizeof(name), "cpu%u", ctx->cpu); - ctx_dir = debugfs_create_dir(name, hctx_dir); + ctx_dir = debugfs_create_dir(name, hctx->debugfs_dir); if (!ctx_dir) return -ENOMEM; @@ -745,59 +757,61 @@ static int blk_mq_debugfs_register_ctx(struct request_queue *q, return 0; } -static int blk_mq_debugfs_register_hctx(struct request_queue *q, - struct blk_mq_hw_ctx *hctx) +int blk_mq_debugfs_register_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx) { struct blk_mq_ctx *ctx; - struct dentry *hctx_dir; char name[20]; int i; + if (!q->debugfs_dir) + return -ENOENT; + snprintf(name, sizeof(name), "hctx%u", hctx->queue_num); - hctx_dir = debugfs_create_dir(name, q->mq_debugfs_dir); - if (!hctx_dir) + hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir); + if (!hctx->debugfs_dir) return -ENOMEM; - if (!debugfs_create_files(hctx_dir, hctx, blk_mq_debugfs_hctx_attrs)) - return -ENOMEM; + if (!debugfs_create_files(hctx->debugfs_dir, hctx, + blk_mq_debugfs_hctx_attrs)) + goto err; hctx_for_each_ctx(hctx, ctx, i) { - if (blk_mq_debugfs_register_ctx(q, ctx, hctx_dir)) - return -ENOMEM; + if (blk_mq_debugfs_register_ctx(hctx, ctx)) + goto err; } return 0; + +err: + 
blk_mq_debugfs_unregister_hctx(hctx); + return -ENOMEM; +} + +void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) +{ + debugfs_remove_recursive(hctx->debugfs_dir); + hctx->debugfs_dir = NULL; } -int blk_mq_debugfs_register_mq(struct request_queue *q) +int blk_mq_debugfs_register_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; - if (!q->debugfs_dir) - return -ENOENT; - - q->mq_debugfs_dir = debugfs_create_dir("mq", q->debugfs_dir); - if (!q->mq_debugfs_dir) - goto err; - - if (!debugfs_create_files(q->mq_debugfs_dir, q, blk_mq_debugfs_queue_attrs)) - goto err; - queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_debugfs_register_hctx(q, hctx)) - goto err; + return -ENOMEM; } return 0; - -err: - blk_mq_debugfs_unregister_mq(q); - return -ENOMEM; } -void blk_mq_debugfs_unregister_mq(struct request_queue *q) +void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) { - debugfs_remove_recursive(q->mq_debugfs_dir); - q->mq_debugfs_dir = NULL; + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_debugfs_unregister_hctx(hctx); } diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index 00b0f71d0ae9..596e9b16d3d1 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -4,8 +4,11 @@ #ifdef CONFIG_BLK_DEBUG_FS int blk_mq_debugfs_register(struct request_queue *q); void blk_mq_debugfs_unregister(struct request_queue *q); -int blk_mq_debugfs_register_mq(struct request_queue *q); -void blk_mq_debugfs_unregister_mq(struct request_queue *q); +int blk_mq_debugfs_register_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx); +void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx); +int blk_mq_debugfs_register_hctxs(struct request_queue *q); +void blk_mq_debugfs_unregister_hctxs(struct request_queue *q); #else static inline int blk_mq_debugfs_register(struct request_queue *q) { @@ -16,12 +19,22 @@ static inline void blk_mq_debugfs_unregister(struct request_queue *q) { } -static inline int blk_mq_debugfs_register_mq(struct request_queue *q) +static inline int blk_mq_debugfs_register_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx) { return 0; } -static inline void blk_mq_debugfs_unregister_mq(struct request_queue *q) +static inline void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) +{ +} + +static inline int blk_mq_debugfs_register_hctxs(struct request_queue *q) +{ + return 0; +} + +static inline void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) { } #endif diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 71a237a90d43..79969c3c234f 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -11,7 +11,6 @@ #include #include "blk-mq.h" -#include "blk-mq-debugfs.h" #include "blk-mq-tag.h" static void blk_mq_sysfs_release(struct kobject *kobj) @@ -259,8 +258,6 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); - blk_mq_debugfs_unregister_mq(q); - kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); kobject_del(&q->mq_kobj); kobject_put(&dev->kobj); @@ -319,8 +316,6 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q) kobject_uevent(&q->mq_kobj, KOBJ_ADD); - blk_mq_debugfs_register(q); - queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) @@ -336,8 +331,6 @@ unreg: while (--i >= 0) blk_mq_unregister_hctx(q->queue_hw_ctx[i]); - blk_mq_debugfs_unregister_mq(q); - kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); kobject_del(&q->mq_kobj); 
kobject_put(&dev->kobj); @@ -365,8 +358,6 @@ void blk_mq_sysfs_unregister(struct request_queue *q) if (!q->mq_sysfs_init_done) goto unlock; - blk_mq_debugfs_unregister_mq(q); - queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); @@ -383,8 +374,6 @@ int blk_mq_sysfs_register(struct request_queue *q) if (!q->mq_sysfs_init_done) goto unlock; - blk_mq_debugfs_register_mq(q); - queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) diff --git a/block/blk-mq.c b/block/blk-mq.c index 03a747105682..5d4ce7eb8dbf 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -31,6 +31,7 @@ #include #include "blk.h" #include "blk-mq.h" +#include "blk-mq-debugfs.h" #include "blk-mq-tag.h" #include "blk-stat.h" #include "blk-wbt.h" @@ -1862,6 +1863,8 @@ static void blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { + blk_mq_debugfs_unregister_hctx(hctx); + blk_mq_tag_idle(hctx); if (set->ops->exit_request) @@ -1948,6 +1951,8 @@ static int blk_mq_init_hctx(struct request_queue *q, if (hctx->flags & BLK_MQ_F_BLOCKING) init_srcu_struct(&hctx->queue_rq_srcu); + blk_mq_debugfs_register_hctx(q, hctx); + return 0; free_fq: @@ -2385,6 +2390,7 @@ static void blk_mq_queue_reinit(struct request_queue *q, { WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth)); + blk_mq_debugfs_unregister_hctxs(q); blk_mq_sysfs_unregister(q); /* @@ -2396,6 +2402,7 @@ static void blk_mq_queue_reinit(struct request_queue *q, blk_mq_map_swqueue(q, online_mask); blk_mq_sysfs_register(q); + blk_mq_debugfs_register_hctxs(q); } /* diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9995355121d7..504fee940052 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -890,6 +890,8 @@ int blk_register_queue(struct gendisk *disk) if (q->mq_ops) __blk_mq_register_dev(dev, q); + blk_mq_debugfs_register(q); + kobject_uevent(&q->kobj, KOBJ_ADD); wbt_enable_default(q); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a104832e7ae5..de8ed9aaa156 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -57,6 +57,10 @@ struct blk_mq_hw_ctx { unsigned long poll_considered; unsigned long poll_invoked; unsigned long poll_success; + +#ifdef CONFIG_BLK_DEBUG_FS + struct dentry *debugfs_dir; +#endif }; struct blk_mq_tag_set { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 83d28623645f..b49a79a29e58 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -579,7 +579,6 @@ struct request_queue { #ifdef CONFIG_BLK_DEBUG_FS struct dentry *debugfs_dir; - struct dentry *mq_debugfs_dir; #endif bool mq_sysfs_init_done; -- cgit v1.2.3 From d332ce091813d11a46144354baa72b755833392f Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 4 May 2017 08:24:40 -0600 Subject: blk-mq-debugfs: allow schedulers to register debugfs attributes This provides the infrastructure for schedulers to expose their internal state through debugfs. We add a list of queue attributes and a list of hctx attributes to struct elevator_type and wire them up when switching schedulers. 
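A minimal scheduler-side sketch (hypothetical elevator, not part of this patch; only a single show-only queue attribute is filled in):

static int example_depth_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;

	seq_printf(m, "%lu\n", q->nr_requests);
	return 0;
}

static const struct blk_mq_debugfs_attr example_queue_debugfs_attrs[] = {
	{"depth", 0400, example_depth_show},
	{},
};

static struct elevator_type example_elv = {
	/* ... elevator ops ... */
	.queue_debugfs_attrs	= example_queue_debugfs_attrs,
	.elevator_name		= "example",
};

With this in place, switching a queue to the "example" scheduler creates sched/depth under the queue's debugfs directory.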
Signed-off-by: Omar Sandoval Reviewed-by: Hannes Reinecke Add missing seq_file.h header in blk-mq-debugfs.h Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 74 ++++++++++++++++++++++++++++++++++++++++++------ block/blk-mq-debugfs.h | 37 ++++++++++++++++++++++++ block/blk-mq-sched.c | 24 ++++++++++------ include/linux/blk-mq.h | 1 + include/linux/blkdev.h | 1 + include/linux/elevator.h | 7 +++++ 6 files changed, 127 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 260cf76e0705..a3b887109310 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -24,15 +24,6 @@ #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" -struct blk_mq_debugfs_attr { - const char *name; - umode_t mode; - int (*show)(void *, struct seq_file *); - ssize_t (*write)(void *, const char __user *, size_t, loff_t *); - /* Set either .show or .seq_ops. */ - const struct seq_operations *seq_ops; -}; - static int blk_flags_show(struct seq_file *m, const unsigned long flags, const char *const *flag_name, int flag_name_count) { @@ -725,6 +716,9 @@ int blk_mq_debugfs_register(struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->debugfs_dir && blk_mq_debugfs_register_hctx(q, hctx)) goto err; + if (q->elevator && !hctx->sched_debugfs_dir && + blk_mq_debugfs_register_sched_hctx(q, hctx)) + goto err; } return 0; @@ -737,6 +731,7 @@ err: void blk_mq_debugfs_unregister(struct request_queue *q) { debugfs_remove_recursive(q->debugfs_dir); + q->sched_debugfs_dir = NULL; q->debugfs_dir = NULL; } @@ -791,6 +786,7 @@ err: void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) { debugfs_remove_recursive(hctx->debugfs_dir); + hctx->sched_debugfs_dir = NULL; hctx->debugfs_dir = NULL; } @@ -815,3 +811,63 @@ void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_unregister_hctx(hctx); } + +int blk_mq_debugfs_register_sched(struct request_queue *q) +{ + struct elevator_type *e = q->elevator->type; + + if (!q->debugfs_dir) + return -ENOENT; + + if (!e->queue_debugfs_attrs) + return 0; + + q->sched_debugfs_dir = debugfs_create_dir("sched", q->debugfs_dir); + if (!q->sched_debugfs_dir) + return -ENOMEM; + + if (!debugfs_create_files(q->sched_debugfs_dir, q, + e->queue_debugfs_attrs)) + goto err; + + return 0; + +err: + blk_mq_debugfs_unregister_sched(q); + return -ENOMEM; +} + +void blk_mq_debugfs_unregister_sched(struct request_queue *q) +{ + debugfs_remove_recursive(q->sched_debugfs_dir); + q->sched_debugfs_dir = NULL; +} + +int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx) +{ + struct elevator_type *e = q->elevator->type; + + if (!hctx->debugfs_dir) + return -ENOENT; + + if (!e->hctx_debugfs_attrs) + return 0; + + hctx->sched_debugfs_dir = debugfs_create_dir("sched", + hctx->debugfs_dir); + if (!hctx->sched_debugfs_dir) + return -ENOMEM; + + if (!debugfs_create_files(hctx->sched_debugfs_dir, hctx, + e->hctx_debugfs_attrs)) + return -ENOMEM; + + return 0; +} + +void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) +{ + debugfs_remove_recursive(hctx->sched_debugfs_dir); + hctx->sched_debugfs_dir = NULL; +} diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index 596e9b16d3d1..a5ac21c81ea3 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -2,6 +2,18 @@ #define INT_BLK_MQ_DEBUGFS_H #ifdef CONFIG_BLK_DEBUG_FS + +#include + +struct blk_mq_debugfs_attr { + const char *name; + umode_t mode; + 
int (*show)(void *, struct seq_file *); + ssize_t (*write)(void *, const char __user *, size_t, loff_t *); + /* Set either .show or .seq_ops. */ + const struct seq_operations *seq_ops; +}; + int blk_mq_debugfs_register(struct request_queue *q); void blk_mq_debugfs_unregister(struct request_queue *q); int blk_mq_debugfs_register_hctx(struct request_queue *q, @@ -9,6 +21,12 @@ int blk_mq_debugfs_register_hctx(struct request_queue *q, void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx); int blk_mq_debugfs_register_hctxs(struct request_queue *q); void blk_mq_debugfs_unregister_hctxs(struct request_queue *q); + +int blk_mq_debugfs_register_sched(struct request_queue *q); +void blk_mq_debugfs_unregister_sched(struct request_queue *q); +int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx); +void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx); #else static inline int blk_mq_debugfs_register(struct request_queue *q) { @@ -37,6 +55,25 @@ static inline int blk_mq_debugfs_register_hctxs(struct request_queue *q) static inline void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) { } + +static inline int blk_mq_debugfs_register_sched(struct request_queue *q) +{ + return 0; +} + +static inline void blk_mq_debugfs_unregister_sched(struct request_queue *q) +{ +} + +static inline int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx) +{ + return 0; +} + +static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) +{ +} #endif #endif diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index e79e9f18d7c2..1f5b692526ae 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -11,6 +11,7 @@ #include "blk.h" #include "blk-mq.h" +#include "blk-mq-debugfs.h" #include "blk-mq-sched.h" #include "blk-mq-tag.h" #include "blk-wbt.h" @@ -472,6 +473,8 @@ int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, } } + blk_mq_debugfs_register_sched_hctx(q, hctx); + return 0; } @@ -483,6 +486,8 @@ void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, if (!e) return; + blk_mq_debugfs_unregister_sched_hctx(hctx); + if (e->type->ops.mq.exit_hctx && hctx->sched_data) { e->type->ops.mq.exit_hctx(hctx, hctx_idx); hctx->sched_data = NULL; @@ -519,8 +524,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) if (ret) goto err; - if (e->ops.mq.init_hctx) { - queue_for_each_hw_ctx(q, hctx, i) { + blk_mq_debugfs_register_sched(q); + + queue_for_each_hw_ctx(q, hctx, i) { + if (e->ops.mq.init_hctx) { ret = e->ops.mq.init_hctx(hctx, i); if (ret) { eq = q->elevator; @@ -529,6 +536,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) return ret; } } + blk_mq_debugfs_register_sched_hctx(q, hctx); } return 0; @@ -544,14 +552,14 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) struct blk_mq_hw_ctx *hctx; unsigned int i; - if (e->type->ops.mq.exit_hctx) { - queue_for_each_hw_ctx(q, hctx, i) { - if (hctx->sched_data) { - e->type->ops.mq.exit_hctx(hctx, i); - hctx->sched_data = NULL; - } + queue_for_each_hw_ctx(q, hctx, i) { + blk_mq_debugfs_unregister_sched_hctx(hctx); + if (e->type->ops.mq.exit_hctx && hctx->sched_data) { + e->type->ops.mq.exit_hctx(hctx, i); + hctx->sched_data = NULL; } } + blk_mq_debugfs_unregister_sched(q); if (e->type->ops.mq.exit_sched) e->type->ops.mq.exit_sched(e); blk_mq_sched_tags_teardown(q); diff --git a/include/linux/blk-mq.h 
b/include/linux/blk-mq.h index de8ed9aaa156..c47aa248c640 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -60,6 +60,7 @@ struct blk_mq_hw_ctx { #ifdef CONFIG_BLK_DEBUG_FS struct dentry *debugfs_dir; + struct dentry *sched_debugfs_dir; #endif }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b49a79a29e58..80ae958717a1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -579,6 +579,7 @@ struct request_queue { #ifdef CONFIG_BLK_DEBUG_FS struct dentry *debugfs_dir; + struct dentry *sched_debugfs_dir; #endif bool mq_sysfs_init_done; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index d44840368ee7..9ec5e22846e0 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -8,6 +8,9 @@ struct io_cq; struct elevator_type; +#ifdef CONFIG_BLK_DEBUG_FS +struct blk_mq_debugfs_attr; +#endif /* * Return values from elevator merger @@ -144,6 +147,10 @@ struct elevator_type char elevator_name[ELV_NAME_MAX]; struct module *elevator_owner; bool uses_mq; +#ifdef CONFIG_BLK_DEBUG_FS + const struct blk_mq_debugfs_attr *queue_debugfs_attrs; + const struct blk_mq_debugfs_attr *hctx_debugfs_attrs; +#endif /* managed by elevator core */ char icq_cache_name[ELV_NAME_MAX + 5]; /* elvname + "_io_cq" */ -- cgit v1.2.3 From 17159420a6c18bb3515ff85598b5ccf1a572763d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 24 Apr 2017 10:00:10 -0700 Subject: fscrypt: introduce helper function for filename matching Introduce a helper function fscrypt_match_name() which tests whether a fscrypt_name matches a directory entry. Also clean up the magic numbers and document things properly. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- fs/crypto/fname.c | 90 ++++++++++++++++++++++++++++++++--------- fs/crypto/fscrypt_private.h | 2 - include/linux/fscrypt_notsupp.h | 9 +++++ include/linux/fscrypt_supp.h | 78 +++++++++++++++++++++++++++++++++++ 4 files changed, 157 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 15bf9c31a34d..d1bb02b1ee58 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -159,6 +159,8 @@ static int fname_decrypt(struct inode *inode, static const char *lookup_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; +#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) + /** * digest_encode() - * @@ -230,11 +232,14 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size); int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 ilen, struct fscrypt_str *crypto_str) { - unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); + u32 olen = fscrypt_fname_encrypted_size(inode, ilen); + const u32 max_encoded_len = + max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE), + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))); crypto_str->len = olen; - if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2) - olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2; + olen = max(olen, max_encoded_len); + /* * Allocated buffer can hold one more character to null-terminate the * string @@ -266,6 +271,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); * * The caller must have allocated sufficient memory for the @oname string. * + * If the key is available, we'll decrypt the disk name; otherwise, we'll encode + * it for presentation. Short names are directly base64-encoded, while long + * names are encoded in fscrypt_digested_name format. 
+ * * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(struct inode *inode, @@ -274,7 +283,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); - char buf[24]; + struct fscrypt_digested_name digested_name; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; @@ -289,20 +298,24 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, if (inode->i_crypt_info) return fname_decrypt(inode, iname, oname); - if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { + if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) { oname->len = digest_encode(iname->name, iname->len, oname->name); return 0; } if (hash) { - memcpy(buf, &hash, 4); - memcpy(buf + 4, &minor_hash, 4); + digested_name.hash = hash; + digested_name.minor_hash = minor_hash; } else { - memset(buf, 0, 8); + digested_name.hash = 0; + digested_name.minor_hash = 0; } - memcpy(buf + 8, iname->name + ((iname->len - 17) & ~15), 16); + memcpy(digested_name.digest, + FSCRYPT_FNAME_DIGEST(iname->name, iname->len), + FSCRYPT_FNAME_DIGEST_SIZE); oname->name[0] = '_'; - oname->len = 1 + digest_encode(buf, 24, oname->name + 1); + oname->len = 1 + digest_encode((const char *)&digested_name, + sizeof(digested_name), oname->name + 1); return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); @@ -336,10 +349,35 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, } EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); +/** + * fscrypt_setup_filename() - prepare to search a possibly encrypted directory + * @dir: the directory that will be searched + * @iname: the user-provided filename being searched for + * @lookup: 1 if we're allowed to proceed without the key because it's + * ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot + * proceed without the key because we're going to create the dir_entry. + * @fname: the filename information to be filled in + * + * Given a user-provided filename @iname, this function sets @fname->disk_name + * to the name that would be stored in the on-disk directory entry, if possible. + * If the directory is unencrypted this is simply @iname. Else, if we have the + * directory's encryption key, then @iname is the plaintext, so we encrypt it to + * get the disk_name. + * + * Else, for keyless @lookup operations, @iname is the presented ciphertext, so + * we decode it to get either the ciphertext disk_name (for short names) or the + * fscrypt_digested_name (for long names). Non-@lookup operations will be + * impossible in this case, so we fail them with ENOKEY. + * + * If successful, fscrypt_free_filename() must be called later to clean up. 
+ * + * Return: 0 on success, -errno on failure + */ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { - int ret = 0, bigname = 0; + int ret; + int digested; memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; @@ -373,25 +411,37 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, * We don't have the key and we are doing a lookup; decode the * user-supplied name */ - if (iname->name[0] == '_') - bigname = 1; - if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43))) - return -ENOENT; + if (iname->name[0] == '_') { + if (iname->len != + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))) + return -ENOENT; + digested = 1; + } else { + if (iname->len > + BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE)) + return -ENOENT; + digested = 0; + } - fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); + fname->crypto_buf.name = + kmalloc(max_t(size_t, FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE, + sizeof(struct fscrypt_digested_name)), + GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; - ret = digest_decode(iname->name + bigname, iname->len - bigname, + ret = digest_decode(iname->name + digested, iname->len - digested, fname->crypto_buf.name); if (ret < 0) { ret = -ENOENT; goto errout; } fname->crypto_buf.len = ret; - if (bigname) { - memcpy(&fname->hash, fname->crypto_buf.name, 4); - memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4); + if (digested) { + const struct fscrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + fname->hash = n->hash; + fname->minor_hash = n->minor_hash; } else { fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index e08ca6d1ca0f..1e1f8a361b75 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -13,8 +13,6 @@ #include -#define FS_FNAME_CRYPTO_DIGEST_SIZE 32 - /* Encryption parameters */ #define FS_XTS_TWEAK_SIZE 16 #define FS_AES_128_ECB_KEY_SIZE 16 diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 3511ca798804..ec406aed2f2f 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -147,6 +147,15 @@ static inline int fscrypt_fname_usr_to_disk(struct inode *inode, return -EOPNOTSUPP; } +static inline bool fscrypt_match_name(const struct fscrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + /* Encryption support disabled; use standard comparison */ + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); +} + /* bio.c */ static inline void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index a140f47e9b27..e12c224a0d1e 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -57,6 +57,84 @@ extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32, extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, struct fscrypt_str *); +#define FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE 32 + +/* Extracts the second-to-last ciphertext block; see explanation below */ +#define FSCRYPT_FNAME_DIGEST(name, len) \ + ((name) + round_down((len) - FS_CRYPTO_BLOCK_SIZE - 1, \ + FS_CRYPTO_BLOCK_SIZE)) + +#define FSCRYPT_FNAME_DIGEST_SIZE FS_CRYPTO_BLOCK_SIZE + +/** + * fscrypt_digested_name - alternate identifier for an on-disk filename + * 
+ * When userspace lists an encrypted directory without access to the key, + * filenames whose ciphertext is longer than FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE + * bytes are shown in this abbreviated form (base64-encoded) rather than as the + * full ciphertext (base64-encoded). This is necessary to allow supporting + * filenames up to NAME_MAX bytes, since base64 encoding expands the length. + * + * To make it possible for filesystems to still find the correct directory entry + * despite not knowing the full on-disk name, we encode any filesystem-specific + * 'hash' and/or 'minor_hash' which the filesystem may need for its lookups, + * followed by the second-to-last ciphertext block of the filename. Due to the + * use of the CBC-CTS encryption mode, the second-to-last ciphertext block + * depends on the full plaintext. (Note that ciphertext stealing causes the + * last two blocks to appear "flipped".) This makes collisions very unlikely: + * just a 1 in 2^128 chance for two filenames to collide even if they share the + * same filesystem-specific hashes. + * + * This scheme isn't strictly immune to intentional collisions because it's + * basically like a CBC-MAC, which isn't secure on variable-length inputs. + * However, generating a CBC-MAC collision requires the ability to choose + * arbitrary ciphertext, which won't normally be possible with filename + * encryption since it would require write access to the raw disk. + * + * Taking a real cryptographic hash like SHA-256 over the full ciphertext would + * be better in theory but would be less efficient and more complicated to + * implement, especially since the filesystem would need to calculate it for + * each directory entry examined during a search. + */ +struct fscrypt_digested_name { + u32 hash; + u32 minor_hash; + u8 digest[FSCRYPT_FNAME_DIGEST_SIZE]; +}; + +/** + * fscrypt_match_name() - test whether the given name matches a directory entry + * @fname: the name being searched for + * @de_name: the name from the directory entry + * @de_name_len: the length of @de_name in bytes + * + * Normally @fname->disk_name will be set, and in that case we simply compare + * that to the name stored in the directory entry. The only exception is that + * if we don't have the key for an encrypted directory and a filename in it is + * very long, then we won't have the full disk_name and we'll instead need to + * match against the fscrypt_digested_name. + * + * Return: %true if the name matches, otherwise %false. 
+ */ +static inline bool fscrypt_match_name(const struct fscrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + if (unlikely(!fname->disk_name.name)) { + const struct fscrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + if (WARN_ON_ONCE(fname->usr_fname->name[0] != '_')) + return false; + if (de_name_len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) + return false; + return !memcmp(FSCRYPT_FNAME_DIGEST(de_name, de_name_len), + n->digest, FSCRYPT_FNAME_DIGEST_SIZE); + } + + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); +} + /* bio.c */ extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); extern void fscrypt_pullback_bio_page(struct page **, bool); -- cgit v1.2.3 From 6f9d696f016f5b42f6c6e8c9f723f8d3380e5903 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 1 May 2017 11:43:32 -0700 Subject: fscrypt: correct collision claim for digested names As I noted on the mailing list, it's easier than I originally thought to create intentional collisions in the digested names. Unfortunately it's not too easy to solve this, so for now just fix the comment to not lie. Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o --- include/linux/fscrypt_supp.h | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index e12c224a0d1e..cd4e82c17304 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -81,20 +81,16 @@ extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, * followed by the second-to-last ciphertext block of the filename. Due to the * use of the CBC-CTS encryption mode, the second-to-last ciphertext block * depends on the full plaintext. (Note that ciphertext stealing causes the - * last two blocks to appear "flipped".) This makes collisions very unlikely: - * just a 1 in 2^128 chance for two filenames to collide even if they share the - * same filesystem-specific hashes. + * last two blocks to appear "flipped".) This makes accidental collisions very + * unlikely: just a 1 in 2^128 chance for two filenames to collide even if they + * share the same filesystem-specific hashes. * - * This scheme isn't strictly immune to intentional collisions because it's - * basically like a CBC-MAC, which isn't secure on variable-length inputs. - * However, generating a CBC-MAC collision requires the ability to choose - * arbitrary ciphertext, which won't normally be possible with filename - * encryption since it would require write access to the raw disk. - * - * Taking a real cryptographic hash like SHA-256 over the full ciphertext would - * be better in theory but would be less efficient and more complicated to - * implement, especially since the filesystem would need to calculate it for - * each directory entry examined during a search. + * However, this scheme isn't immune to intentional collisions, which can be + * created by anyone able to create arbitrary plaintext filenames and view them + * without the key. Making the "digest" be a real cryptographic hash like + * SHA-256 over the full ciphertext would prevent this, although it would be + * less efficient and harder to implement, especially since the filesystem would + * need to calculate it for each directory entry examined during a search. 
*/ struct fscrypt_digested_name { u32 hash; u32 minor_hash; u8 digest[FSCRYPT_FNAME_DIGEST_SIZE]; }; -- cgit v1.2.3 From f870a3c6727db5fcfeaa42d099f75872e4b17553 Mon Sep 17 00:00:00 2001 From: "sudarsana.kalluru@cavium.com" Date: Thu, 4 May 2017 08:15:03 -0700 Subject: qed*: Fix possible overflow for status block id field. The status block id can exceed 255 in 100G mode, so its data type needs to be widened from u8 to u16. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_dev.c | 8 ++++---- drivers/net/ethernet/qlogic/qed/qed_dev_api.h | 4 ++-- drivers/net/ethernet/qlogic/qed/qed_main.c | 2 +- drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 5 ++--- include/linux/qed/qed_if.h | 2 +- 5 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include/linux')
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index 5f31140d0b77..5c6874af4b65 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -3586,7 +3586,7 @@ static int qed_set_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, } int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, - u16 coalesce, u8 qid, u16 sb_id) + u16 coalesce, u16 qid, u16 sb_id) { struct ustorm_eth_queue_zone eth_qzone; u8 timeset, timer_res; @@ -3607,7 +3607,7 @@ int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, } timeset = (u8)(coalesce >> timer_res); - rc = qed_fw_l2_queue(p_hwfn, (u16)qid, &fw_qid); + rc = qed_fw_l2_queue(p_hwfn, qid, &fw_qid); if (rc) return rc; @@ -3628,7 +3628,7 @@ out: } int qed_set_txq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, - u16 coalesce, u8 qid, u16 sb_id) + u16 coalesce, u16 qid, u16 sb_id) { struct xstorm_eth_queue_zone eth_qzone; u8 timeset, timer_res; @@ -3649,7 +3649,7 @@ int qed_set_txq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, } timeset = (u8)(coalesce >> timer_res); - rc = qed_fw_l2_queue(p_hwfn, (u16)qid, &fw_qid); + rc = qed_fw_l2_queue(p_hwfn, qid, &fw_qid); if (rc) return rc;
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h index cefe3ee9064a..12d16c096e36 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h +++ b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h @@ -454,7 +454,7 @@ int qed_final_cleanup(struct qed_hwfn *p_hwfn, * @return int */ int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, - u16 coalesce, u8 qid, u16 sb_id); + u16 coalesce, u16 qid, u16 sb_id); /** * @brief qed_set_txq_coalesce - Configure coalesce parameters for a Tx queue @@ -471,7 +471,7 @@ int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, * @return int */ int qed_set_txq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, - u16 coalesce, u8 qid, u16 sb_id); + u16 coalesce, u16 qid, u16 sb_id); const char *qed_hw_get_resc_name(enum qed_resources res_id); #endif
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 59992cf20d42..a5eef1abc5a1 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -1521,7 +1521,7 @@ static void qed_get_coalesce(struct qed_dev *cdev, u16 *rx_coal, u16 *tx_coal) } static int qed_set_coalesce(struct qed_dev *cdev, u16 rx_coal, u16 tx_coal, - u8 qid, u16 sb_id) + u16 qid, u16 sb_id) { struct qed_hwfn *hwfn; struct qed_ptt *ptt;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c index 4dcfe9614731..b22753c5fd44 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c +++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c @@ -706,8 +706,7 @@ static int qede_set_coalesce(struct net_device *dev, { struct qede_dev *edev = netdev_priv(dev); int i, rc = 0; - u16 rxc, txc; - u8 sb_id; + u16 rxc, txc, sb_id; if (!netif_running(dev)) { DP_INFO(edev, "Interface is down\n"); @@ -729,7 +728,7 @@ static int qede_set_coalesce(struct net_device *dev, for_each_queue(i) { sb_id = edev->fp_array[i].sb_info->igu_sb_id; rc = edev->ops->common->set_coalesce(edev->cdev, rxc, txc, - (u8)i, sb_id); + (u16)i, sb_id); if (rc) { DP_INFO(edev, "Set coalesce error, rc = %d\n", rc); return rc; diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 5544d7b2f2bb..c70ac13a97e6 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -635,7 +635,7 @@ struct qed_common_ops { * @return 0 on success, error otherwise. */ int (*set_coalesce)(struct qed_dev *cdev, u16 rx_coal, u16 tx_coal, - u8 qid, u16 sb_id); + u16 qid, u16 sb_id); /** * @brief set_led - Configure LED mode -- cgit v1.2.3 From 17a70355ea576843a7ac851f1db26872a50b2850 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 4 May 2017 12:56:12 -0500 Subject: of: fix sparse warnings in fdt, irq, reserved mem, and resolver code sparse generates the following warnings in drivers/of/: ../drivers/of/fdt.c:63:36: warning: cast to restricted __be32 ../drivers/of/fdt.c:68:33: warning: cast to restricted __be32 ../drivers/of/irq.c:105:88: warning: incorrect type in initializer (different base types) ../drivers/of/irq.c:105:88: expected restricted __be32 ../drivers/of/irq.c:105:88: got int ../drivers/of/irq.c:526:35: warning: incorrect type in assignment (different modifiers) ../drivers/of/irq.c:526:35: expected int ( *const [usertype] irq_init_cb )( ... ) ../drivers/of/irq.c:526:35: got void const *const data ../drivers/of/of_reserved_mem.c:200:50: warning: incorrect type in initializer (different modifiers) ../drivers/of/of_reserved_mem.c:200:50: expected int ( *[usertype] initfn )( ... ) ../drivers/of/of_reserved_mem.c:200:50: got void const *const data ../drivers/of/resolver.c:95:42: warning: incorrect type in assignment (different base types) ../drivers/of/resolver.c:95:42: expected unsigned int [unsigned] [usertype] ../drivers/of/resolver.c:95:42: got restricted __be32 [usertype] All these are harmless type mismatches fixed by adjusting the types. 
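To make the first class of warning concrete, the pattern behind the __be32 fixes is sketched below (illustrative only, not code from this patch; the helper and the property name are invented). FDT cells are big-endian on every host, so they are held in __be32 and converted explicitly instead of being read through a plain uint32_t pointer:

static u32 example_first_reg_cell(const void *fdt, int node)
{
	int len;
	const __be32 *prop = fdt_getprop(fdt, node, "reg", &len);

	if (!prop || len < (int)sizeof(*prop))
		return 0;
	/* The explicit conversion is what keeps sparse quiet. */
	return be32_to_cpu(prop[0]);
}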
Signed-off-by: Rob Herring --- drivers/of/fdt.c | 4 ++-- drivers/of/irq.c | 2 +- drivers/of/of_reserved_mem.c | 2 +- drivers/of/resolver.c | 2 +- include/linux/of_irq.h | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux')
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index e33f7818bc6c..a0972219ccfc 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -48,8 +48,8 @@ void of_fdt_limit_memory(int limit) const void *val; int nr_address_cells = OF_ROOT_NODE_ADDR_CELLS_DEFAULT; int nr_size_cells = OF_ROOT_NODE_SIZE_CELLS_DEFAULT; - const uint32_t *addr_prop; - const uint32_t *size_prop; + const __be32 *addr_prop; + const __be32 *size_prop; int root_offset; int cell_size;
diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 7c56b72d1dc6..d11437cb1187 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -102,7 +102,7 @@ int of_irq_parse_raw(const __be32 *addr, struct of_phandle_args *out_irq) struct device_node *ipar, *tnode, *old = NULL, *newpar = NULL; __be32 initial_match_array[MAX_PHANDLE_ARGS]; const __be32 *match_array = initial_match_array; - const __be32 *tmp, *imap, *imask, dummy_imask[] = { [0 ... MAX_PHANDLE_ARGS] = ~0 }; + const __be32 *tmp, *imap, *imask, dummy_imask[] = { [0 ... MAX_PHANDLE_ARGS] = cpu_to_be32(~0) }; u32 intsize = 1, addrsize, newintsize = 0, newaddrsize = 0; int imaplen, match, i, rc = -EINVAL;
diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index d507c3569a88..4dec07ea510f 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -197,7 +197,7 @@ static int __init __reserved_mem_init_node(struct reserved_mem *rmem) const struct of_device_id *i; for (i = __reservedmem_of_table; i < &__rmem_of_table_sentinel; i++) { - reservedmem_of_init_fn initfn = i->data; + int const (*initfn)(struct reserved_mem *rmem) = i->data; const char *compat = i->compatible; if (!of_flat_dt_is_compatible(rmem->fdt_node, compat))
diff --git a/drivers/of/resolver.c b/drivers/of/resolver.c index 7ae9863cb0a4..771f4844c781 100644 --- a/drivers/of/resolver.c +++ b/drivers/of/resolver.c @@ -92,7 +92,7 @@ static void adjust_overlay_phandles(struct device_node *overlay, if (phandle == OF_PHANDLE_ILLEGAL) continue; - *(uint32_t *)prop->value = cpu_to_be32(overlay->phandle); + *(__be32 *)prop->value = cpu_to_be32(overlay->phandle); } for_each_child_of_node(overlay, child)
diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 1e0deb8e8494..ec6b11deb773 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -8,7 +8,7 @@ #include #include -typedef int (*of_irq_init_cb_t)(struct device_node *, struct device_node *); +typedef int const (*of_irq_init_cb_t)(struct device_node *, struct device_node *); /* * Workarounds only applied to 32bit powermac machines -- cgit v1.2.3
From 693dfd5a3f19efc44acf3a57217c0480e414f8ee Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Thu, 27 Apr 2017 17:01:34 +0300 Subject: IB/mlx5: Enable IPoIB acceleration Enable mlx5 IPoIB acceleration by declaring mlx5_ib_{alloc,free}_rdma_netdev and assigning the mlx5 IPoIB rdma_netdev callbacks. In addition, this patch brings the mlx5 IPoIB parts in the net and IB trees into sync. As a precaution, we disabled IPoIB acceleration by default (in the mlx5_core Kconfig file).
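Note that with the Kconfig default flipped to 'n', CONFIG_MLX5_CORE_IPOIB=y is now required to get the acceleration. For orientation, a ULP reaches the new callbacks roughly as in the sketch below (illustrative only; the wrapper and the setup routine are not part of this patch):

static void example_setup(struct net_device *dev)
{
	/* ULP-specific net_device initialization would go here. */
}

static struct net_device *example_alloc_ipoib_netdev(struct ib_device *hca,
						     u8 port_num)
{
	if (!hca->alloc_rdma_netdev)
		return ERR_PTR(-EOPNOTSUPP);

	return hca->alloc_rdma_netdev(hca, port_num, RDMA_NETDEV_IPOIB,
				      "ib%d", NET_NAME_UNKNOWN,
				      example_setup);
}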
Signed-off-by: Saeed Mahameed Signed-off-by: Erez Shitrit Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 22 +++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 2 +- drivers/net/ethernet/mellanox/mlx5/core/ipoib.c | 44 +++++++++++++++---------- drivers/net/ethernet/mellanox/mlx5/core/ipoib.h | 2 ++ include/linux/mlx5/driver.h | 19 +++++++++++ 5 files changed, 70 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 9f3ba320ce70..d45772da0963 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3530,6 +3530,26 @@ static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, return num_counters; } +static struct net_device* +mlx5_ib_alloc_rdma_netdev(struct ib_device *hca, + u8 port_num, + enum rdma_netdev_t type, + const char *name, + unsigned char name_assign_type, + void (*setup)(struct net_device *)) +{ + if (type != RDMA_NETDEV_IPOIB) + return ERR_PTR(-EOPNOTSUPP); + + return mlx5_rdma_netdev_alloc(to_mdev(hca)->mdev, hca, + name, setup); +} + +static void mlx5_ib_free_rdma_netdev(struct net_device *netdev) +{ + return mlx5_rdma_netdev_free(netdev); +} + static void *mlx5_ib_add(struct mlx5_core_dev *mdev) { struct mlx5_ib_dev *dev; @@ -3660,6 +3680,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; dev->ib_dev.get_port_immutable = mlx5_port_immutable; dev->ib_dev.get_dev_fw_str = get_dev_fw_str; + dev->ib_dev.alloc_rdma_netdev = mlx5_ib_alloc_rdma_netdev; + dev->ib_dev.free_rdma_netdev = mlx5_ib_free_rdma_netdev; if (mlx5_core_is_pf(mdev)) { dev->ib_dev.get_vf_config = mlx5_ib_get_vf_config; dev->ib_dev.set_vf_link_state = mlx5_ib_set_vf_link_state; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index a84b652f9b54..fc52d742b7f7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -35,6 +35,6 @@ config MLX5_CORE_EN_DCB config MLX5_CORE_IPOIB bool "Mellanox Technologies ConnectX-4 IPoIB offloads support" depends on MLX5_CORE_EN - default y + default n ---help--- MLX5 IPoIB offloads & acceleration support. diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib.c index 3c84e36af018..019c230da498 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib.c @@ -30,6 +30,7 @@ * SOFTWARE. 
*/ +#include #include #include "en.h" #include "ipoib.h" @@ -359,10 +360,10 @@ unlock: return 0; } -#ifdef notusedyet /* IPoIB RDMA netdev callbacks */ static int mlx5i_attach_mcast(struct net_device *netdev, struct ib_device *hca, - union ib_gid *gid, u16 lid, int set_qkey) + union ib_gid *gid, u16 lid, int set_qkey, + u32 qkey) { struct mlx5e_priv *epriv = mlx5i_epriv(netdev); struct mlx5_core_dev *mdev = epriv->mdev; @@ -375,6 +376,12 @@ static int mlx5i_attach_mcast(struct net_device *netdev, struct ib_device *hca, mlx5_core_warn(mdev, "failed attaching QPN 0x%x, MGID %pI6\n", ipriv->qp.qpn, gid->raw); + if (set_qkey) { + mlx5_core_dbg(mdev, "%s setting qkey 0x%x\n", + netdev->name, qkey); + ipriv->qkey = qkey; + } + return err; } @@ -397,15 +404,15 @@ static int mlx5i_detach_mcast(struct net_device *netdev, struct ib_device *hca, } static int mlx5i_xmit(struct net_device *dev, struct sk_buff *skb, - struct ib_ah *address, u32 dqpn, u32 dqkey) + struct ib_ah *address, u32 dqpn) { struct mlx5e_priv *epriv = mlx5i_epriv(dev); struct mlx5e_txqsq *sq = epriv->txq2sq[skb_get_queue_mapping(skb)]; struct mlx5_ib_ah *mah = to_mah(address); + struct mlx5i_priv *ipriv = epriv->ppriv; - return mlx5i_sq_xmit(sq, skb, &mah->av, dqpn, dqkey); + return mlx5i_sq_xmit(sq, skb, &mah->av, dqpn, ipriv->qkey); } -#endif static int mlx5i_check_required_hca_cap(struct mlx5_core_dev *mdev) { @@ -414,22 +421,23 @@ static int mlx5i_check_required_hca_cap(struct mlx5_core_dev *mdev) if (!MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads)) { mlx5_core_warn(mdev, "IPoIB enhanced offloads are not supported\n"); - return -ENOTSUPP; + return -EOPNOTSUPP; } return 0; } -static struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev, - struct ib_device *ibdev, - const char *name, - void (*setup)(struct net_device *)) +struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev, + struct ib_device *ibdev, + const char *name, + void (*setup)(struct net_device *)) { const struct mlx5e_profile *profile = &mlx5i_nic_profile; int nch = profile->max_nch(mdev); struct net_device *netdev; struct mlx5i_priv *ipriv; struct mlx5e_priv *epriv; + struct rdma_netdev *rn; int err; if (mlx5i_check_required_hca_cap(mdev)) { @@ -464,13 +472,13 @@ static struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev, mlx5e_attach_netdev(epriv); netif_carrier_off(netdev); - /* TODO: set rdma_netdev func pointers - * rn = &ipriv->rn; - * rn->hca = ibdev; - * rn->send = mlx5i_xmit; - * rn->attach_mcast = mlx5i_attach_mcast; - * rn->detach_mcast = mlx5i_detach_mcast; - */ + /* set rdma_netdev func pointers */ + rn = &ipriv->rn; + rn->hca = ibdev; + rn->send = mlx5i_xmit; + rn->attach_mcast = mlx5i_attach_mcast; + rn->detach_mcast = mlx5i_detach_mcast; + return netdev; err_free_netdev: @@ -482,7 +490,7 @@ free_mdev_resources: } EXPORT_SYMBOL(mlx5_rdma_netdev_alloc); -static void mlx5_rdma_netdev_free(struct net_device *netdev) +void mlx5_rdma_netdev_free(struct net_device *netdev) { struct mlx5e_priv *priv = mlx5i_epriv(netdev); const struct mlx5e_profile *profile = priv->profile; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib.h b/drivers/net/ethernet/mellanox/mlx5/core/ipoib.h index bae0a5cbc8ad..213191a78464 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib.h @@ -40,7 +40,9 @@ /* ipoib rdma netdev's private data structure */ struct mlx5i_priv { + struct rdma_netdev rn; /* keep this first */ struct mlx5_core_qp qp; + u32 qkey; char *mlx5e_priv[0]; 
}; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 3fece51dcf13..cef2b98d479f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1102,6 +1102,25 @@ struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev); struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev); void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up); +#ifndef CONFIG_MLX5_CORE_IPOIB +static inline +struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev, + struct ib_device *ibdev, + const char *name, + void (*setup)(struct net_device *)) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void mlx5_rdma_netdev_free(struct net_device *netdev) {} +#else +struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev, + struct ib_device *ibdev, + const char *name, + void (*setup)(struct net_device *)); +void mlx5_rdma_netdev_free(struct net_device *netdev); +#endif /* CONFIG_MLX5_CORE_IPOIB */ + struct mlx5_profile { u64 mask; u8 log_max_qp; -- cgit v1.2.3 From 8f078b38dd382710884ce7abd31a1935c440e6f8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 4 May 2017 14:01:24 -0700 Subject: libnvdimm: convert NDD_ flags to use bitops, introduce NDD_LOCKED This is a preparation patch for handling locked nvdimm label regions, a new concept as introduced by the latest DSM document on pmem.io [1]. A future patch will leverage nvdimm_set_locked() at DIMM probe time to flag regions that can not be enabled. There should be no functional difference resulting from this change. [1]: http://pmem.io/documents/NVDIMM_DSM_Interface_Example-V1.3.pdf Signed-off-by: Dan Williams --- drivers/acpi/nfit/core.c | 4 ++-- drivers/nvdimm/dimm_devs.c | 11 +++++++++-- drivers/nvdimm/namespace_devs.c | 2 +- drivers/nvdimm/nd.h | 1 + drivers/nvdimm/region_devs.c | 4 ++-- include/linux/libnvdimm.h | 6 ++++-- 6 files changed, 19 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index ced95a79c1a1..a26c2297c1ed 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -1512,7 +1512,7 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) } if (nfit_mem->bdw && nfit_mem->memdev_pmem) - flags |= NDD_ALIASING; + set_bit(NDD_ALIASING, &flags); /* collate flags across all memdevs for this dimm */ list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) { @@ -1527,7 +1527,7 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc) mem_flags = __to_nfit_memdev(nfit_mem)->flags; if (mem_flags & ACPI_NFIT_MEM_NOT_ARMED) - flags |= NDD_UNARMED; + set_bit(NDD_UNARMED, &flags); rc = acpi_nfit_add_dimm(acpi_desc, nfit_mem, device_handle); if (rc) diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index 8b721321be5b..7d1a3dbc7d5d 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -34,7 +34,7 @@ int nvdimm_check_config_data(struct device *dev) if (!nvdimm->cmd_mask || !test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask)) { - if (nvdimm->flags & NDD_ALIASING) + if (test_bit(NDD_ALIASING, &nvdimm->flags)) return -ENXIO; else return -ENOTTY; @@ -188,7 +188,14 @@ void nvdimm_set_aliasing(struct device *dev) { struct nvdimm *nvdimm = to_nvdimm(dev); - nvdimm->flags |= NDD_ALIASING; + set_bit(NDD_ALIASING, &nvdimm->flags); +} + +void nvdimm_set_locked(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + set_bit(NDD_LOCKED, &nvdimm->flags); } static void 
nvdimm_release(struct device *dev) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 1b481a5fb966..2f2d8afc684a 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -2240,7 +2240,7 @@ static int init_active_labels(struct nd_region *nd_region) * being activated if it aliases DPA. */ if (!ndd) { - if ((nvdimm->flags & NDD_ALIASING) == 0) + if (!test_bit(NDD_ALIASING, &nvdimm->flags)) return 0; dev_dbg(&nd_region->dev, "%s: is disabled, failing probe\n", dev_name(&nd_mapping->nvdimm->dev)); diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index c3b33cf655fb..77d032192bf7 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -240,6 +240,7 @@ int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset, long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, unsigned int len); void nvdimm_set_aliasing(struct device *dev); +void nvdimm_set_locked(struct device *dev); struct nd_btt *to_nd_btt(struct device *dev); struct nd_gen_sb { diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 07756b2e1cd5..b550edf2571f 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -222,7 +222,7 @@ int nd_region_to_nstype(struct nd_region *nd_region) struct nd_mapping *nd_mapping = &nd_region->mapping[i]; struct nvdimm *nvdimm = nd_mapping->nvdimm; - if (nvdimm->flags & NDD_ALIASING) + if (test_bit(NDD_ALIASING, &nvdimm->flags)) alias++; } if (alias) @@ -881,7 +881,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, return NULL; } - if (nvdimm->flags & NDD_UNARMED) + if (test_bit(NDD_UNARMED, &nvdimm->flags)) ro = 1; } diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index f07b1b14159a..6c807017128d 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -20,9 +20,11 @@ enum { /* when a dimm supports both PMEM and BLK access a label is required */ - NDD_ALIASING = 1 << 0, + NDD_ALIASING = 0, /* unarmed memory devices may not persist writes */ - NDD_UNARMED = 1 << 1, + NDD_UNARMED = 1, + /* locked memory devices should not be accessed */ + NDD_LOCKED = 2, /* need to set a limit somewhere, but yes, this is likely overkill */ ND_IOCTL_MAX_BUFLEN = SZ_4M, -- cgit v1.2.3 From deccf497d804a4c5fca2dbfad2f104675a6f9102 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 4 May 2017 23:30:16 +0100 Subject: Make stat/lstat/fstatat pass AT_NO_AUTOMOUNT to vfs_statx() stat/lstat/fstatat need to pass AT_NO_AUTOMOUNT to vfs_statx() as the pre-statx code didn't set LOOKUP_AUTOMOUNT, even though fstatat() accepted the AT_NO_AUTOMOUNT flag. 
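The userspace-visible effect is that plain stat() keeps its historical behaviour of not triggering automounts, and a statx() caller gets the same guarantee by passing the flag explicitly. A hedged illustration using the raw syscall (glibc had no statx() wrapper at the time):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/stat.h>

/* Illustrative only: stat a path without triggering an automount. */
static int stat_no_automount(const char *path, struct statx *stx)
{
	return syscall(__NR_statx, AT_FDCWD, path, AT_NO_AUTOMOUNT,
		       STATX_BASIC_STATS, stx);
}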
Fixes: a528d35e8bfc ("statx: Add a system call to make enhanced file info available") Reported-by: Ian Kent Signed-off-by: David Howells Tested-by: Ian Kent cc: stable@vger.kernel.org Signed-off-by: Al Viro --- include/linux/fs.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index fc1b4faa6272..866c955314db 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2920,17 +2920,19 @@ extern int vfs_statx_fd(unsigned int, struct kstat *, u32, unsigned int); static inline int vfs_stat(const char __user *filename, struct kstat *stat) { - return vfs_statx(AT_FDCWD, filename, 0, stat, STATX_BASIC_STATS); + return vfs_statx(AT_FDCWD, filename, AT_NO_AUTOMOUNT, + stat, STATX_BASIC_STATS); } static inline int vfs_lstat(const char __user *name, struct kstat *stat) { - return vfs_statx(AT_FDCWD, name, AT_SYMLINK_NOFOLLOW, + return vfs_statx(AT_FDCWD, name, AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT, stat, STATX_BASIC_STATS); } static inline int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags) { - return vfs_statx(dfd, filename, flags, stat, STATX_BASIC_STATS); + return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT, + stat, STATX_BASIC_STATS); } static inline int vfs_fstat(int fd, struct kstat *stat) { -- cgit v1.2.3 From 2be83da85a64773efaa407639de81bd1377f880e Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 4 May 2017 12:34:32 +0100 Subject: thermal: devfreq_cooling: add new interface for direct power read This patch introduces a new interface for device drivers connected to devfreq_cooling in the thermal framework: get_real_power(). Some devices have more sophisticated methods (like power counters) to approximate the actual power that they use. In the previous implementation we had a pre-calculated power table which was then scaled by 'utilization' ('busy_time' and 'total_time' taken from devfreq 'last_status'). With this new interface the driver can provide more precise data regarding actual power to the thermal governor every time the power budget is calculated. We then use this value and calculate the real resource utilization scaling factor. Reviewed-by: Chris Diamand Acked-by: Javi Merino Signed-off-by: Lukasz Luba --- drivers/thermal/devfreq_cooling.c | 105 +++++++++++++++++++++++++++++--------- include/linux/devfreq_cooling.h | 19 +++++++ 2 files changed, 101 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c index af9d32837a3a..26c31571a12c 100644 --- a/drivers/thermal/devfreq_cooling.c +++ b/drivers/thermal/devfreq_cooling.c @@ -28,6 +28,8 @@ #include +#define SCALE_ERROR_MITIGATION 100 + static DEFINE_IDA(devfreq_ida); /** @@ -45,6 +47,12 @@ static DEFINE_IDA(devfreq_ida); * @freq_table_size: Size of the @freq_table and @power_table * @power_ops: Pointer to devfreq_cooling_power, used to generate the * @power_table. + * @res_util: Resource utilization scaling factor for the power. + * It is multiplied by 100 to minimize the error. It is used + * for estimation of the power budget instead of using + * 'utilization' (which is 'busy_time / 'total_time'). + * The 'res_util' range is from 100 to (power_table[state] * 100) + * for the corresponding 'state'. 
*/ struct devfreq_cooling_device { int id; @@ -55,6 +63,8 @@ struct devfreq_cooling_device { u32 *freq_table; size_t freq_table_size; struct devfreq_cooling_power *power_ops; + u32 res_util; + int capped_state; }; /** @@ -250,6 +260,16 @@ get_dynamic_power(struct devfreq_cooling_device *dfc, unsigned long freq, return power; } + +static inline unsigned long get_total_power(struct devfreq_cooling_device *dfc, + unsigned long freq, + unsigned long voltage) +{ + return get_static_power(dfc, freq) + get_dynamic_power(dfc, freq, + voltage); +} + + static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cdev, struct thermal_zone_device *tz, u32 *power) @@ -259,27 +279,55 @@ static int devfreq_cooling_get_requested_power(struct thermal_cooling_device *cd struct devfreq_dev_status *status = &df->last_status; unsigned long state; unsigned long freq = status->current_frequency; - u32 dyn_power, static_power; + unsigned long voltage; + u32 dyn_power = 0; + u32 static_power = 0; + int res; - /* Get dynamic power for state */ state = freq_get_state(dfc, freq); - if (state == THERMAL_CSTATE_INVALID) - return -EAGAIN; + if (state == THERMAL_CSTATE_INVALID) { + res = -EAGAIN; + goto fail; + } - dyn_power = dfc->power_table[state]; + if (dfc->power_ops->get_real_power) { + voltage = get_voltage(df, freq); + if (voltage == 0) { + res = -EINVAL; + goto fail; + } - /* Scale dynamic power for utilization */ - dyn_power = (dyn_power * status->busy_time) / status->total_time; + res = dfc->power_ops->get_real_power(df, power, freq, voltage); + if (!res) { + state = dfc->capped_state; + dfc->res_util = dfc->power_table[state]; + dfc->res_util *= SCALE_ERROR_MITIGATION; - /* Get static power */ - static_power = get_static_power(dfc, freq); + if (*power > 1) + dfc->res_util /= *power; + } else { + goto fail; + } + } else { + dyn_power = dfc->power_table[state]; + + /* Scale dynamic power for utilization */ + dyn_power *= status->busy_time; + dyn_power /= status->total_time; + /* Get static power */ + static_power = get_static_power(dfc, freq); + + *power = dyn_power + static_power; + } trace_thermal_power_devfreq_get_power(cdev, status, freq, dyn_power, static_power); - *power = dyn_power + static_power; - return 0; +fail: + /* It is safe to set max in this case */ + dfc->res_util = SCALE_ERROR_MITIGATION; + return res; } static int devfreq_cooling_state2power(struct thermal_cooling_device *cdev, @@ -312,26 +360,34 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev, unsigned long busy_time; s32 dyn_power; u32 static_power; + s32 est_power; int i; - static_power = get_static_power(dfc, freq); + if (dfc->power_ops->get_real_power) { + /* Scale for resource utilization */ + est_power = power * dfc->res_util; + est_power /= SCALE_ERROR_MITIGATION; + } else { + static_power = get_static_power(dfc, freq); - dyn_power = power - static_power; - dyn_power = dyn_power > 0 ? dyn_power : 0; + dyn_power = power - static_power; + dyn_power = dyn_power > 0 ? dyn_power : 0; - /* Scale dynamic power for utilization */ - busy_time = status->busy_time ?: 1; - dyn_power = (dyn_power * status->total_time) / busy_time; + /* Scale dynamic power for utilization */ + busy_time = status->busy_time ?: 1; + est_power = (dyn_power * status->total_time) / busy_time; + } /* * Find the first cooling state that is within the power * budget for dynamic power. 
*/ for (i = 0; i < dfc->freq_table_size - 1; i++) - if (dyn_power >= dfc->power_table[i]) + if (est_power >= dfc->power_table[i]) break; *state = i; + dfc->capped_state = i; trace_thermal_power_devfreq_limit(cdev, freq, *state, power); return 0; } @@ -387,7 +443,7 @@ static int devfreq_cooling_gen_tables(struct devfreq_cooling_device *dfc) } for (i = 0, freq = ULONG_MAX; i < num_opps; i++, freq--) { - unsigned long power_dyn, voltage; + unsigned long power, voltage; struct dev_pm_opp *opp; opp = dev_pm_opp_find_freq_floor(dev, &freq); @@ -400,12 +456,15 @@ static int devfreq_cooling_gen_tables(struct devfreq_cooling_device *dfc) dev_pm_opp_put(opp); if (dfc->power_ops) { - power_dyn = get_dynamic_power(dfc, freq, voltage); + if (dfc->power_ops->get_real_power) + power = get_total_power(dfc, freq, voltage); + else + power = get_dynamic_power(dfc, freq, voltage); - dev_dbg(dev, "Dynamic power table: %lu MHz @ %lu mV: %lu = %lu mW\n", - freq / 1000000, voltage, power_dyn, power_dyn); + dev_dbg(dev, "Power table: %lu MHz @ %lu mV: %lu = %lu mW\n", + freq / 1000000, voltage, power, power); - power_table[i] = power_dyn; + power_table[i] = power; } freq_table[i] = freq;
diff --git a/include/linux/devfreq_cooling.h b/include/linux/devfreq_cooling.h index c35d0c0e0ada..4635f95000a4 100644 --- a/include/linux/devfreq_cooling.h +++ b/include/linux/devfreq_cooling.h @@ -34,6 +34,23 @@ * If get_dynamic_power() is NULL, then the * dynamic power is calculated as * @dyn_power_coeff * frequency * voltage^2 + * @get_real_power: When this is set, the framework uses it to ask the + * device driver for the actual power. + * Some devices have more sophisticated methods + * (like power counters) to approximate the actual power + * that they use. + * This function provides more accurate data to the + * thermal governor. When the driver does not provide + * such function, framework just uses pre-calculated + * table and scale the power by 'utilization' + * (based on 'busy_time' and 'total_time' taken from + * devfreq 'last_status'). + * The value returned by this function must be lower + * or equal than the maximum power value + * for the current state + * (which can be found in power_table[state]). + * When this interface is used, the power_table holds + * max total (static + dynamic) power value for each OPP. */ struct devfreq_cooling_power { unsigned long (*get_static_power)(struct devfreq *devfreq, @@ -41,6 +58,8 @@ struct devfreq_cooling_power { unsigned long (*get_dynamic_power)(struct devfreq *devfreq, unsigned long freq, unsigned long voltage); + int (*get_real_power)(struct devfreq *df, u32 *power, + unsigned long freq, unsigned long voltage); unsigned long dyn_power_coeff; }; -- cgit v1.2.3
From 8a537ece3d946227e4afa81eae0e43fa47439c7d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 26 Apr 2017 23:22:09 +0200 Subject: PM / wakeup: Integrate mechanism to abort transitions in progress The system wakeup framework is not very consistent with respect to the way it handles suspend-to-idle and generally wakeup events occurring during transitions to system low-power states. First off, system transitions in progress are aborted by the event reporting helpers like pm_wakeup_event() only if the wakeup_count sysfs attribute is in use (as documented), but there are cases in which system-wide transitions should be aborted even if that is not the case. For example, a wakeup signal from a designated wakeup device during a system-wide PM transition should cause the transition to be aborted right away.
Moreover, there is a freeze_wake() call in wakeup_source_activate(), but that really is only effective after suspend_freeze_state has been set to FREEZE_STATE_ENTER by freeze_enter(). However, it is very unlikely that wakeup_source_activate() will ever be called at that time, as it could only be triggered by an IRQF_NO_SUSPEND interrupt handler, so wakeups from suspend-to-idle don't really occur in wakeup_source_activate(). At the same time there is a way to abort a system suspend in progress (or wake up the system from suspend-to-idle), which is by calling pm_system_wakeup(), but in turn that doesn't cause any wakeup source objects to be activated, so it will not be covered by wakeup source statistics and will not prevent the system from suspending again immediately (in case autosleep is used, for example). Consequently, if anyone wants to abort system transitions in progress and allow the wakeup_count mechanism to work, they need to call both pm_system_wakeup() and pm_wakeup_event() at the same time, which is awkward. For the above reasons, make it possible to trigger pm_system_wakeup() from within wakeup_source_activate() and provide a new pm_wakeup_hard_event() helper to do so within the wakeup framework. Signed-off-by: Rafael J. Wysocki --- drivers/base/power/wakeup.c | 36 ++++++++++++++++++------------------ include/linux/pm_wakeup.h | 25 +++++++++++++++++++++---- 2 files changed, 39 insertions(+), 22 deletions(-) (limited to 'include/linux')
diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 136854970489..84c41c98d6fc 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -512,12 +512,13 @@ static bool wakeup_source_not_registered(struct wakeup_source *ws) /** * wakup_source_activate - Mark given wakeup source as active. * @ws: Wakeup source to handle. + * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. * * Update the @ws' statistics and, if @ws has just been activated, notify the PM * core of the event by incrementing the counter of of wakeup events being * processed. */ -static void wakeup_source_activate(struct wakeup_source *ws) +static void wakeup_source_activate(struct wakeup_source *ws, bool hard) { unsigned int cec; @@ -525,11 +526,8 @@ static void wakeup_source_activate(struct wakeup_source *ws) "unregistered wakeup source\n")) return; - /* - * active wakeup source should bring the system - * out of PM_SUSPEND_FREEZE state - */ - freeze_wake(); + if (hard) + pm_system_wakeup(); ws->active = true; ws->active_count++; @@ -546,8 +544,9 @@ static void wakeup_source_activate(struct wakeup_source *ws) /** * wakeup_source_report_event - Report wakeup event using the given source. * @ws: Wakeup source to report the event for. + * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. */ -static void wakeup_source_report_event(struct wakeup_source *ws) +static void wakeup_source_report_event(struct wakeup_source *ws, bool hard) { ws->event_count++; /* This is racy, but the counter is approximate anyway.
*/ @@ -555,7 +554,7 @@ static void wakeup_source_report_event(struct wakeup_source *ws) ws->wakeup_count++; if (!ws->active) - wakeup_source_activate(ws); + wakeup_source_activate(ws, hard); } /** @@ -573,7 +572,7 @@ void __pm_stay_awake(struct wakeup_source *ws) spin_lock_irqsave(&ws->lock, flags); - wakeup_source_report_event(ws); + wakeup_source_report_event(ws, false); del_timer(&ws->timer); ws->timer_expires = 0; @@ -739,9 +738,10 @@ static void pm_wakeup_timer_fn(unsigned long data) } /** - * __pm_wakeup_event - Notify the PM core of a wakeup event. + * pm_wakeup_ws_event - Notify the PM core of a wakeup event. * @ws: Wakeup source object associated with the event source. * @msec: Anticipated event processing time (in milliseconds). + * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. * * Notify the PM core of a wakeup event whose source is @ws that will take * approximately @msec milliseconds to be processed by the kernel. If @ws is @@ -750,7 +750,7 @@ static void pm_wakeup_timer_fn(unsigned long data) * * It is safe to call this function from interrupt context. */ -void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) +void pm_wakeup_ws_event(struct wakeup_source *ws, unsigned int msec, bool hard) { unsigned long flags; unsigned long expires; @@ -760,7 +760,7 @@ void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) spin_lock_irqsave(&ws->lock, flags); - wakeup_source_report_event(ws); + wakeup_source_report_event(ws, hard); if (!msec) { wakeup_source_deactivate(ws); @@ -779,17 +779,17 @@ void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) unlock: spin_unlock_irqrestore(&ws->lock, flags); } -EXPORT_SYMBOL_GPL(__pm_wakeup_event); - +EXPORT_SYMBOL_GPL(pm_wakeup_ws_event); /** * pm_wakeup_event - Notify the PM core of a wakeup event. * @dev: Device the wakeup event is related to. * @msec: Anticipated event processing time (in milliseconds). + * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. * - * Call __pm_wakeup_event() for the @dev's wakeup source object. + * Call pm_wakeup_ws_event() for the @dev's wakeup source object. 
*/ -void pm_wakeup_event(struct device *dev, unsigned int msec) +void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard) { unsigned long flags; @@ -797,10 +797,10 @@ void pm_wakeup_event(struct device *dev, unsigned int msec) return; spin_lock_irqsave(&dev->power.lock, flags); - __pm_wakeup_event(dev->power.wakeup, msec); + pm_wakeup_ws_event(dev->power.wakeup, msec, hard); spin_unlock_irqrestore(&dev->power.lock, flags); } -EXPORT_SYMBOL_GPL(pm_wakeup_event); +EXPORT_SYMBOL_GPL(pm_wakeup_dev_event); void pm_print_active_wakeup_sources(void) { diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index a3447932df1f..4c2cba7ec1d4 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -106,8 +106,8 @@ extern void __pm_stay_awake(struct wakeup_source *ws); extern void pm_stay_awake(struct device *dev); extern void __pm_relax(struct wakeup_source *ws); extern void pm_relax(struct device *dev); -extern void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec); -extern void pm_wakeup_event(struct device *dev, unsigned int msec); +extern void pm_wakeup_ws_event(struct wakeup_source *ws, unsigned int msec, bool hard); +extern void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard); #else /* !CONFIG_PM_SLEEP */ @@ -182,9 +182,11 @@ static inline void __pm_relax(struct wakeup_source *ws) {} static inline void pm_relax(struct device *dev) {} -static inline void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) {} +static inline void pm_wakeup_ws_event(struct wakeup_source *ws, + unsigned int msec, bool hard) {} -static inline void pm_wakeup_event(struct device *dev, unsigned int msec) {} +static inline void pm_wakeup_dev_event(struct device *dev, unsigned int msec, + bool hard) {} #endif /* !CONFIG_PM_SLEEP */ @@ -201,4 +203,19 @@ static inline void wakeup_source_trash(struct wakeup_source *ws) wakeup_source_drop(ws); } +static inline void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) +{ + return pm_wakeup_ws_event(ws, msec, false); +} + +static inline void pm_wakeup_event(struct device *dev, unsigned int msec) +{ + return pm_wakeup_dev_event(dev, msec, false); +} + +static inline void pm_wakeup_hard_event(struct device *dev) +{ + return pm_wakeup_dev_event(dev, 0, true); +} + #endif /* _LINUX_PM_WAKEUP_H */ -- cgit v1.2.3 From eed4d47efe9508b855b09754cf6de4325d8a2f0d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 26 Apr 2017 23:23:03 +0200 Subject: ACPI / sleep: Ignore spurious SCI wakeups from suspend-to-idle The ACPI SCI (System Control Interrupt) is set up as a wakeup IRQ during suspend-to-idle transitions and, consequently, any events signaled through it wake up the system from that state. However, on some systems some of the events signaled via the ACPI SCI while suspended to idle should not cause the system to wake up. In fact, quite often they should just be discarded. Arguably, systems should not resume entirely on such events, but in order to decide which events really should cause the system to resume and which are spurious, it is necessary to resume up to the point when ACPI SCIs are actually handled and processed, which is after executing dpm_resume_noirq() in the system resume path. For these reasons, add a loop around freeze_enter() in which the platforms can process events signaled via multiplexed IRQ lines like the ACPI SCI and add suspend-to-idle hooks that can be used for this purpose to struct platform_freeze_ops.
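To sketch the resulting hook layout (a hypothetical platform; only struct platform_freeze_ops and the hook names come from this patch, the foo_* implementations are assumed):

static const struct platform_freeze_ops foo_freeze_ops = {
	.begin   = foo_freeze_begin,
	.prepare = foo_freeze_prepare,
	.wake    = foo_freeze_wake,	/* inspect the shared wakeup IRQ after each s2idle wakeup */
	.sync    = foo_freeze_sync,	/* flush event queues before deciding to re-suspend */
	.restore = foo_freeze_restore,
	.end     = foo_freeze_end,
};

The concrete ACPI implementation of the new hooks is below.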
In the ACPI case, the ->wake hook is used for checking if the SCI has triggered while suspended and deferring the interrupt-induced system wakeup until the events signaled through it are actually processed sufficiently to decide whether or not the system should resume. In turn, the ->sync hook allows all of the relevant event queues to be flushed so as to prevent events from being missed due to race conditions. In addition to that, some ACPI code processing wakeup events needs to be modified to use the "hard" version of wakeup triggers, so that it will cause a system resume to happen on device-induced wakeup events even if the "soft" mechanism to prevent the system from suspending is not enabled (that also helps to catch device-induced wakeup events occurring during suspend transitions in progress). Signed-off-by: Rafael J. Wysocki --- drivers/acpi/battery.c | 2 +- drivers/acpi/button.c | 5 +++-- drivers/acpi/device_pm.c | 3 ++- drivers/acpi/sleep.c | 28 ++++++++++++++++++++++++++++ drivers/base/power/main.c | 5 ----- drivers/base/power/wakeup.c | 18 ++++++++++++------ include/linux/suspend.h | 7 +++++-- kernel/power/process.c | 2 +- kernel/power/suspend.c | 29 +++++++++++++++++++++++++---- 9 files changed, 77 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 4ef1e4624b2b..83ab17e4a795 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -776,7 +776,7 @@ static int acpi_battery_update(struct acpi_battery *battery, bool resume) if ((battery->state & ACPI_BATTERY_STATE_CRITICAL) || (test_bit(ACPI_BATTERY_ALARM_PRESENT, &battery->flags) && (battery->capacity_now <= battery->alarm))) - pm_wakeup_event(&battery->device->dev, 0); + pm_wakeup_hard_event(&battery->device->dev); return result; } diff --git a/drivers/acpi/button.c b/drivers/acpi/button.c index 668137e4a069..b7c2a06963d6 100644 --- a/drivers/acpi/button.c +++ b/drivers/acpi/button.c @@ -216,7 +216,7 @@ static int acpi_lid_notify_state(struct acpi_device *device, int state) } if (state) - pm_wakeup_event(&device->dev, 0); + pm_wakeup_hard_event(&device->dev); ret = blocking_notifier_call_chain(&acpi_lid_notifier, state, device); if (ret == NOTIFY_DONE) @@ -398,7 +398,7 @@ static void acpi_button_notify(struct acpi_device *device, u32 event) } else { int keycode; - pm_wakeup_event(&device->dev, 0); + pm_wakeup_hard_event(&device->dev); if (button->suspended) break; @@ -530,6 +530,7 @@ static int acpi_button_add(struct acpi_device *device) lid_device = device; } + device_init_wakeup(&device->dev, true); printk(KERN_INFO PREFIX "%s [%s]\n", name, acpi_device_bid(device)); return 0; diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index 993fd31394c8..798d5003a039 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "internal.h" @@ -399,7 +400,7 @@ static void acpi_pm_notify_handler(acpi_handle handle, u32 val, void *not_used) mutex_lock(&acpi_pm_notifier_lock); if (adev->wakeup.flags.notifier_present) { - __pm_wakeup_event(adev->wakeup.ws, 0); + pm_wakeup_ws_event(adev->wakeup.ws, 0, true); if (adev->wakeup.context.work.func) queue_pm_work(&adev->wakeup.context.work); } diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index a4327af676fe..e84005d642e6 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -662,14 +662,40 @@ static int acpi_freeze_prepare(void) acpi_os_wait_events_complete(); if (acpi_sci_irq_valid()) 
enable_irq_wake(acpi_sci_irq); + return 0; } +static void acpi_freeze_wake(void) +{ + /* + * If IRQD_WAKEUP_ARMED is not set for the SCI at this point, it means + * that the SCI has triggered while suspended, so cancel the wakeup in + * case it has not been a wakeup event (the GPEs will be checked later). + */ + if (acpi_sci_irq_valid() && + !irqd_is_wakeup_armed(irq_get_irq_data(acpi_sci_irq))) + pm_system_cancel_wakeup(); +} + +static void acpi_freeze_sync(void) +{ + /* + * Process all pending events in case there are any wakeup ones. + * + * The EC driver uses the system workqueue, so that one needs to be + * flushed too. + */ + acpi_os_wait_events_complete(); + flush_scheduled_work(); +} + static void acpi_freeze_restore(void) { acpi_disable_wakeup_devices(ACPI_STATE_S0); if (acpi_sci_irq_valid()) disable_irq_wake(acpi_sci_irq); + acpi_enable_all_runtime_gpes(); } @@ -681,6 +707,8 @@ static void acpi_freeze_end(void) static const struct platform_freeze_ops acpi_freeze_ops = { .begin = acpi_freeze_begin, .prepare = acpi_freeze_prepare, + .wake = acpi_freeze_wake, + .sync = acpi_freeze_sync, .restore = acpi_freeze_restore, .end = acpi_freeze_end, }; diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 9faee1c893e5..e987a6f55d36 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1091,11 +1091,6 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool a if (async_error) goto Complete; - if (pm_wakeup_pending()) { - async_error = -EBUSY; - goto Complete; - } - if (dev->power.syscore || dev->power.direct_complete) goto Complete; diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 84c41c98d6fc..f62082fdd670 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -28,8 +28,8 @@ bool events_check_enabled __read_mostly; /* First wakeup IRQ seen by the kernel in the last cycle. */ unsigned int pm_wakeup_irq __read_mostly; -/* If set and the system is suspending, terminate the suspend. */ -static bool pm_abort_suspend __read_mostly; +/* If greater than 0 and the system is suspending, terminate the suspend. */ +static atomic_t pm_abort_suspend __read_mostly; /* * Combined counters of registered wakeup events and wakeup events in progress. 
@@ -856,20 +856,26 @@ bool pm_wakeup_pending(void) pm_print_active_wakeup_sources(); } - return ret || pm_abort_suspend; + return ret || atomic_read(&pm_abort_suspend) > 0; } void pm_system_wakeup(void) { - pm_abort_suspend = true; + atomic_inc(&pm_abort_suspend); freeze_wake(); } EXPORT_SYMBOL_GPL(pm_system_wakeup); -void pm_wakeup_clear(void) +void pm_system_cancel_wakeup(void) +{ + atomic_dec(&pm_abort_suspend); +} + +void pm_wakeup_clear(bool reset) { - pm_abort_suspend = false; pm_wakeup_irq = 0; + if (reset) + atomic_set(&pm_abort_suspend, 0); } void pm_system_irq_wakeup(unsigned int irq_number) diff --git a/include/linux/suspend.h b/include/linux/suspend.h index d9718378a8be..0b1cf32edfd7 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -189,6 +189,8 @@ struct platform_suspend_ops { struct platform_freeze_ops { int (*begin)(void); int (*prepare)(void); + void (*wake)(void); + void (*sync)(void); void (*restore)(void); void (*end)(void); }; @@ -428,7 +430,8 @@ extern unsigned int pm_wakeup_irq; extern bool pm_wakeup_pending(void); extern void pm_system_wakeup(void); -extern void pm_wakeup_clear(void); +extern void pm_system_cancel_wakeup(void); +extern void pm_wakeup_clear(bool reset); extern void pm_system_irq_wakeup(unsigned int irq_number); extern bool pm_get_wakeup_count(unsigned int *count, bool block); extern bool pm_save_wakeup_count(unsigned int count); @@ -478,7 +481,7 @@ static inline int unregister_pm_notifier(struct notifier_block *nb) static inline bool pm_wakeup_pending(void) { return false; } static inline void pm_system_wakeup(void) {} -static inline void pm_wakeup_clear(void) {} +static inline void pm_wakeup_clear(bool reset) {} static inline void pm_system_irq_wakeup(unsigned int irq_number) {} static inline void lock_system_sleep(void) {} diff --git a/kernel/power/process.c b/kernel/power/process.c index c7209f060eeb..78672d324a6e 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -132,7 +132,7 @@ int freeze_processes(void) if (!pm_freezing) atomic_inc(&system_freezing_cnt); - pm_wakeup_clear(); + pm_wakeup_clear(true); pr_info("Freezing user space processes ... "); pm_freezing = true; error = try_to_freeze_tasks(true); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 15e6baef5c73..c0248c74d6d4 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -72,6 +72,8 @@ static void freeze_begin(void) static void freeze_enter(void) { + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true); + spin_lock_irq(&suspend_freeze_lock); if (pm_wakeup_pending()) goto out; @@ -98,6 +100,27 @@ static void freeze_enter(void) out: suspend_freeze_state = FREEZE_STATE_NONE; spin_unlock_irq(&suspend_freeze_lock); + + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false); +} + +static void s2idle_loop(void) +{ + do { + freeze_enter(); + + if (freeze_ops && freeze_ops->wake) + freeze_ops->wake(); + + dpm_resume_noirq(PMSG_RESUME); + if (freeze_ops && freeze_ops->sync) + freeze_ops->sync(); + + if (pm_wakeup_pending()) + break; + + pm_wakeup_clear(false); + } while (!dpm_suspend_noirq(PMSG_SUSPEND)); } void freeze_wake(void) @@ -371,10 +394,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) * all the devices are suspended. 
*/ if (state == PM_SUSPEND_FREEZE) { - trace_suspend_resume(TPS("machine_suspend"), state, true); - freeze_enter(); - trace_suspend_resume(TPS("machine_suspend"), state, false); - goto Platform_wake; + s2idle_loop(); + goto Platform_early_resume; } error = disable_nonboot_cpus(); -- cgit v1.2.3 From 71afe470e20db133b30730cfa856e5d6854312e9 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 13 Apr 2017 09:06:20 +0200 Subject: KVM: arm64: vgic-its: Introduce migration ABI infrastructure We plan to support different migration ABIs, i.e. characterizing the ITS table layout format in guest RAM. For example, a new ABI will be needed if vLPIs get supported for the nested use case. So let's introduce an array of supported ABIs (at the moment a single ABI is supported though). The following characteristics are foreseen to vary with the ABI: size of table entries, save/restore operations, the way ABI settings are applied. By default, MAX_ABI_REV is applied on ITS creation. In subsequent patches we will introduce a way for userspace to change the ABI in use. The entry sizes are now set according to the ABI version instead of being hardcoded. Signed-off-by: Eric Auger Reviewed-by: Christoffer Dall --- include/kvm/arm_vgic.h | 3 ++ include/linux/irqchip/arm-gic-v3.h | 5 ++ virt/kvm/arm/vgic/vgic-its.c | 93 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 97 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 26ed4fb896bb..fabcc649e2ce 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -162,6 +162,9 @@ struct vgic_its { u32 creadr; u32 cwriter; + /* migration ABI revision in use */ + u32 abi_rev; + /* Protects the device and collection lists */ struct mutex its_lock; struct list_head device_list; diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 97cbca19430d..81ebe437ccc3 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -132,6 +132,9 @@ #define GIC_BASER_SHAREABILITY(reg, type) \ (GIC_BASER_##type << reg##_SHAREABILITY_SHIFT) +/* encode a size field of width @w containing @n - 1 units */ +#define GIC_ENCODE_SZ(n, w) (((unsigned long)(n) - 1) & GENMASK_ULL(((w) - 1), 0)) + #define GICR_PROPBASER_SHAREABILITY_SHIFT (10) #define GICR_PROPBASER_INNER_CACHEABILITY_SHIFT (7) #define GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT (56) @@ -232,6 +235,7 @@ #define GITS_CTLR_QUIESCENT (1U << 31) #define GITS_TYPER_PLPIS (1UL << 0) +#define GITS_TYPER_ITT_ENTRY_SIZE_SHIFT 4 #define GITS_TYPER_IDBITS_SHIFT 8 #define GITS_TYPER_DEVBITS_SHIFT 13 #define GITS_TYPER_DEVBITS(r) ((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1) @@ -290,6 +294,7 @@ #define GITS_BASER_TYPE(r) (((r) >> GITS_BASER_TYPE_SHIFT) & 7) #define GITS_BASER_ENTRY_SIZE_SHIFT (48) #define GITS_BASER_ENTRY_SIZE(r) ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0x1f) + 1) +#define GITS_BASER_ENTRY_SIZE_MASK GENMASK_ULL(52, 48) #define GITS_BASER_SHAREABILITY_SHIFT (10) #define GITS_BASER_InnerShareable \ GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index bf3ff0931572..4f6ea46c496c 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -33,6 +33,10 @@ #include "vgic.h" #include "vgic-mmio.h" +static int vgic_its_save_tables_v0(struct vgic_its *its); +static int vgic_its_restore_tables_v0(struct vgic_its *its); +static int vgic_its_commit_v0(struct vgic_its *its); + /* * Creates a new
(reference to a) struct vgic_irq for a given LPI. * If this LPI is already mapped on another ITS, we increase its refcount @@ -123,6 +127,50 @@ struct its_ite { u32 event_id; }; +/** + * struct vgic_its_abi - ITS abi ops and settings + * @cte_esz: collection table entry size + * @dte_esz: device table entry size + * @ite_esz: interrupt translation table entry size + * @save tables: save the ITS tables into guest RAM + * @restore_tables: restore the ITS internal structs from tables + * stored in guest RAM + * @commit: initialize the registers which expose the ABI settings, + * especially the entry sizes + */ +struct vgic_its_abi { + int cte_esz; + int dte_esz; + int ite_esz; + int (*save_tables)(struct vgic_its *its); + int (*restore_tables)(struct vgic_its *its); + int (*commit)(struct vgic_its *its); +}; + +static const struct vgic_its_abi its_table_abi_versions[] = { + [0] = {.cte_esz = 8, .dte_esz = 8, .ite_esz = 8, + .save_tables = vgic_its_save_tables_v0, + .restore_tables = vgic_its_restore_tables_v0, + .commit = vgic_its_commit_v0, + }, +}; + +#define NR_ITS_ABIS ARRAY_SIZE(its_table_abi_versions) + +inline const struct vgic_its_abi *vgic_its_get_abi(struct vgic_its *its) +{ + return &its_table_abi_versions[its->abi_rev]; +} + +int vgic_its_set_abi(struct vgic_its *its, int rev) +{ + const struct vgic_its_abi *abi; + + its->abi_rev = rev; + abi = vgic_its_get_abi(its); + return abi->commit(its); +} + /* * Find and returns a device in the device table for an ITS. * Must be called with the its_lock mutex held. @@ -364,6 +412,7 @@ static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len) { + const struct vgic_its_abi *abi = vgic_its_get_abi(its); u64 reg = GITS_TYPER_PLPIS; /* @@ -376,6 +425,7 @@ static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm, */ reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT; reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT; + reg |= GIC_ENCODE_SZ(abi->ite_esz, 4) << GITS_TYPER_ITT_ENTRY_SIZE_SHIFT; return extract_bytes(reg, addr & 7, len); } @@ -1268,6 +1318,7 @@ static void vgic_mmio_write_its_baser(struct kvm *kvm, gpa_t addr, unsigned int len, unsigned long val) { + const struct vgic_its_abi *abi = vgic_its_get_abi(its); u64 entry_size, device_type; u64 reg, *regptr, clearbits = 0; @@ -1278,12 +1329,12 @@ static void vgic_mmio_write_its_baser(struct kvm *kvm, switch (BASER_INDEX(addr)) { case 0: regptr = &its->baser_device_table; - entry_size = 8; + entry_size = abi->dte_esz; device_type = GITS_BASER_TYPE_DEVICE; break; case 1: regptr = &its->baser_coll_table; - entry_size = 8; + entry_size = abi->cte_esz; device_type = GITS_BASER_TYPE_COLLECTION; clearbits = GITS_BASER_INDIRECT; break; @@ -1425,7 +1476,6 @@ static int vgic_register_its_iodev(struct kvm *kvm, struct vgic_its *its) (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) | \ GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner) | \ GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) | \ - ((8ULL - 1) << GITS_BASER_ENTRY_SIZE_SHIFT) | \ GITS_BASER_PAGE_SIZE_64K) #define INITIAL_PROPBASER_VALUE \ @@ -1465,7 +1515,7 @@ static int vgic_its_create(struct kvm_device *dev, u32 type) dev->private = its; - return 0; + return vgic_its_set_abi(its, NR_ITS_ABIS - 1); } static void vgic_its_destroy(struct kvm_device *kvm_dev) @@ -1592,6 +1642,41 @@ out: return ret; } +/** + * vgic_its_save_tables_v0 - Save the ITS tables into guest ARM + * according to v0 ABI + */ +static int vgic_its_save_tables_v0(struct vgic_its *its) +{ + return -ENXIO; +} + +/** + * 
vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM + * to internal data structs according to V0 ABI + * + */ +static int vgic_its_restore_tables_v0(struct vgic_its *its) +{ + return -ENXIO; +} + +static int vgic_its_commit_v0(struct vgic_its *its) +{ + const struct vgic_its_abi *abi; + + abi = vgic_its_get_abi(its); + its->baser_coll_table &= ~GITS_BASER_ENTRY_SIZE_MASK; + its->baser_device_table &= ~GITS_BASER_ENTRY_SIZE_MASK; + + its->baser_coll_table |= (GIC_ENCODE_SZ(abi->cte_esz, 5) + << GITS_BASER_ENTRY_SIZE_SHIFT); + + its->baser_device_table |= (GIC_ENCODE_SZ(abi->dte_esz, 5) + << GITS_BASER_ENTRY_SIZE_SHIFT); + return 0; +} + static int vgic_its_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { -- cgit v1.2.3 From ab01c6bdacc43c41c6b326889f4358f5afc38bf9 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 23 Mar 2017 15:14:00 +0100 Subject: KVM: arm64: vgic-its: Implement vgic_mmio_uaccess_write_its_iidr The GITS_IIDR revision field is used to encode the migration ABI revision. So we need to restore it to check the table layout is readable by the destination. By writing the IIDR, userspace thus forces the ABI revision to be used and this must be less than or equal to the max revision KVM supports. Signed-off-by: Eric Auger Reviewed-by: Christoffer Dall --- include/linux/irqchip/arm-gic-v3.h | 5 +++++ virt/kvm/arm/vgic/vgic-its.c | 23 ++++++++++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 81ebe437ccc3..2eaea308f003 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -242,6 +242,11 @@ #define GITS_TYPER_PTA (1UL << 19) #define GITS_TYPER_HWCOLLCNT_SHIFT 24 +#define GITS_IIDR_REV_SHIFT 12 +#define GITS_IIDR_REV_MASK (0xf << GITS_IIDR_REV_SHIFT) +#define GITS_IIDR_REV(r) (((r) >> GITS_IIDR_REV_SHIFT) & 0xf) +#define GITS_IIDR_PRODUCTID_SHIFT 24 + #define GITS_CBASER_VALID (1ULL << 63) #define GITS_CBASER_SHAREABILITY_SHIFT (10) #define GITS_CBASER_INNER_CACHEABILITY_SHIFT (59) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 4f6ea46c496c..9338efb79a54 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -434,7 +434,23 @@ static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm, struct vgic_its *its, gpa_t addr, unsigned int len) { - return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0); + u32 val; + + val = (its->abi_rev << GITS_IIDR_REV_SHIFT) & GITS_IIDR_REV_MASK; + val |= (PRODUCT_ID_KVM << GITS_IIDR_PRODUCTID_SHIFT) | IMPLEMENTER_ARM; + return val; +} + +static int vgic_mmio_uaccess_write_its_iidr(struct kvm *kvm, + struct vgic_its *its, + gpa_t addr, unsigned int len, + unsigned long val) +{ + u32 rev = GITS_IIDR_REV(val); + + if (rev >= NR_ITS_ABIS) + return -EINVAL; + return vgic_its_set_abi(its, rev); } static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm, @@ -1415,8 +1431,9 @@ static struct vgic_register_region its_registers[] = { REGISTER_ITS_DESC(GITS_CTLR, vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4, VGIC_ACCESS_32bit), - REGISTER_ITS_DESC(GITS_IIDR, - vgic_mmio_read_its_iidr, its_mmio_write_wi, 4, + REGISTER_ITS_DESC_UACCESS(GITS_IIDR, + vgic_mmio_read_its_iidr, its_mmio_write_wi, + vgic_mmio_uaccess_write_its_iidr, 4, VGIC_ACCESS_32bit), REGISTER_ITS_DESC(GITS_TYPER, vgic_mmio_read_its_typer, its_mmio_write_wi, 8, -- cgit v1.2.3 From 0d44cdb631ef53ea75be056886cf0541311e48df Mon Sep 17 
00:00:00 2001 From: Eric Auger Date: Thu, 22 Dec 2016 18:14:14 +0100 Subject: KVM: arm64: vgic-its: Interpret MAPD Size field and check related errors Up to now, the MAPD's ITT size field has been ignored. It encodes the number of eventid bits minus 1. It should be used to check the eventid when a MAPTI command is issued on a device. Let's store the number of eventid bits in the its_device and do the check on MAPTI. Also make sure the ITT size field does not exceed the GITS_TYPER IDBITS field. Signed-off-by: Eric Auger Reviewed-by: Christoffer Dall Reviewed-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v3.h | 2 ++ virt/kvm/arm/vgic/vgic-its.c | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 2eaea308f003..be8bad00c419 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -347,9 +347,11 @@ #define E_ITS_INT_UNMAPPED_INTERRUPT 0x010307 #define E_ITS_CLEAR_UNMAPPED_INTERRUPT 0x010507 #define E_ITS_MAPD_DEVICE_OOR 0x010801 +#define E_ITS_MAPD_ITTSIZE_OOR 0x010802 #define E_ITS_MAPC_PROCNUM_OOR 0x010902 #define E_ITS_MAPC_COLLECTION_OOR 0x010903 #define E_ITS_MAPTI_UNMAPPED_DEVICE 0x010a04 +#define E_ITS_MAPTI_ID_OOR 0x010a05 #define E_ITS_MAPTI_PHYSICALID_OOR 0x010a06 #define E_ITS_INV_UNMAPPED_INTERRUPT 0x010c07 #define E_ITS_INVALL_UNMAPPED_COLLECTION 0x010d09 diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 9338efb79a54..031f6abd50fd 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -103,6 +103,7 @@ struct its_device { /* the head for the list of ITTEs */ struct list_head itt_head; + u32 num_eventid_bits; u32 device_id; }; @@ -224,6 +225,8 @@ static struct its_ite *find_ite(struct vgic_its *its, u32 device_id, #define GIC_LPI_OFFSET 8192 +#define VITS_TYPER_IDBITS 16 + /* * Finds and returns a collection in the ITS collection table. * Must be called with the its_lock mutex held. @@ -424,7 +427,7 @@ static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm, * DevBits low - as least for the time being.
*/ reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT; - reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT; + reg |= GIC_ENCODE_SZ(VITS_TYPER_IDBITS, 5) << GITS_TYPER_IDBITS_SHIFT; reg |= GIC_ENCODE_SZ(abi->ite_esz, 4) << GITS_TYPER_ITT_ENTRY_SIZE_SHIFT; return extract_bytes(reg, addr & 7, len); @@ -595,6 +598,7 @@ static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size) #define its_cmd_get_command(cmd) its_cmd_mask_field(cmd, 0, 0, 8) #define its_cmd_get_deviceid(cmd) its_cmd_mask_field(cmd, 0, 32, 32) +#define its_cmd_get_size(cmd) (its_cmd_mask_field(cmd, 1, 0, 5) + 1) #define its_cmd_get_id(cmd) its_cmd_mask_field(cmd, 1, 0, 32) #define its_cmd_get_physical_id(cmd) its_cmd_mask_field(cmd, 1, 32, 32) #define its_cmd_get_collection(cmd) its_cmd_mask_field(cmd, 2, 0, 16) @@ -785,6 +789,9 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, if (!device) return E_ITS_MAPTI_UNMAPPED_DEVICE; + if (event_id >= BIT_ULL(device->num_eventid_bits)) + return E_ITS_MAPTI_ID_OOR; + if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI) lpi_nr = its_cmd_get_physical_id(its_cmd); else @@ -865,11 +872,15 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, { u32 device_id = its_cmd_get_deviceid(its_cmd); bool valid = its_cmd_get_validbit(its_cmd); + u8 num_eventid_bits = its_cmd_get_size(its_cmd); struct its_device *device; if (!vgic_its_check_id(its, its->baser_device_table, device_id)) return E_ITS_MAPD_DEVICE_OOR; + if (valid && num_eventid_bits > VITS_TYPER_IDBITS) + return E_ITS_MAPD_ITTSIZE_OOR; + device = find_its_device(its, device_id); /* @@ -892,6 +903,8 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, return -ENOMEM; device->device_id = device_id; + device->num_eventid_bits = num_eventid_bits; + INIT_LIST_HEAD(&device->itt_head); list_add_tail(&device->dev_list, &its->device_list); -- cgit v1.2.3 From 44de9d683847ba6dbac290bb8c9f1b773cbda746 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 4 May 2017 11:19:52 +0200 Subject: KVM: arm64: vgic-v3: vgic_v3_lpi_sync_pending_status This new helper synchronizes the irq pending_latch with the LPI pending bit status found in the rdist pending table. As the status is consumed, we reset the bit in the pending table. As we need the PENDBASER_ADDRESS() in vgic-v3, let's move its definition into the irqchip header. We restore the full length of the field, i.e. [51:16]. Same for PROPBASER_ADDRESS with full field length of [51:12].
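As a worked example of the lookup this helper performs (the INTID value is assumed for illustration): for LPI INTID 8197, byte_offset = 8197 / 8 = 1024 and bit_nr = 8197 % 8 = 5, so the helper reads a single byte at GICR_PENDBASER_ADDRESS(pendbaser) + 1024, copies bit 5 into pending_latch and, if the bit was set, clears it in guest memory to consume the status.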
Signed-off-by: Eric Auger Reviewed-by: Marc Zyngier Reviewed-by: Christoffer Dall --- include/linux/irqchip/arm-gic-v3.h | 2 ++ virt/kvm/arm/vgic/vgic-its.c | 6 ++---- virt/kvm/arm/vgic/vgic-v3.c | 44 ++++++++++++++++++++++++++++++++++++++ virt/kvm/arm/vgic/vgic.h | 1 + 4 files changed, 49 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index be8bad00c419..fffb91202bc9 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -159,6 +159,8 @@ #define GICR_PROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWb) #define GICR_PROPBASER_IDBITS_MASK (0x1f) +#define GICR_PROPBASER_ADDRESS(x) ((x) & GENMASK_ULL(51, 12)) +#define GICR_PENDBASER_ADDRESS(x) ((x) & GENMASK_ULL(51, 16)) #define GICR_PENDBASER_SHAREABILITY_SHIFT (10) #define GICR_PENDBASER_INNER_CACHEABILITY_SHIFT (7) diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index bd1362e9c214..3601790d841e 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -221,8 +221,6 @@ static struct its_ite *find_ite(struct vgic_its *its, u32 device_id, */ #define BASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 16)) #define CBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12)) -#define PENDBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 16)) -#define PROPBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12)) #define GIC_LPI_OFFSET 8192 @@ -257,7 +255,7 @@ static struct its_collection *find_collection(struct vgic_its *its, int coll_id) static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, struct kvm_vcpu *filter_vcpu) { - u64 propbase = PROPBASER_ADDRESS(kvm->arch.vgic.propbaser); + u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser); u8 prop; int ret; @@ -369,7 +367,7 @@ static u32 max_lpis_propbaser(u64 propbaser) */ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) { - gpa_t pendbase = PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); + gpa_t pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); struct vgic_irq *irq; int last_byte_offset = -1; int ret = 0; diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index df1503650300..9ae8adddff69 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -234,6 +234,50 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu) vgic_v3->vgic_hcr = ICH_HCR_EN; } +int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) +{ + struct kvm_vcpu *vcpu; + int byte_offset, bit_nr; + gpa_t pendbase, ptr; + bool status; + u8 val; + int ret; + +retry: + vcpu = irq->target_vcpu; + if (!vcpu) + return 0; + + pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser); + + byte_offset = irq->intid / BITS_PER_BYTE; + bit_nr = irq->intid % BITS_PER_BYTE; + ptr = pendbase + byte_offset; + + ret = kvm_read_guest(kvm, ptr, &val, 1); + if (ret) + return ret; + + status = val & (1 << bit_nr); + + spin_lock(&irq->irq_lock); + if (irq->target_vcpu != vcpu) { + spin_unlock(&irq->irq_lock); + goto retry; + } + irq->pending_latch = status; + vgic_queue_irq_unlock(vcpu->kvm, irq); + + if (status) { + /* clear consumed data */ + val &= ~(1 << bit_nr); + ret = kvm_write_guest(kvm, ptr, &val, 1); + if (ret) + return ret; + } + return 0; +} + /* check for overlapping regions and for regions crossing the end of memory */ static bool vgic_v3_check_base(struct kvm *kvm) { diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index a48f33b04b2d..79768c801863 100644 --- 
a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -149,6 +149,7 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_enable(struct kvm_vcpu *vcpu); int vgic_v3_probe(const struct gic_kvm_info *info); int vgic_v3_map_resources(struct kvm *kvm); +int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq); int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address); void vgic_v3_load(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From ef51042472f55b325fd7f2b26a2e29fd89757234 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 8 May 2017 10:55:27 -0700 Subject: block, dax: move "select DAX" from BLOCK to FS_DAX For configurations that do not enable DAX filesystems or drivers, do not require the DAX core to be built. Given that the 'direct_access' method has been removed from 'block_device_operations', we can also go ahead and move the block-related dax helper functions from fs/block_dev.c to drivers/dax/super.c. This keeps dax details out of the block layer and lets the DAX core be built as a module in the FS_DAX=n case. Filesystems need to include dax.h to call bdev_dax_supported(). Cc: linux-xfs@vger.kernel.org Cc: Jens Axboe Cc: "Theodore Ts'o" Cc: Matthew Wilcox Cc: Alexander Viro Cc: "Darrick J. Wong" Cc: Ross Zwisler Reviewed-by: Jan Kara Reported-by: Geert Uytterhoeven Signed-off-by: Dan Williams --- block/Kconfig | 1 - drivers/dax/super.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/Kconfig | 1 + fs/block_dev.c | 66 ----------------------------------------------- fs/ext2/super.c | 1 + fs/ext4/super.c | 1 + fs/xfs/xfs_super.c | 1 + include/linux/blkdev.h | 2 -- include/linux/dax.h | 30 ++++++++++++++++++++-- 9 files changed, 102 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/block/Kconfig b/block/Kconfig index 93da7fc3f254..e9f780f815f5 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -6,7 +6,6 @@ menuconfig BLOCK default y select SBITMAP select SRCU - select DAX help Provide block layer support for the kernel. diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 465dcd7317d5..e8998d15f3cd 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +48,75 @@ void dax_read_unlock(int id) } EXPORT_SYMBOL_GPL(dax_read_unlock); +int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, + pgoff_t *pgoff) +{ + phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512; + + if (pgoff) + *pgoff = PHYS_PFN(phys_off); + if (phys_off % PAGE_SIZE || size % PAGE_SIZE) + return -EINVAL; + return 0; +} +EXPORT_SYMBOL(bdev_dax_pgoff); + +/** + * __bdev_dax_supported() - Check if the device supports dax for filesystem + * @sb: The superblock of the device + * @blocksize: The block size of the device + * + * This is a library function for filesystems to check if the block device + * can be mounted with dax option. + * + * Return: negative errno if unsupported, 0 if supported.
+ */ +int __bdev_dax_supported(struct super_block *sb, int blocksize) +{ + struct block_device *bdev = sb->s_bdev; + struct dax_device *dax_dev; + pgoff_t pgoff; + int err, id; + void *kaddr; + pfn_t pfn; + long len; + + if (blocksize != PAGE_SIZE) { + pr_err("VFS (%s): error: unsupported blocksize for dax\n", + sb->s_id); + return -EINVAL; + } + + err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff); + if (err) { + pr_err("VFS (%s): error: unaligned partition for dax\n", + sb->s_id); + return err; + } + + dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + if (!dax_dev) { + pr_err("VFS (%s): error: device does not support dax\n", + sb->s_id); + return -EOPNOTSUPP; + } + + id = dax_read_lock(); + len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn); + dax_read_unlock(id); + + put_dax(dax_dev); + + if (len < 1) { + pr_err("VFS (%s): error: dax access failed (%ld)", + sb->s_id, len); + return len < 0 ? len : -EIO; + } + + return 0; +} +EXPORT_SYMBOL_GPL(__bdev_dax_supported); + /** * struct dax_device - anchor object for dax services * @inode: core vfs diff --git a/fs/Kconfig b/fs/Kconfig index 83eab52fb3f6..b0e42b6a96b9 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -39,6 +39,7 @@ config FS_DAX depends on MMU depends on !(ARM || MIPS || SPARC) select FS_IOMAP + select DAX help Direct Access (DAX) can be used on memory-backed block devices. If the block device supports DAX and the filesystem supports DAX, diff --git a/fs/block_dev.c b/fs/block_dev.c index 666367e13711..3096ecd48304 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -718,72 +718,6 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(bdev_write_page); -int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, - pgoff_t *pgoff) -{ - phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512; - - if (pgoff) - *pgoff = PHYS_PFN(phys_off); - if (phys_off % PAGE_SIZE || size % PAGE_SIZE) - return -EINVAL; - return 0; -} -EXPORT_SYMBOL(bdev_dax_pgoff); - -/** - * bdev_dax_supported() - Check if the device supports dax for filesystem - * @sb: The superblock of the device - * @blocksize: The block size of the device - * - * This is a library function for filesystems to check if the block device - * can be mounted with dax option. - * - * Return: negative errno if unsupported, 0 if supported. - */ -int bdev_dax_supported(struct super_block *sb, int blocksize) -{ - struct block_device *bdev = sb->s_bdev; - struct dax_device *dax_dev; - pgoff_t pgoff; - int err, id; - void *kaddr; - pfn_t pfn; - long len; - - if (blocksize != PAGE_SIZE) { - vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax"); - return -EINVAL; - } - - err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff); - if (err) { - vfs_msg(sb, KERN_ERR, "error: unaligned partition for dax"); - return err; - } - - dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); - if (!dax_dev) { - vfs_msg(sb, KERN_ERR, "error: device does not support dax"); - return -EOPNOTSUPP; - } - - id = dax_read_lock(); - len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn); - dax_read_unlock(id); - - put_dax(dax_dev); - - if (len < 1) { - vfs_msg(sb, KERN_ERR, - "error: dax access failed (%ld)", len); - return len < 0 ? 
len : -EIO; - } - - return 0; -} -EXPORT_SYMBOL_GPL(bdev_dax_supported); - /* * pseudo-fs */ diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 9e25a71fe1a2..d07773b81da9 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "ext2.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a9448db1cf7e..bf6bb8997124 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 685c042a120f..f5c58d6dcafb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -52,6 +52,7 @@ #include "xfs_reflink.h" #include +#include #include #include #include diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 848f87eb1905..e4d9899755a7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1940,8 +1940,6 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); -extern int bdev_dax_supported(struct super_block *, int); -int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/dax.h b/include/linux/dax.h index d3158e74a59e..7fdf1d710042 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -18,12 +18,38 @@ struct dax_operations { void **, pfn_t *); }; +int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); +#if IS_ENABLED(CONFIG_FS_DAX) +int __bdev_dax_supported(struct super_block *sb, int blocksize); +static inline int bdev_dax_supported(struct super_block *sb, int blocksize) +{ + return __bdev_dax_supported(sb, blocksize); +} +#else +static inline int bdev_dax_supported(struct super_block *sb, int blocksize) +{ + return -EOPNOTSUPP; +} +#endif + +#if IS_ENABLED(CONFIG_DAX) +struct dax_device *dax_get_by_host(const char *host); +void put_dax(struct dax_device *dax_dev); +#else +static inline struct dax_device *dax_get_by_host(const char *host) +{ + return NULL; +} + +static inline void put_dax(struct dax_device *dax_dev) +{ +} +#endif + int dax_read_lock(void); void dax_read_unlock(int id); -struct dax_device *dax_get_by_host(const char *host); struct dax_device *alloc_dax(void *private, const char *host, const struct dax_operations *ops); -void put_dax(struct dax_device *dax_dev); bool dax_alive(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); void *dax_get_private(struct dax_device *dax_dev); -- cgit v1.2.3 From e092693443b995c8e3a565a73b5fdb05f1260f9b Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 8 May 2017 18:02:24 -0400 Subject: NFS append COMMIT after synchronous COPY Instead of messing with the commit path, which has been causing issues, add a COMMIT op after the COPY and ask for stable copies in the first place. It saves a round trip, since after the COPY, the client sends a COMMIT anyway.
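For reference, after this change the tail of the compound built by nfs4_xdr_enc_copy() (visible in the hunk below) is SAVEFH ; PUTFH(dst) ; COPY ; COMMIT(dst_pos, count), so the copied data is committed within the same RPC instead of in a separate COMMIT round trip.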
Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 1 - fs/nfs/nfs42proc.c | 21 +++++++++++++++------ fs/nfs/nfs42xdr.c | 22 ++++++++++++++++++++-- fs/nfs/write.c | 30 ------------------------------ include/linux/nfs_xdr.h | 1 + 5 files changed, 36 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 31b26cf1b476..e9b4c3320e37 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -495,7 +495,6 @@ void nfs_mark_request_commit(struct nfs_page *req, u32 ds_commit_idx); int nfs_write_need_commit(struct nfs_pgio_header *); void nfs_writeback_update_inode(struct nfs_pgio_header *hdr); -int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf); int nfs_generic_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 87f5b7b971ca..929d09a5310a 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -167,23 +167,29 @@ static ssize_t _nfs42_proc_copy(struct file *src, if (status) return status; + res->commit_res.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_NOFS); + if (!res->commit_res.verf) + return -ENOMEM; status = nfs4_call_sync(server->client, server, &msg, &args->seq_args, &res->seq_res, 0); if (status == -ENOTSUPP) server->caps &= ~NFS_CAP_COPY; if (status) - return status; + goto out; - if (res->write_res.verifier.committed != NFS_FILE_SYNC) { - status = nfs_commit_file(dst, &res->write_res.verifier.verifier); - if (status) - return status; + if (!nfs_write_verifier_cmp(&res->write_res.verifier.verifier, + &res->commit_res.verf->verifier)) { + status = -EAGAIN; + goto out; } truncate_pagecache_range(dst_inode, pos_dst, pos_dst + res->write_res.count); - return res->write_res.count; + status = res->write_res.count; +out: + kfree(res->commit_res.verf); + return status; } ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, @@ -240,6 +246,9 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, if (err == -ENOTSUPP) { err = -EOPNOTSUPP; break; + } if (err == -EAGAIN) { + dst_exception.retry = 1; + continue; } err2 = nfs4_handle_exception(server, err, &src_exception); diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 6c7296454bbc..528362f69cc1 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -66,12 +66,14 @@ encode_putfh_maxsz + \ encode_savefh_maxsz + \ encode_putfh_maxsz + \ - encode_copy_maxsz) + encode_copy_maxsz + \ + encode_commit_maxsz) #define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ decode_savefh_maxsz + \ decode_putfh_maxsz + \ - decode_copy_maxsz) + decode_copy_maxsz + \ + decode_commit_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_deallocate_maxsz + \ @@ -222,6 +224,18 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req, encode_nops(&hdr); } +static void encode_copy_commit(struct xdr_stream *xdr, + struct nfs42_copy_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr); + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->dst_pos); + *p = cpu_to_be32(args->count); +} + /* * Encode COPY request */ @@ -239,6 +253,7 @@ static void nfs4_xdr_enc_copy(struct rpc_rqst *req, encode_savefh(xdr, &hdr); encode_putfh(xdr, args->dst_fh, &hdr); encode_copy(xdr, args, &hdr); + encode_copy_commit(xdr, args, &hdr); encode_nops(&hdr); } @@ -481,6 
+496,9 @@ static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp, if (status) goto out; status = decode_copy(xdr, res); + if (status) + goto out; + status = decode_commit(xdr, &res->commit_res); out: return status; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 59e21cc0a266..85bfa416fcc8 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1742,36 +1742,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, data->mds_ops, how, 0); } -int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf) -{ - struct inode *inode = file_inode(file); - struct nfs_open_context *open; - struct nfs_commit_info cinfo; - struct nfs_page *req; - int ret; - - open = get_nfs_open_context(nfs_file_open_context(file)); - req = nfs_create_request(open, NULL, NULL, 0, i_size_read(inode)); - if (IS_ERR(req)) { - ret = PTR_ERR(req); - goto out_put; - } - - nfs_init_cinfo_from_inode(&cinfo, inode); - - memcpy(&req->wb_verf, verf, sizeof(struct nfs_write_verifier)); - nfs_request_add_commit_list(req, &cinfo); - ret = nfs_commit_inode(inode, FLUSH_SYNC); - if (ret > 0) - ret = 0; - - nfs_free_request(req); -out_put: - put_nfs_open_context(open); - return ret; -} -EXPORT_SYMBOL_GPL(nfs_commit_file); - /* * COMMIT call returned */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 677c6b91dfcd..b28c83475ee8 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1383,6 +1383,7 @@ struct nfs42_copy_res { struct nfs42_write_res write_res; bool consecutive; bool synchronous; + struct nfs_commitres commit_res; }; struct nfs42_seek_args { -- cgit v1.2.3 From 02aa0cdd72483c6dd436ed24d1000f86e0038d28 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 8 May 2017 15:54:40 -0700 Subject: mm, page_alloc: count movable pages when stealing from pageblock When stealing pages from a pageblock of a different migratetype, we count how many free pages were stolen, and change the pageblock's migratetype if more than half of the pageblock was free. This might be too conservative, as there might be other pages that are not free, but were allocated with the same migratetype as our allocation requested. While we cannot determine the migratetype of allocated pages precisely (at least without the page_owner functionality enabled), we can count pages that compaction would try to isolate for migration - those are either on LRU or __PageMovable(). The rest can be assumed to be MIGRATE_RECLAIMABLE or MIGRATE_UNMOVABLE, which we cannot easily distinguish. This counting can be done as part of free page stealing with little additional overhead. The page stealing code is changed so that it considers free pages plus pages of the "good" migratetype for the decision whether to change the pageblock's migratetype. The result should be a more accurate migratetype of pageblocks with respect to the actual pages in the pageblocks, when stealing from semi-occupied pageblocks. This should help the efficiency of page grouping by mobility. In testing based on a 4.9 kernel with stress-highalloc from mmtests configured for order-4 GFP_KERNEL allocations, this patch has reduced the number of unmovable allocations falling back to movable pageblocks by 47%. The number of movable allocations falling back to other pageblocks is increased by 55%, but these events don't cause permanent fragmentation, so the tradeoff should be positive. Later patches also offset the movable fallback increase to some extent.
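A worked example of the new decision rule (all counts assumed): with pageblock_nr_pages = 512, an UNMOVABLE request stealing from a MOVABLE pageblock that has free_pages = 200 and movable_pages = 100 gets alike_pages = 512 - (200 + 100) = 212; since free_pages + alike_pages = 412 >= 256, at least half of the pageblock is free or compatible, so the whole pageblock is converted to UNMOVABLE.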
[akpm@linux-foundation.org: merge fix] Link: http://lkml.kernel.org/r/20170307131545.28577-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-isolation.h | 5 +-- mm/page_alloc.c | 74 +++++++++++++++++++++++++++++++++--------- mm/page_isolation.c | 5 +-- 3 files changed, 63 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 047d64706f2a..d4cd2014fa6f 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -33,10 +33,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, bool skip_hwpoisoned_pages); void set_pageblock_migratetype(struct page *page, int migratetype); int move_freepages_block(struct zone *zone, struct page *page, - int migratetype); -int move_freepages(struct zone *zone, - struct page *start_page, struct page *end_page, - int migratetype); + int migratetype, int *num_movable); /* * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2f1118b4dda4..d90792addeb9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1832,9 +1832,9 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, * Note that start_page and end_pages are not aligned on a pageblock * boundary. If alignment is required, use move_freepages_block() */ -int move_freepages(struct zone *zone, +static int move_freepages(struct zone *zone, struct page *start_page, struct page *end_page, - int migratetype) + int migratetype, int *num_movable) { struct page *page; unsigned int order; @@ -1851,6 +1851,9 @@ int move_freepages(struct zone *zone, VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); #endif + if (num_movable) + *num_movable = 0; + for (page = start_page; page <= end_page;) { if (!pfn_valid_within(page_to_pfn(page))) { page++; @@ -1861,6 +1864,15 @@ int move_freepages(struct zone *zone, VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); if (!PageBuddy(page)) { + /* + * We assume that pages that could be isolated for + * migration are movable. But we don't actually try + * isolating, as that would be expensive. + */ + if (num_movable && + (PageLRU(page) || __PageMovable(page))) + (*num_movable)++; + page++; continue; } @@ -1876,7 +1888,7 @@ int move_freepages(struct zone *zone, } int move_freepages_block(struct zone *zone, struct page *page, - int migratetype) + int migratetype, int *num_movable) { unsigned long start_pfn, end_pfn; struct page *start_page, *end_page; @@ -1893,7 +1905,8 @@ int move_freepages_block(struct zone *zone, struct page *page, if (!zone_spans_pfn(zone, end_pfn)) return 0; - return move_freepages(zone, start_page, end_page, migratetype); + return move_freepages(zone, start_page, end_page, migratetype, + num_movable); } static void change_pageblock_range(struct page *pageblock_page, @@ -1943,22 +1956,26 @@ static bool can_steal_fallback(unsigned int order, int start_mt) /* * This function implements actual steal behaviour. If order is large enough, * we can steal whole pageblock. If not, we first move freepages in this - * pageblock and check whether half of pages are moved or not. If half of - * pages are moved, we can change migratetype of pageblock and permanently - * use it's pages as requested migratetype in the future. 
+ * pageblock to our migratetype and determine how many already-allocated pages + * are there in the pageblock with a compatible migratetype. If at least half + * of pages are free or compatible, we can change migratetype of the pageblock + * itself, so pages freed in the future will be put on the correct free list. */ static void steal_suitable_fallback(struct zone *zone, struct page *page, int start_type, bool whole_block) { unsigned int current_order = page_order(page); struct free_area *area; - int pages; + int free_pages, movable_pages, alike_pages; + int old_block_type; + + old_block_type = get_pageblock_migratetype(page); /* * This can happen due to races and we want to prevent broken * highatomic accounting. */ - if (is_migrate_highatomic_page(page)) + if (is_migrate_highatomic(old_block_type)) goto single_page; /* Take ownership for orders >= pageblock_order */ @@ -1971,13 +1988,39 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, if (!whole_block) goto single_page; - pages = move_freepages_block(zone, page, start_type); + free_pages = move_freepages_block(zone, page, start_type, + &movable_pages); + /* + * Determine how many pages are compatible with our allocation. + * For movable allocation, it's the number of movable pages which + * we just obtained. For other types it's a bit more tricky. + */ + if (start_type == MIGRATE_MOVABLE) { + alike_pages = movable_pages; + } else { + /* + * If we are falling back a RECLAIMABLE or UNMOVABLE allocation + * to MOVABLE pageblock, consider all non-movable pages as + * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or + * vice versa, be conservative since we can't distinguish the + * exact migratetype of non-movable pages. + */ + if (old_block_type == MIGRATE_MOVABLE) + alike_pages = pageblock_nr_pages + - (free_pages + movable_pages); + else + alike_pages = 0; + } + /* moving whole block can fail due to zone boundary conditions */ - if (!pages) + if (!free_pages) goto single_page; - /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1)) || + /* + * If a sufficient number of pages in the block are either free or of + * comparable migratability as our allocation, claim the whole block. + */ + if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_type); @@ -2055,7 +2098,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, && !is_migrate_cma(mt)) { zone->nr_reserved_highatomic += pageblock_nr_pages; set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); - move_freepages_block(zone, page, MIGRATE_HIGHATOMIC); + move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); } out_unlock: @@ -2132,7 +2175,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * may increase. 
*/ set_pageblock_migratetype(page, ac->migratetype); - ret = move_freepages_block(zone, page, ac->migratetype); + ret = move_freepages_block(zone, page, ac->migratetype, + NULL); if (ret) { spin_unlock_irqrestore(&zone->lock, flags); return ret; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 7927bbb54a4e..5092e4ef00c8 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -66,7 +66,8 @@ out: set_pageblock_migratetype(page, MIGRATE_ISOLATE); zone->nr_isolate_pageblock++; - nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); + nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, + NULL); __mod_zone_freepage_state(zone, -nr_pages, migratetype); } @@ -120,7 +121,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) * pageblock scanning for freepage moving. */ if (!isolated_page) { - nr_pages = move_freepages_block(zone, page, migratetype); + nr_pages = move_freepages_block(zone, page, migratetype, NULL); __mod_zone_freepage_state(zone, nr_pages, migratetype); } set_pageblock_migratetype(page, migratetype); -- cgit v1.2.3 From b682debd97153706ffbe2fe3f8ec30a7ee11f9e1 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 8 May 2017 15:54:43 -0700 Subject: mm, compaction: change migrate_async_suitable() to suitable_migration_source() Preparation for making the decisions more complex and depending on compact_control flags. No functional change. Link: http://lkml.kernel.org/r/20170307131545.28577-6-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Johannes Weiner Cc: Joonsoo Kim Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 5 +++++ mm/compaction.c | 19 +++++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e0c3c5e3d8a0..ebaccd4e7d8c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -74,6 +74,11 @@ extern char * const migratetype_names[MIGRATE_TYPES]; # define is_migrate_cma_page(_page) false #endif +static inline bool is_migrate_movable(int mt) +{ + return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE; +} + #define for_each_migratetype_order(order, type) \ for (order = 0; order < MAX_ORDER; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) diff --git a/mm/compaction.c b/mm/compaction.c index 01b1fb8f6f47..a20876e37648 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -89,11 +89,6 @@ static void map_pages(struct list_head *list) list_splice(&tmp_list, list); } -static inline bool migrate_async_suitable(int migratetype) -{ - return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; -} - #ifdef CONFIG_COMPACTION int PageMovable(struct page *page) @@ -988,6 +983,15 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION +static bool suitable_migration_source(struct compact_control *cc, + struct page *page) +{ + if (cc->mode != MIGRATE_ASYNC) + return true; + + return is_migrate_movable(get_pageblock_migratetype(page)); +} + /* Returns true if the page is within a block suitable for migration to */ static bool suitable_migration_target(struct compact_control *cc, struct page *page) @@ -1007,7 +1011,7 @@ static bool suitable_migration_target(struct compact_control *cc, return true; /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if 
(migrate_async_suitable(get_pageblock_migratetype(page))) + if (is_migrate_movable(get_pageblock_migratetype(page))) return true; /* Otherwise skip the block */ @@ -1242,8 +1246,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * Async compaction is optimistic to see if the minimum amount * of work satisfies the allocation. */ - if (cc->mode == MIGRATE_ASYNC && - !migrate_async_suitable(get_pageblock_migratetype(page))) + if (!suitable_migration_source(cc, page)) continue; /* Perform the isolation */ -- cgit v1.2.3 From 7c30f352c852bae2715ad65ac4a38ca9af7d7696 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Mon, 8 May 2017 15:55:05 -0700 Subject: jiffies.h: declare jiffies and jiffies_64 with ____cacheline_aligned_in_smp jiffies_64 is defined in kernel/time/timer.c with ____cacheline_aligned_in_smp; however, this macro is not part of the declaration of jiffies and jiffies_64 in jiffies.h. As a result clang generates the following warning: kernel/time/timer.c:57:26: error: section does not match previous declaration [-Werror,-Wsection] __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; ^ include/linux/cache.h:39:36: note: expanded from macro '__cacheline_aligned_in_smp' ^ include/linux/cache.h:34:4: note: expanded from macro '__cacheline_aligned' __section__(".data..cacheline_aligned"))) ^ include/linux/jiffies.h:77:12: note: previous attribute is here extern u64 __jiffy_data jiffies_64; ^ include/linux/jiffies.h:70:38: note: expanded from macro '__jiffy_data' Link: http://lkml.kernel.org/r/20170403190200.70273-1-mka@chromium.org Signed-off-by: Matthias Kaehlcke Cc: "Jason A . Donenfeld" Cc: Grant Grundler Cc: Michael Davidson Cc: Greg Hackmann Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/jiffies.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 624215cebee5..36872fbb815d 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -1,6 +1,7 @@ #ifndef _LINUX_JIFFIES_H #define _LINUX_JIFFIES_H +#include #include #include #include @@ -63,19 +64,13 @@ extern int register_refined_jiffies(long clock_tick_rate); /* TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ #define TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) -/* some arch's have a small-data section that can be accessed register-relative - * but that can only take up to, say, 4-byte variables. jiffies being part of - * an 8-byte variable may not be correctly accessed unless we force the issue - */ -#define __jiffy_data __attribute__((section(".data"))) - /* * The 64-bit value is not atomic - you MUST NOT read it * without sampling the sequence number in jiffies_lock. * get_jiffies_64() will do this for you as appropriate. */ -extern u64 __jiffy_data jiffies_64; -extern unsigned long volatile __jiffy_data jiffies; +extern u64 __cacheline_aligned_in_smp jiffies_64; +extern unsigned long volatile __cacheline_aligned_in_smp jiffies; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); -- cgit v1.2.3 From c311c797998c1e70eade463dd60b843da4f1a203 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 8 May 2017 15:56:15 -0700 Subject: cpumask: make "nr_cpumask_bits" unsigned Bit searching functions accept "unsigned long" indices but "nr_cpumask_bits" is "int", which is signed, so inevitable sign extensions occur on x86_64. Those MOVSX instructions are the #1 source of MOVSX bloat by number of uses across the whole kernel.
Change "nr_cpumask_bits" to unsigned; this number can't be negative, after all. It allows implicit zero-extension on x86_64 without MOVSX. Change signed comparisons into unsigned comparisons where necessary. Other uses look fine because the value is either an argument passed to a function or the comparison is already unsigned. Net win on an allyesconfig-type kernel: ~2.8 KB (!) add/remove: 0/0 grow/shrink: 8/725 up/down: 93/-2926 (-2833) function old new delta xen_exit_mmap 691 735 +44 qstat_read 426 440 +14 __cpufreq_cooling_register 1678 1687 +9 trace_rb_cpu_prepare 447 455 +8 vermagic 54 60 +6 nfp_driver_version 54 60 +6 rcu_torture_stats_print 1147 1151 +4 find_next_push_cpu 267 269 +2 xen_irq_resume 961 960 -1 ... init_vp_index 946 906 -40 od_set_powersave_bias 328 281 -47 power_cpu_exit 193 139 -54 arch_show_interrupts 3538 3484 -54 select_idle_sibling 1558 1471 -87 Total: Before=158358910, After=158356077, chg -0.00% Same arguments apply to "nr_cpu_ids", but I haven't yet found enough courage to delve into this issue (and a proper fix may require a new type "cpu_t", which is a whole separate story). Link: http://lkml.kernel.org/r/20170309205322.GA1728@avx2 Signed-off-by: Alexey Dobriyan Cc: Rusty Russell Cc: Heiko Carstens Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/kernel/perf_event_mipsxx.c | 2 +- arch/s390/kernel/perf_cpum_sf.c | 2 +- include/linux/cpumask.h | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index 9452b02ce079..313a88b2973f 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -618,7 +618,7 @@ static int mipspmu_event_init(struct perf_event *event) return -ENOENT; } - if (event->cpu >= nr_cpumask_bits || + if ((unsigned int)event->cpu >= nr_cpumask_bits || (event->cpu >= 0 && !cpu_online(event->cpu))) return -ENODEV; diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 9a4f279d25ca..ca960d0370d5 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -823,7 +823,7 @@ static int cpumsf_pmu_event_init(struct perf_event *event) } /* Check online status of the CPU to which the event is pinned */ - if (event->cpu >= nr_cpumask_bits || + if ((unsigned int)event->cpu >= nr_cpumask_bits || (event->cpu >= 0 && !cpu_online(event->cpu))) return -ENODEV; diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 1a675604b17d..2404ad238c0b 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -40,9 +40,9 @@ extern int nr_cpu_ids; #ifdef CONFIG_CPUMASK_OFFSTACK /* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also, * not all bits may be allocated. */ -#define nr_cpumask_bits nr_cpu_ids +#define nr_cpumask_bits ((unsigned int)nr_cpu_ids) #else -#define nr_cpumask_bits NR_CPUS +#define nr_cpumask_bits ((unsigned int)NR_CPUS) #endif /* -- cgit v1.2.3 From 692f66f26a4c19d73249736aa973c13a1521b387 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Mon, 8 May 2017 15:56:18 -0700 Subject: crash: move crashkernel parsing and vmcore related code under CONFIG_CRASH_CORE Patch series "kexec/fadump: remove dependency with CONFIG_KEXEC and reuse crashkernel parameter for fadump", v4. Traditionally, kdump is used to save vmcore in case of a crash. Some architectures like powerpc can save vmcore using architecture specific support instead of the kexec/kdump mechanism.
Such architecture specific support also needs to reserve memory, to be used by the dump capture kernel. The crashkernel parameter can be reused for memory reservation by such architecture specific infrastructure. This patchset removes the dependency on CONFIG_KEXEC for the crashkernel parameter and vmcoreinfo related code, as it can be reused without kexec support. Also, the crashkernel parameter is reused instead of fadump_reserve_mem to reserve memory for fadump. The first patch moves crashkernel parameter parsing and vmcoreinfo related code under CONFIG_CRASH_CORE instead of CONFIG_KEXEC_CORE. The second patch reuses the definitions of append_elf_note() & final_note() functions under CONFIG_CRASH_CORE in IA64 arch code. The third patch removes the dependency on CONFIG_KEXEC for firmware-assisted dump (fadump) in powerpc. The next patch reuses the crashkernel parameter for reserving memory for fadump, instead of the fadump_reserve_mem parameter. This has the advantage of making all the syntaxes the crashkernel parameter supports available for fadump as well. The last patch updates the fadump kernel documentation about the use of the crashkernel parameter. This patch (of 5): Traditionally, kdump is used to save vmcore in case of a crash. Some architectures like powerpc can save vmcore using architecture specific support instead of the kexec/kdump mechanism. Such architecture specific support also needs to reserve memory, to be used by the dump capture kernel. The crashkernel parameter can be reused for memory reservation by such architecture specific infrastructure. But currently, the code related to vmcoreinfo and parsing of the crashkernel parameter is built under CONFIG_KEXEC_CORE. This patch introduces CONFIG_CRASH_CORE and moves the above mentioned code under this config, allowing code reuse without a dependency on CONFIG_KEXEC. There is no functional change with this patch.
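For reference, the parsing code this patch moves into kernel/crash_core.c accepts three crashkernel= forms, matching the parse_crashkernel_simple(), parse_crashkernel_mem() and parse_crashkernel_suffix() helpers shown in the diff below. The sizes here are illustrative example values, not taken from this series:

 crashkernel=128M@16M                (simple form: crashkernel=size[@offset])
 crashkernel=512M-2G:64M,2G-:128M    (range form: crashkernel=ramsize-range:size[,...][@offset])
 crashkernel=256M,high               (suffix form: crashkernel=size,[high|low])

With fadump reusing this parameter, the same forms become available for firmware-assisted dump reservations as well.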
Link: http://lkml.kernel.org/r/149035338104.6881.4550894432615189948.stgit@hbathini.in.ibm.com Signed-off-by: Hari Bathini Acked-by: Dave Young Cc: Fenghua Yu Cc: Tony Luck Cc: Eric Biederman Cc: Mahesh Salgaonkar Cc: Vivek Goyal Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 4 + include/linux/crash_core.h | 65 +++++++ include/linux/kexec.h | 57 +----- include/linux/printk.h | 4 +- kernel/Makefile | 1 + kernel/crash_core.c | 445 +++++++++++++++++++++++++++++++++++++++++++++ kernel/kexec_core.c | 403 ---------------------------------------- kernel/ksysfs.c | 8 +- kernel/printk/printk.c | 6 +- 9 files changed, 531 insertions(+), 462 deletions(-) create mode 100644 include/linux/crash_core.h create mode 100644 kernel/crash_core.c (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index 640999412d11..dcbd462b68b1 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -2,7 +2,11 @@ # General architecture dependent options # +config CRASH_CORE + bool + config KEXEC_CORE + select CRASH_CORE bool config HAVE_IMA_KEXEC diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h new file mode 100644 index 000000000000..18d0f946fda3 --- /dev/null +++ b/include/linux/crash_core.h @@ -0,0 +1,65 @@ +#ifndef LINUX_CRASH_CORE_H +#define LINUX_CRASH_CORE_H + +#include +#include +#include + +#define CRASH_CORE_NOTE_NAME "CORE" +#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) +#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) +#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) + +#define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ + CRASH_CORE_NOTE_NAME_BYTES + \ + CRASH_CORE_NOTE_DESC_BYTES) + +#define VMCOREINFO_BYTES (4096) +#define VMCOREINFO_NOTE_NAME "VMCOREINFO" +#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) +#define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ + VMCOREINFO_NOTE_NAME_BYTES + \ + VMCOREINFO_BYTES) + +typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; + +void crash_save_vmcoreinfo(void); +void arch_crash_save_vmcoreinfo(void); +__printf(1, 2) +void vmcoreinfo_append_str(const char *fmt, ...); +phys_addr_t paddr_vmcoreinfo_note(void); + +#define VMCOREINFO_OSRELEASE(value) \ + vmcoreinfo_append_str("OSRELEASE=%s\n", value) +#define VMCOREINFO_PAGESIZE(value) \ + vmcoreinfo_append_str("PAGESIZE=%ld\n", value) +#define VMCOREINFO_SYMBOL(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) +#define VMCOREINFO_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(name)) +#define VMCOREINFO_STRUCT_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(struct name)) +#define VMCOREINFO_OFFSET(name, field) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ + (unsigned long)offsetof(struct name, field)) +#define VMCOREINFO_LENGTH(name, value) \ + vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) +#define VMCOREINFO_NUMBER(name) \ + vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) +#define VMCOREINFO_CONFIG(name) \ + vmcoreinfo_append_str("CONFIG_%s=y\n", #name) + +extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +extern size_t vmcoreinfo_size; +extern size_t vmcoreinfo_max_size; + +int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); +int parse_crashkernel_high(char 
*cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); +int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); + +#endif /* LINUX_CRASH_CORE_H */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index d419d0e51fe5..c9481ebcbc0c 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -14,17 +14,15 @@ #if !defined(__ASSEMBLY__) +#include #include #include #ifdef CONFIG_KEXEC_CORE #include -#include #include #include -#include -#include #include #include @@ -62,19 +60,15 @@ #define KEXEC_CRASH_MEM_ALIGN PAGE_SIZE #endif -#define KEXEC_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) -#define KEXEC_CORE_NOTE_NAME "CORE" -#define KEXEC_CORE_NOTE_NAME_BYTES ALIGN(sizeof(KEXEC_CORE_NOTE_NAME), 4) -#define KEXEC_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) +#define KEXEC_CORE_NOTE_NAME CRASH_CORE_NOTE_NAME + /* * The per-cpu notes area is a list of notes terminated by a "NULL" * note header. For kdump, the code in vmcore.c runs in the context * of the second kernel to combine them into one note. */ #ifndef KEXEC_NOTE_BYTES -#define KEXEC_NOTE_BYTES ( (KEXEC_NOTE_HEAD_BYTES * 2) + \ - KEXEC_CORE_NOTE_NAME_BYTES + \ - KEXEC_CORE_NOTE_DESC_BYTES ) +#define KEXEC_NOTE_BYTES CRASH_CORE_NOTE_BYTES #endif /* @@ -256,33 +250,6 @@ extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); int kexec_crash_loaded(void); void crash_save_cpu(struct pt_regs *regs, int cpu); -void crash_save_vmcoreinfo(void); -void arch_crash_save_vmcoreinfo(void); -__printf(1, 2) -void vmcoreinfo_append_str(const char *fmt, ...); -phys_addr_t paddr_vmcoreinfo_note(void); - -#define VMCOREINFO_OSRELEASE(value) \ - vmcoreinfo_append_str("OSRELEASE=%s\n", value) -#define VMCOREINFO_PAGESIZE(value) \ - vmcoreinfo_append_str("PAGESIZE=%ld\n", value) -#define VMCOREINFO_SYMBOL(name) \ - vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) -#define VMCOREINFO_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ - (unsigned long)sizeof(name)) -#define VMCOREINFO_STRUCT_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ - (unsigned long)sizeof(struct name)) -#define VMCOREINFO_OFFSET(name, field) \ - vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ - (unsigned long)offsetof(struct name, field)) -#define VMCOREINFO_LENGTH(name, value) \ - vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) -#define VMCOREINFO_NUMBER(name) \ - vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) -#define VMCOREINFO_CONFIG(name) \ - vmcoreinfo_append_str("CONFIG_%s=y\n", #name) extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; @@ -303,31 +270,15 @@ extern int kexec_load_disabled; #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ KEXEC_FILE_NO_INITRAMFS) -#define VMCOREINFO_BYTES (4096) -#define VMCOREINFO_NOTE_NAME "VMCOREINFO" -#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) -#define VMCOREINFO_NOTE_SIZE (KEXEC_NOTE_HEAD_BYTES*2 + VMCOREINFO_BYTES \ - + VMCOREINFO_NOTE_NAME_BYTES) - /* Location of a reserved region to hold the crash kernel. 
*/ extern struct resource crashk_res; extern struct resource crashk_low_res; -typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4]; extern note_buf_t __percpu *crash_notes; -extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; -extern size_t vmcoreinfo_size; -extern size_t vmcoreinfo_max_size; /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; -int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base); -int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base); -int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base); int crash_shrink_memory(unsigned long new_size); size_t crash_get_memory_size(void); void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); diff --git a/include/linux/printk.h b/include/linux/printk.h index 571257e0f53d..e10f27468322 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -198,7 +198,7 @@ extern void wake_up_klogd(void); char *log_buf_addr_get(void); u32 log_buf_len_get(void); -void log_buf_kexec_setup(void); +void log_buf_vmcoreinfo_setup(void); void __init setup_log_buf(int early); __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...); void dump_stack_print_info(const char *log_lvl); @@ -246,7 +246,7 @@ static inline u32 log_buf_len_get(void) return 0; } -static inline void log_buf_kexec_setup(void) +static inline void log_buf_vmcoreinfo_setup(void) { } diff --git a/kernel/Makefile b/kernel/Makefile index b302b4731d16..72aa080f91f0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -59,6 +59,7 @@ obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o +obj-$(CONFIG_CRASH_CORE) += crash_core.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o diff --git a/kernel/crash_core.c b/kernel/crash_core.c new file mode 100644 index 000000000000..4261587a34d2 --- /dev/null +++ b/kernel/crash_core.c @@ -0,0 +1,445 @@ +/* + * crash.c - kernel crash support code. + * Copyright (C) 2002-2004 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include +#include +#include + +#include +#include + +/* vmcoreinfo stuff */ +static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; +u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +size_t vmcoreinfo_size; +size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); + +/* + * parsing the "crashkernel" commandline + * + * this code is intended to be called from architecture specific code + */ + + +/* + * This function parses command lines in the format + * + * crashkernel=ramsize-range:size[,...][@offset] + * + * The function returns 0 on success and -EINVAL on failure. 
+ */ +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline, *tmp; + + /* for each entry of the comma-separated list */ + do { + unsigned long long start, end = ULLONG_MAX, size; + + /* get the start of the range */ + start = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (*cur != '-') { + pr_warn("crashkernel: '-' expected\n"); + return -EINVAL; + } + cur++; + + /* if no ':' is here, than we read the end */ + if (*cur != ':') { + end = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (end <= start) { + pr_warn("crashkernel: end <= start\n"); + return -EINVAL; + } + } + + if (*cur != ':') { + pr_warn("crashkernel: ':' expected\n"); + return -EINVAL; + } + cur++; + + size = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (size >= system_ram) { + pr_warn("crashkernel: invalid size\n"); + return -EINVAL; + } + + /* match ? */ + if (system_ram >= start && system_ram < end) { + *crash_size = size; + break; + } + } while (*cur++ == ','); + + if (*crash_size > 0) { + while (*cur && *cur != ' ' && *cur != '@') + cur++; + if (*cur == '@') { + cur++; + *crash_base = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected after '@'\n"); + return -EINVAL; + } + } + } + + return 0; +} + +/* + * That function parses "simple" (old) crashkernel command lines like + * + * crashkernel=size[@offset] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_simple(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + if (*cur == '@') + *crash_base = memparse(cur+1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW 1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { + [SUFFIX_HIGH] = ",high", + [SUFFIX_LOW] = ",low", + [SUFFIX_NULL] = NULL, +}; + +/* + * That function parses "suffix" crashkernel command lines like + * + * crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure. 
+ */ +static int __init parse_crashkernel_suffix(char *cmdline, + unsigned long long *crash_size, + const char *suffix) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + /* check with suffix */ + if (strncmp(cur, suffix, strlen(suffix))) { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + cur += strlen(suffix); + if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, + const char *name, + const char *suffix) +{ + char *p = cmdline, *ck_cmdline = NULL; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, name); + while (p) { + char *end_p = strchr(p, ' '); + char *q; + + if (!end_p) + end_p = p + strlen(p); + + if (!suffix) { + int i; + + /* skip the one with any known suffix */ + for (i = 0; suffix_tbl[i]; i++) { + q = end_p - strlen(suffix_tbl[i]); + if (!strncmp(q, suffix_tbl[i], + strlen(suffix_tbl[i]))) + goto next; + } + ck_cmdline = p; + } else { + q = end_p - strlen(suffix); + if (!strncmp(q, suffix, strlen(suffix))) + ck_cmdline = p; + } +next: + p = strstr(p+1, name); + } + + if (!ck_cmdline) + return NULL; + + return ck_cmdline; +} + +static int __init __parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base, + const char *name, + const char *suffix) +{ + char *first_colon, *first_space; + char *ck_cmdline; + + BUG_ON(!crash_size || !crash_base); + *crash_size = 0; + *crash_base = 0; + + ck_cmdline = get_last_crashkernel(cmdline, name, suffix); + + if (!ck_cmdline) + return -EINVAL; + + ck_cmdline += strlen(name); + + if (suffix) + return parse_crashkernel_suffix(ck_cmdline, crash_size, + suffix); + /* + * if the commandline contains a ':', then that's the extended + * syntax -- if not, it must be the classic syntax + */ + first_colon = strchr(ck_cmdline, ':'); + first_space = strchr(ck_cmdline, ' '); + if (first_colon && (!first_space || first_colon < first_space)) + return parse_crashkernel_mem(ck_cmdline, system_ram, + crash_size, crash_base); + + return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); +} + +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. 
+ */ +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", NULL); +} + +int __init parse_crashkernel_high(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_HIGH]); +} + +int __init parse_crashkernel_low(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_LOW]); +} + +static u32 *append_elf_note(u32 *buf, char *name, unsigned int type, + void *data, size_t data_len) +{ + struct elf_note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = data_len; + note.n_type = type; + memcpy(buf, &note, sizeof(note)); + buf += (sizeof(note) + 3)/4; + memcpy(buf, name, note.n_namesz); + buf += (note.n_namesz + 3)/4; + memcpy(buf, data, note.n_descsz); + buf += (note.n_descsz + 3)/4; + + return buf; +} + +static void final_note(u32 *buf) +{ + struct elf_note note; + + note.n_namesz = 0; + note.n_descsz = 0; + note.n_type = 0; + memcpy(buf, &note, sizeof(note)); +} + +static void update_vmcoreinfo_note(void) +{ + u32 *buf = vmcoreinfo_note; + + if (!vmcoreinfo_size) + return; + buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, + vmcoreinfo_size); + final_note(buf); +} + +void crash_save_vmcoreinfo(void) +{ + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); + update_vmcoreinfo_note(); +} + +void vmcoreinfo_append_str(const char *fmt, ...)
+{ + va_list args; + char buf[0x50]; + size_t r; + + va_start(args, fmt); + r = vscnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); + + memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); + + vmcoreinfo_size += r; +} + +/* + * provide an empty default implementation here -- architecture + * code may override this + */ +void __weak arch_crash_save_vmcoreinfo(void) +{} + +phys_addr_t __weak paddr_vmcoreinfo_note(void) +{ + return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); +} + +static int __init crash_save_vmcoreinfo_init(void) +{ + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); + VMCOREINFO_PAGESIZE(PAGE_SIZE); + + VMCOREINFO_SYMBOL(init_uts_ns); + VMCOREINFO_SYMBOL(node_online_map); +#ifdef CONFIG_MMU + VMCOREINFO_SYMBOL(swapper_pg_dir); +#endif + VMCOREINFO_SYMBOL(_stext); + VMCOREINFO_SYMBOL(vmap_area_list); + +#ifndef CONFIG_NEED_MULTIPLE_NODES + VMCOREINFO_SYMBOL(mem_map); + VMCOREINFO_SYMBOL(contig_page_data); +#endif +#ifdef CONFIG_SPARSEMEM + VMCOREINFO_SYMBOL(mem_section); + VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); + VMCOREINFO_STRUCT_SIZE(mem_section); + VMCOREINFO_OFFSET(mem_section, section_mem_map); +#endif + VMCOREINFO_STRUCT_SIZE(page); + VMCOREINFO_STRUCT_SIZE(pglist_data); + VMCOREINFO_STRUCT_SIZE(zone); + VMCOREINFO_STRUCT_SIZE(free_area); + VMCOREINFO_STRUCT_SIZE(list_head); + VMCOREINFO_SIZE(nodemask_t); + VMCOREINFO_OFFSET(page, flags); + VMCOREINFO_OFFSET(page, _refcount); + VMCOREINFO_OFFSET(page, mapping); + VMCOREINFO_OFFSET(page, lru); + VMCOREINFO_OFFSET(page, _mapcount); + VMCOREINFO_OFFSET(page, private); + VMCOREINFO_OFFSET(page, compound_dtor); + VMCOREINFO_OFFSET(page, compound_order); + VMCOREINFO_OFFSET(page, compound_head); + VMCOREINFO_OFFSET(pglist_data, node_zones); + VMCOREINFO_OFFSET(pglist_data, nr_zones); +#ifdef CONFIG_FLAT_NODE_MEM_MAP + VMCOREINFO_OFFSET(pglist_data, node_mem_map); +#endif + VMCOREINFO_OFFSET(pglist_data, node_start_pfn); + VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); + VMCOREINFO_OFFSET(pglist_data, node_id); + VMCOREINFO_OFFSET(zone, free_area); + VMCOREINFO_OFFSET(zone, vm_stat); + VMCOREINFO_OFFSET(zone, spanned_pages); + VMCOREINFO_OFFSET(free_area, free_list); + VMCOREINFO_OFFSET(list_head, next); + VMCOREINFO_OFFSET(list_head, prev); + VMCOREINFO_OFFSET(vmap_area, va_start); + VMCOREINFO_OFFSET(vmap_area, list); + VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); + log_buf_vmcoreinfo_setup(); + VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); + VMCOREINFO_NUMBER(NR_FREE_PAGES); + VMCOREINFO_NUMBER(PG_lru); + VMCOREINFO_NUMBER(PG_private); + VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_slab); +#ifdef CONFIG_MEMORY_FAILURE + VMCOREINFO_NUMBER(PG_hwpoison); +#endif + VMCOREINFO_NUMBER(PG_head_mask); + VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); +#ifdef CONFIG_HUGETLB_PAGE + VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); +#endif + + arch_crash_save_vmcoreinfo(); + update_vmcoreinfo_note(); + + return 0; +} + +subsys_initcall(crash_save_vmcoreinfo_init); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index bfe62d5b3872..9dd722912850 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -51,12 +51,6 @@ DEFINE_MUTEX(kexec_mutex); /* Per cpu memory for storing cpu states in case of system crash. 
*/ note_buf_t __percpu *crash_notes; -/* vmcoreinfo stuff */ -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; -size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); - /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; @@ -1084,403 +1078,6 @@ static int __init crash_notes_memory_init(void) subsys_initcall(crash_notes_memory_init); -/* - * parsing the "crashkernel" commandline - * - * this code is intended to be called from architecture specific code - */ - - -/* - * This function parses command lines in the format - * - * crashkernel=ramsize-range:size[,...][@offset] - * - * The function returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline, *tmp; - - /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= system_ram) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (system_ram >= start && system_ram < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); - - if (*crash_size > 0) { - while (*cur && *cur != ' ' && *cur != '@') - cur++; - if (*cur == '@') { - cur++; - *crash_base = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected after '@'\n"); - return -EINVAL; - } - } - } - - return 0; -} - -/* - * That function parses "simple" (old) crashkernel command lines like - * - * crashkernel=size[@offset] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - if (*cur == '@') - *crash_base = memparse(cur+1, &cur); - else if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - - return 0; -} - -#define SUFFIX_HIGH 0 -#define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 -static __initdata char *suffix_tbl[] = { - [SUFFIX_HIGH] = ",high", - [SUFFIX_LOW] = ",low", - [SUFFIX_NULL] = NULL, -}; - -/* - * That function parses "suffix" crashkernel command lines like - * - * crashkernel=size,[high|low] - * - * It returns 0 on success and -EINVAL on failure. 
- */ -static int __init parse_crashkernel_suffix(char *cmdline, - unsigned long long *crash_size, - const char *suffix) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - /* check with suffix */ - if (strncmp(cur, suffix, strlen(suffix))) { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - cur += strlen(suffix); - if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - - return 0; -} - -static __init char *get_last_crashkernel(char *cmdline, - const char *name, - const char *suffix) -{ - char *p = cmdline, *ck_cmdline = NULL; - - /* find crashkernel and use the last one if there are more */ - p = strstr(p, name); - while (p) { - char *end_p = strchr(p, ' '); - char *q; - - if (!end_p) - end_p = p + strlen(p); - - if (!suffix) { - int i; - - /* skip the one with any known suffix */ - for (i = 0; suffix_tbl[i]; i++) { - q = end_p - strlen(suffix_tbl[i]); - if (!strncmp(q, suffix_tbl[i], - strlen(suffix_tbl[i]))) - goto next; - } - ck_cmdline = p; - } else { - q = end_p - strlen(suffix); - if (!strncmp(q, suffix, strlen(suffix))) - ck_cmdline = p; - } -next: - p = strstr(p+1, name); - } - - if (!ck_cmdline) - return NULL; - - return ck_cmdline; -} - -static int __init __parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base, - const char *name, - const char *suffix) -{ - char *first_colon, *first_space; - char *ck_cmdline; - - BUG_ON(!crash_size || !crash_base); - *crash_size = 0; - *crash_base = 0; - - ck_cmdline = get_last_crashkernel(cmdline, name, suffix); - - if (!ck_cmdline) - return -EINVAL; - - ck_cmdline += strlen(name); - - if (suffix) - return parse_crashkernel_suffix(ck_cmdline, crash_size, - suffix); - /* - * if the commandline contains a ':', then that's the extended - * syntax -- if not, it must be the classic syntax - */ - first_colon = strchr(ck_cmdline, ':'); - first_space = strchr(ck_cmdline, ' '); - if (first_colon && (!first_space || first_colon < first_space)) - return parse_crashkernel_mem(ck_cmdline, system_ram, - crash_size, crash_base); - - return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); -} - -/* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. 
- */ -int __init parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", NULL); -} - -int __init parse_crashkernel_high(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_HIGH]); -} - -int __init parse_crashkernel_low(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_LOW]); -} - -static void update_vmcoreinfo_note(void) -{ - u32 *buf = vmcoreinfo_note; - - if (!vmcoreinfo_size) - return; - buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, - vmcoreinfo_size); - final_note(buf); -} - -void crash_save_vmcoreinfo(void) -{ - vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); - update_vmcoreinfo_note(); -} - -void vmcoreinfo_append_str(const char *fmt, ...) -{ - va_list args; - char buf[0x50]; - size_t r; - - va_start(args, fmt); - r = vscnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - - r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); - - memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); - - vmcoreinfo_size += r; -} - -/* - * provide an empty default implementation here -- architecture - * code may override this - */ -void __weak arch_crash_save_vmcoreinfo(void) -{} - -phys_addr_t __weak paddr_vmcoreinfo_note(void) -{ - return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); -} - -static int __init crash_save_vmcoreinfo_init(void) -{ - VMCOREINFO_OSRELEASE(init_uts_ns.name.release); - VMCOREINFO_PAGESIZE(PAGE_SIZE); - - VMCOREINFO_SYMBOL(init_uts_ns); - VMCOREINFO_SYMBOL(node_online_map); -#ifdef CONFIG_MMU - VMCOREINFO_SYMBOL(swapper_pg_dir); -#endif - VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmap_area_list); - -#ifndef CONFIG_NEED_MULTIPLE_NODES - VMCOREINFO_SYMBOL(mem_map); - VMCOREINFO_SYMBOL(contig_page_data); -#endif -#ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL(mem_section); - VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); - VMCOREINFO_STRUCT_SIZE(mem_section); - VMCOREINFO_OFFSET(mem_section, section_mem_map); -#endif - VMCOREINFO_STRUCT_SIZE(page); - VMCOREINFO_STRUCT_SIZE(pglist_data); - VMCOREINFO_STRUCT_SIZE(zone); - VMCOREINFO_STRUCT_SIZE(free_area); - VMCOREINFO_STRUCT_SIZE(list_head); - VMCOREINFO_SIZE(nodemask_t); - VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _refcount); - VMCOREINFO_OFFSET(page, mapping); - VMCOREINFO_OFFSET(page, lru); - VMCOREINFO_OFFSET(page, _mapcount); - VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(page, compound_dtor); - VMCOREINFO_OFFSET(page, compound_order); - VMCOREINFO_OFFSET(page, compound_head); - VMCOREINFO_OFFSET(pglist_data, node_zones); - VMCOREINFO_OFFSET(pglist_data, nr_zones); -#ifdef CONFIG_FLAT_NODE_MEM_MAP - VMCOREINFO_OFFSET(pglist_data, node_mem_map); -#endif - VMCOREINFO_OFFSET(pglist_data, node_start_pfn); - VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); - VMCOREINFO_OFFSET(pglist_data, node_id); - VMCOREINFO_OFFSET(zone, free_area); - VMCOREINFO_OFFSET(zone, vm_stat); - VMCOREINFO_OFFSET(zone, spanned_pages); - VMCOREINFO_OFFSET(free_area, free_list); - VMCOREINFO_OFFSET(list_head, next); - VMCOREINFO_OFFSET(list_head, prev); - 
VMCOREINFO_OFFSET(vmap_area, va_start); - VMCOREINFO_OFFSET(vmap_area, list); - VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); - log_buf_kexec_setup(); - VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); - VMCOREINFO_NUMBER(NR_FREE_PAGES); - VMCOREINFO_NUMBER(PG_lru); - VMCOREINFO_NUMBER(PG_private); - VMCOREINFO_NUMBER(PG_swapcache); - VMCOREINFO_NUMBER(PG_slab); -#ifdef CONFIG_MEMORY_FAILURE - VMCOREINFO_NUMBER(PG_hwpoison); -#endif - VMCOREINFO_NUMBER(PG_head_mask); - VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef CONFIG_HUGETLB_PAGE - VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); -#endif - - arch_crash_save_vmcoreinfo(); - update_vmcoreinfo_note(); - - return 0; -} - -subsys_initcall(crash_save_vmcoreinfo_init); - /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 0999679d6f26..23cd70651238 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -125,6 +125,10 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj, } KERNEL_ATTR_RW(kexec_crash_size); +#endif /* CONFIG_KEXEC_CORE */ + +#ifdef CONFIG_CRASH_CORE + static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -134,7 +138,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, } KERNEL_ATTR_RO(vmcoreinfo); -#endif /* CONFIG_KEXEC_CORE */ +#endif /* CONFIG_CRASH_CORE */ /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, @@ -219,6 +223,8 @@ static struct attribute * kernel_attrs[] = { &kexec_loaded_attr.attr, &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, +#endif +#ifdef CONFIG_CRASH_CORE &vmcoreinfo_attr.attr, #endif #ifndef CONFIG_TINY_RCU diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 779479ac9f57..fb2d1591f671 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include @@ -1002,7 +1002,7 @@ const struct file_operations kmsg_fops = { .release = devkmsg_release, }; -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_CORE /* * This appends the listed symbols to /proc/vmcore * * @@ -1011,7 +1011,7 @@ const struct file_operations kmsg_fops = { * symbols are specifically used so that utilities can access and extract the * dmesg log from a vmcore file after a crash. */ -void log_buf_kexec_setup(void) +void log_buf_vmcoreinfo_setup(void) { VMCOREINFO_SYMBOL(log_buf); VMCOREINFO_SYMBOL(log_buf_len); -- cgit v1.2.3 From 51dbd92520d4344fef78481b1bcbc3a7de32b69b Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Mon, 8 May 2017 15:56:21 -0700 Subject: ia64: reuse append_elf_note() and final_note() functions Get rid of multiple definitions of append_elf_note() & final_note() functions. Reuse these functions compiled under CONFIG_CRASH_CORE. Also, define Elf_Word and use it instead of the generic u32 or the more specific Elf64_Word.
Link: http://lkml.kernel.org/r/149035342324.6881.11667840929850361402.stgit@hbathini.in.ibm.com Signed-off-by: Hari Bathini Acked-by: Dave Young Acked-by: Tony Luck Cc: Fenghua Yu Cc: Eric Biederman Cc: Mahesh Salgaonkar Cc: Vivek Goyal Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/crash.c | 22 ---------------------- include/linux/crash_core.h | 4 ++++ include/linux/elf.h | 2 ++ kernel/crash_core.c | 34 ++++++++++++++-------------------- kernel/kexec_core.c | 28 ---------------------------- 5 files changed, 20 insertions(+), 70 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c index 2955f359e2a7..75859a07d75b 100644 --- a/arch/ia64/kernel/crash.c +++ b/arch/ia64/kernel/crash.c @@ -27,28 +27,6 @@ static int kdump_freeze_monarch; static int kdump_on_init = 1; static int kdump_on_fatal_mca = 1; -static inline Elf64_Word -*append_elf_note(Elf64_Word *buf, char *name, unsigned type, void *data, - size_t data_len) -{ - struct elf_note *note = (struct elf_note *)buf; - note->n_namesz = strlen(name) + 1; - note->n_descsz = data_len; - note->n_type = type; - buf += (sizeof(*note) + 3)/4; - memcpy(buf, name, note->n_namesz); - buf += (note->n_namesz + 3)/4; - memcpy(buf, data, data_len); - buf += (data_len + 3)/4; - return buf; -} - -static void -final_note(void *buf) -{ - memset(buf, 0, sizeof(struct elf_note)); -} - extern void ia64_dump_cpu_regs(void *); static DEFINE_PER_CPU(struct elf_prstatus, elf_prstatus); diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 18d0f946fda3..541a197ba4a2 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -55,6 +55,10 @@ extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; extern size_t vmcoreinfo_size; extern size_t vmcoreinfo_max_size; +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len); +void final_note(Elf_Word *buf); + int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); diff --git a/include/linux/elf.h b/include/linux/elf.h index 20fa8d8ae313..ba069e8f4f78 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -29,6 +29,7 @@ extern Elf32_Dyn _DYNAMIC []; #define elf_note elf32_note #define elf_addr_t Elf32_Off #define Elf_Half Elf32_Half +#define Elf_Word Elf32_Word #else @@ -39,6 +40,7 @@ extern Elf64_Dyn _DYNAMIC []; #define elf_note elf64_note #define elf_addr_t Elf64_Off #define Elf_Half Elf64_Half +#define Elf_Word Elf64_Word #endif diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 4261587a34d2..fcbd568f1e95 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -291,32 +291,26 @@ int __init parse_crashkernel_low(char *cmdline, "crashkernel=", suffix_tbl[SUFFIX_LOW]); } -static u32 *append_elf_note(u32 *buf, char *name, unsigned int type, - void *data, size_t data_len) +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len) { - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, &note, sizeof(note)); - buf += (sizeof(note) + 3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; + struct elf_note *note = (struct elf_note *)buf; + + note->n_namesz = strlen(name) +
+ 1; + note->n_descsz = data_len; + note->n_type = type; + buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word)); + memcpy(buf, name, note->n_namesz); + buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word)); + memcpy(buf, data, data_len); + buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word)); return buf; } -static void final_note(u32 *buf) +void final_note(Elf_Word *buf) { - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, &note, sizeof(note)); + memset(buf, 0, sizeof(struct elf_note)); } static void update_vmcoreinfo_note(void) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 9dd722912850..ae1a3ba24df5 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -990,34 +990,6 @@ unlock: return ret; } -static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, - size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, &note, sizeof(note)); - buf += (sizeof(note) + 3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, &note, sizeof(note)); -} - void crash_save_cpu(struct pt_regs *regs, int cpu) { struct elf_prstatus prstatus; -- cgit v1.2.3 From 25b14e92af1a563c7331466ca59188f88050bbf0 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 8 May 2017 15:56:38 -0700 Subject: ns: allow ns_entries to have custom symlink content Patch series "Expose task pid_ns_for_children to userspace". pid_ns_for_children set by a task is known only to the task itself, and it's impossible to identify it from outside. It's a big problem for checkpoint/restore software like CRIU, because it can't correctly handle tasks that do setns(CLONE_NEWPID) in the process of their work. If they have a custom pid_ns_for_children before dump, they must have the same ns after restore. Otherwise, the restored task runs into an environment it does not expect. This patchset solves the problem. It exposes pid_ns_for_children in the ns directory in the standard way, with the name "pid_for_children": ~# ls /proc/5531/ns -l | grep pid lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid -> pid:[4026531836] lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid_for_children -> pid:[4026532286] This patch (of 2): Make it possible to have a link content prefix yyy different from the link name xxx: $ readlink /proc/[pid]/ns/xxx yyy:[4026531838] This will be used in the next patch, as sketched below.
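A minimal sketch of the resulting usage, assuming the struct proc_ns_operations extension shown in the diff below (the concrete ops table, pidns_for_children_operations, only arrives with the next patch in this series; this example is hypothetical):

 /* Illustrative only: the dentry is named "pid_for_children", but
  * ns_get_name() emits "pid:[<inum>]" because real_ns_name, when set,
  * takes precedence over name for the symlink content. */
 static const struct proc_ns_operations example_ops = {
 	.name		= "pid_for_children",
 	.real_ns_name	= "pid",
 	.type		= CLONE_NEWPID,
 	/* .get, .put, .install, .owner, .get_parent as for pidns_operations */
 };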
Link: http://lkml.kernel.org/r/149201120318.6007.7362655181033883000.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Reviewed-by: Cyrill Gorcunov Acked-by: Andrei Vagin Cc: Andreas Gruenbacher Cc: Kees Cook Cc: Michael Kerrisk Cc: Al Viro Cc: Oleg Nesterov Cc: Paul Moore Cc: Eric Biederman Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nsfs.c | 4 +++- include/linux/proc_ns.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nsfs.c b/fs/nsfs.c index 323f492e0822..f3db56e83dd2 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -196,9 +196,11 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task, { struct ns_common *ns; int res = -ENOENT; + const char *name; ns = ns_ops->get(task); if (ns) { - res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum); + name = ns_ops->real_ns_name ? : ns_ops->name; + res = snprintf(buf, size, "%s:[%u]", name, ns->inum); ns_ops->put(ns); } return res; } diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 12cb8bd81d2d..88dba3b53375 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -14,6 +14,7 @@ struct inode; struct proc_ns_operations { const char *name; + const char *real_ns_name; int type; struct ns_common *(*get)(struct task_struct *task); void (*put)(struct ns_common *ns); -- cgit v1.2.3 From eaa0d190bfe1ed891b814a52712dcd852554cb08 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 8 May 2017 15:56:41 -0700 Subject: pidns: expose task pid_ns_for_children to userspace pid_ns_for_children set by a task is known only to the task itself, and it's impossible to identify it from outside. It's a big problem for checkpoint/restore software like CRIU, because it can't correctly handle tasks that do setns(CLONE_NEWPID) in the process of their work.
This patch solves the problem by exposing pid_ns_for_children in the ns directory in the standard way, with the name "pid_for_children": ~# ls /proc/5531/ns -l | grep pid lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid -> pid:[4026531836] lrwxrwxrwx 1 root root 0 Jan 14 16:38 pid_for_children -> pid:[4026532286] Link: http://lkml.kernel.org/r/149201123914.6007.2187327078064239572.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Cc: Andrei Vagin Cc: Andreas Gruenbacher Cc: Kees Cook Cc: Michael Kerrisk Cc: Al Viro Cc: Oleg Nesterov Cc: Paul Moore Cc: Eric Biederman Cc: Andy Lutomirski Cc: Ingo Molnar Cc: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/namespaces.c | 1 + include/linux/proc_ns.h | 1 + kernel/pid_namespace.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+) (limited to 'include/linux') diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 766f0c637ad1..3803b24ca220 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -23,6 +23,7 @@ static const struct proc_ns_operations *ns_entries[] = { #endif #ifdef CONFIG_PID_NS &pidns_operations, + &pidns_for_children_operations, #endif #ifdef CONFIG_USER_NS &userns_operations, diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 88dba3b53375..58ab28d81fc2 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -27,6 +27,7 @@ extern const struct proc_ns_operations netns_operations; extern const struct proc_ns_operations utsns_operations; extern const struct proc_ns_operations ipcns_operations; extern const struct proc_ns_operations pidns_operations; +extern const struct proc_ns_operations pidns_for_children_operations; extern const struct proc_ns_operations userns_operations; extern const struct proc_ns_operations mntns_operations; extern const struct proc_ns_operations cgroupns_operations; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index de461aa0bf9a..d1f3e9f558b8 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -374,6 +374,29 @@ static struct ns_common *pidns_get(struct task_struct *task) return ns ? &ns->ns : NULL; } +static struct ns_common *pidns_for_children_get(struct task_struct *task) +{ + struct pid_namespace *ns = NULL; + + task_lock(task); + if (task->nsproxy) { + ns = task->nsproxy->pid_ns_for_children; + get_pid_ns(ns); + } + task_unlock(task); + + if (ns) { + read_lock(&tasklist_lock); + if (!ns->child_reaper) { + put_pid_ns(ns); + ns = NULL; + } + read_unlock(&tasklist_lock); + } + + return ns ? &ns->ns : NULL; +} + static void pidns_put(struct ns_common *ns) { put_pid_ns(to_pid_ns(ns)); @@ -443,6 +466,17 @@ const struct proc_ns_operations pidns_operations = { .get_parent = pidns_get_parent, }; +const struct proc_ns_operations pidns_for_children_operations = { + .name = "pid_for_children", + .real_ns_name = "pid", + .type = CLONE_NEWPID, + .get = pidns_for_children_get, + .put = pidns_put, + .install = pidns_install, + .owner = pidns_owner, + .get_parent = pidns_get_parent, +}; + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); -- cgit v1.2.3 From 60f3e00d25b44e3aa51846590d1e10f408466a83 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 8 May 2017 15:57:06 -0700 Subject: sysv,ipc: cacheline align kern_ipc_perm Assign 'struct kern_ipc_perm' its own cacheline to avoid false sharing with sysv ipc calls.
While the structure itself is rather read-mostly throughout the lifespan of ipc, the spinlock causes most of the invalidations. One example is commit 31a7c4746e9 ("ipc/sem.c: cacheline align the ipc spinlock for semaphores"). Therefore, extend this to all ipc. The effect of cacheline alignment on sems can be seen in sembench, which deals mostly with semtimedop waits/wakes; raw throughput (worker loops) is seen to improve by 8 to 12% on a 24-core x86 with over 4 threads. Link: http://lkml.kernel.org/r/1486673582-6979-4-git-send-email-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc.h | 7 +++---- include/linux/sem.h | 3 +-- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipc.h b/include/linux/ipc.h index 9d84942ae2e5..71fd92d81b26 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -8,8 +8,7 @@ #define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ /* used by in-kernel data structures */ -struct kern_ipc_perm -{ +struct kern_ipc_perm { spinlock_t lock; bool deleted; int id; @@ -18,9 +17,9 @@ struct kern_ipc_perm kgid_t gid; kuid_t cuid; kgid_t cgid; - umode_t mode; + umode_t mode; unsigned long seq; void *security; -}; +} ____cacheline_aligned_in_smp; #endif /* _LINUX_IPC_H */ diff --git a/include/linux/sem.h b/include/linux/sem.h index 4fc222f8755d..9edec926e9d9 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -10,8 +10,7 @@ struct task_struct; /* One sem_array data structure for each set of semaphores in the system. */ struct sem_array { - struct kern_ipc_perm ____cacheline_aligned_in_smp - sem_perm; /* permissions .. see ipc.h */ + struct kern_ipc_perm sem_perm; /* permissions .. see ipc.h */ time_t sem_ctime; /* last change time */ struct sem *sem_base; /* ptr to first semaphore in array */ struct list_head pending_alter; /* pending operations */ -- cgit v1.2.3 From a7c3e901a46ff54c016d040847eda598a9e3e653 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 May 2017 15:57:09 -0700 Subject: mm: introduce kv[mz]alloc helpers Patch series "kvmalloc", v5. There are many open coded kmalloc with vmalloc fallback instances in the tree. Most of them are not careful enough or simply do not care about the underlying semantics of the kmalloc/page allocator, which means that a) some vmalloc fallbacks are basically unreachable because the kmalloc part will keep retrying until it succeeds b) the page allocator can invoke really disruptive steps like the OOM killer to move forward, which doesn't sound appropriate when we consider that the vmalloc fallback is available. As can be seen, implementing kvmalloc requires quite an intimate knowledge of the page allocator and the memory reclaim internals, which strongly suggests that a helper should be implemented in the memory subsystem proper. Most callers I could find have been converted to use the helper instead; this is patch 6. There are some more relying on __GFP_REPEAT in the networking stack which I have converted as well, and Eric Dumazet was not opposed [2] to converting them. [1] http://lkml.kernel.org/r/20170130094940.13546-1-mhocko@kernel.org [2] http://lkml.kernel.org/r/1485273626.16328.301.camel@edumazet-glaptop3.roam.corp.google.com This patch (of 9): Using kmalloc with the vmalloc fallback for larger allocations is a common pattern in the kernel code.
Yet we do not have any common helper for that, and so users have invented their own helpers. Some of them are really creative when doing so. Let's just add kv[mz]alloc and make sure it is implemented properly. This implementation makes sure not to create excessive memory pressure for > PAGE_SIZE requests (__GFP_NORETRY) and also not to warn about allocation failures. This also rules out the OOM killer, as vmalloc is a more appropriate fallback than a disruptive user visible action. This patch also changes some existing users and removes helpers which are specific to them. In some cases this is not possible (e.g. ext4_kvmalloc, libcfs_kvzalloc) because those seem to be broken and require a GFP_NO{FS,IO} context, which is not vmalloc compatible in general (note that the page table allocation is GFP_KERNEL). Those need to be fixed separately. While we are at it, document the unsupported gfp masks in __vmalloc{_node}, because there seems to be a lot of confusion out there. kvmalloc_node will warn about flags incompatible with GFP_KERNEL (i.e. not a superset of it) to catch new abusers. Existing ones would have to die slowly. [sfr@canb.auug.org.au: f2fs fixup] Link: http://lkml.kernel.org/r/20170320163735.332e64b7@canb.auug.org.au Link: http://lkml.kernel.org/r/20170306103032.2540-2-mhocko@kernel.org Signed-off-by: Michal Hocko Signed-off-by: Stephen Rothwell Reviewed-by: Andreas Dilger [ext4 part] Acked-by: Vlastimil Babka Cc: John Hubbard Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kvm/lapic.c | 4 ++-- arch/x86/kvm/page_track.c | 4 ++-- arch/x86/kvm/x86.c | 4 ++-- drivers/md/dm-stats.c | 7 +----- fs/ext4/mballoc.c | 2 +- fs/ext4/super.c | 4 ++-- fs/f2fs/f2fs.h | 20 ----------------- fs/f2fs/file.c | 4 ++-- fs/f2fs/node.c | 6 +++--- fs/f2fs/segment.c | 14 ++++++------ fs/seq_file.c | 16 +------------- include/linux/kvm_host.h | 2 -- include/linux/mm.h | 14 ++++++++++++ include/linux/vmalloc.h | 1 + ipc/util.c | 7 +----- mm/nommu.c | 5 +++++ mm/util.c | 45 +++++++++++++++++++++++++++++++++++++++ mm/vmalloc.c | 9 +++++++- security/apparmor/apparmorfs.c | 2 +- security/apparmor/include/lib.h | 11 ---------- security/apparmor/lib.c | 30 -------------------------- security/apparmor/match.c | 2 +- security/apparmor/policy_unpack.c | 2 +- virt/kvm/kvm_main.c | 18 +++------------- 24 files changed, 103 insertions(+), 130 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index bad6a25067bc..d2a892fc92bf 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -177,8 +177,8 @@ static void recalculate_apic_map(struct kvm *kvm) if (kvm_apic_present(vcpu)) max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); - new = kvm_kvzalloc(sizeof(struct kvm_apic_map) + - sizeof(struct kvm_lapic *) * ((u64)max_id + 1)); + new = kvzalloc(sizeof(struct kvm_apic_map) + + sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL); if (!new) goto out; diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index 60168cdd0546..ea67dc876316 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -40,8 +40,8 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, int i; for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { - slot->arch.gfn_track[i] = kvm_kvzalloc(npages * - sizeof(*slot->arch.gfn_track[i])); + slot->arch.gfn_track[i] = kvzalloc(npages * + sizeof(*slot->arch.gfn_track[i]), GFP_KERNEL); if (!slot->arch.gfn_track[i]) goto track_free; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ccbd45ecd41a..ee22226e3807 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8199,13 +8199,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, slot->base_gfn, level) + 1; slot->arch.rmap[i] = - kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i])); + kvzalloc(lpages * sizeof(*slot->arch.rmap[i]), GFP_KERNEL); if (!slot->arch.rmap[i]) goto out_free; if (i == 0) continue; - linfo = kvm_kvzalloc(lpages * sizeof(*linfo)); + linfo = kvzalloc(lpages * sizeof(*linfo), GFP_KERNEL); if (!linfo) goto out_free; diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 0250e7e521ab..6028d8247f58 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -146,12 +146,7 @@ static void *dm_kvzalloc(size_t alloc_size, int node) if (!claim_shared_memory(alloc_size)) return NULL; - if (alloc_size <= KMALLOC_MAX_SIZE) { - p = kzalloc_node(alloc_size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, node); - if (p) - return p; - } - p = vzalloc_node(alloc_size, node); + p = kvzalloc_node(alloc_size, GFP_KERNEL | __GFP_NOMEMALLOC, node); if (p) return p; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 354dc1a894c2..b60698c104fd 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2393,7 +2393,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) return 0; size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); - new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL); + new_groupinfo = kvzalloc(size, GFP_KERNEL); if (!new_groupinfo) { ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); return -ENOMEM; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a9c72e39a4ee..b2c74644d5de 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2153,7 +2153,7 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) return 0; size = roundup_pow_of_two(size * sizeof(struct flex_groups)); - new_groups = ext4_kvzalloc(size, GFP_KERNEL); + new_groups = kvzalloc(size, GFP_KERNEL); if (!new_groups) { ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", size / (int) sizeof(struct flex_groups)); @@ -3887,7 +3887,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } } - sbi->s_group_desc = ext4_kvmalloc(db_count * + sbi->s_group_desc = kvmalloc(db_count * sizeof(struct buffer_head *), GFP_KERNEL); if (sbi->s_group_desc == NULL) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0a6e115562f6..1fc17a1fc5d0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2005,26 +2005,6 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, return kmalloc(size, flags); } -static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) -{ - void *ret; - - ret = kmalloc(size, flags | __GFP_NOWARN); - if (!ret) - ret = __vmalloc(size, flags, PAGE_KERNEL); - return ret; -} - -static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) -{ - void *ret; - - ret = kzalloc(size, flags | __GFP_NOWARN); - if (!ret) - ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); - return ret; -} - #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? 
\ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5f7317875a67..0849af78381f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1012,11 +1012,11 @@ static int __exchange_data_block(struct inode *src_inode, while (len) { olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); - src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + src_blkaddr = kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); if (!src_blkaddr) return -ENOMEM; - do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL); + do_replace = kvzalloc(sizeof(int) * olen, GFP_KERNEL); if (!do_replace) { kvfree(src_blkaddr); return -ENOMEM; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 481aa8dc79f4..0ea1dca8a0e2 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2621,17 +2621,17 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); - nm_i->free_nid_bitmap = f2fs_kvzalloc(nm_i->nat_blocks * + nm_i->free_nid_bitmap = kvzalloc(nm_i->nat_blocks * NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); if (!nm_i->free_nid_bitmap) return -ENOMEM; - nm_i->nat_block_bitmap = f2fs_kvzalloc(nm_i->nat_blocks / 8, + nm_i->nat_block_bitmap = kvzalloc(nm_i->nat_blocks / 8, GFP_KERNEL); if (!nm_i->nat_block_bitmap) return -ENOMEM; - nm_i->free_nid_count = f2fs_kvzalloc(nm_i->nat_blocks * + nm_i->free_nid_count = kvzalloc(nm_i->nat_blocks * sizeof(unsigned short), GFP_KERNEL); if (!nm_i->free_nid_count) return -ENOMEM; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 29ef7088c558..13806f642ab5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2501,13 +2501,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) * + sit_i->sentries = kvzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + sit_i->dirty_sentries_bitmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; @@ -2540,7 +2540,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) * + sit_i->sec_entries = kvzalloc(MAIN_SECS(sbi) * sizeof(struct sec_entry), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; @@ -2591,12 +2591,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL); + free_i->free_segmap = kvmalloc(bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL); + free_i->free_secmap = kvmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -2764,7 +2764,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->victim_secmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; return 0; @@ -2786,7 +2786,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { - dirty_i->dirty_segmap[i] = 
f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->dirty_segmap[i] = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } diff --git a/fs/seq_file.c b/fs/seq_file.c index ca69fb99e41a..dc7c2be963ed 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -25,21 +25,7 @@ static void seq_set_overflow(struct seq_file *m) static void *seq_buf_alloc(unsigned long size) { - void *buf; - gfp_t gfp = GFP_KERNEL; - - /* - * For high order allocations, use __GFP_NORETRY to avoid oom-killing - - * it's better to fall back to vmalloc() than to kill things. For small - * allocations, just use GFP_KERNEL which will oom kill, thus no need - * for vmalloc fallback. - */ - if (size > PAGE_SIZE) - gfp |= __GFP_NORETRY | __GFP_NOWARN; - buf = kmalloc(size, gfp); - if (!buf && size > PAGE_SIZE) - buf = vmalloc(size); - return buf; + return kvmalloc(size, GFP_KERNEL); } /** diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d0250744507a..5d9b2a08e553 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -767,8 +767,6 @@ void kvm_arch_check_processor_compat(void *rtn); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); -void *kvm_kvzalloc(unsigned long size); - #ifndef __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 5d22e69f51ea..08e2849d27ca 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -518,6 +518,20 @@ static inline int is_vmalloc_or_module_addr(const void *x) } #endif +extern void *kvmalloc_node(size_t size, gfp_t flags, int node); +static inline void *kvmalloc(size_t size, gfp_t flags) +{ + return kvmalloc_node(size, flags, NUMA_NO_NODE); +} +static inline void *kvzalloc_node(size_t size, gfp_t flags, int node) +{ + return kvmalloc_node(size, flags | __GFP_ZERO, node); +} +static inline void *kvzalloc(size_t size, gfp_t flags) +{ + return kvmalloc(size, flags | __GFP_ZERO); +} + extern void kvfree(const void *addr); static inline atomic_t *compound_mapcount_ptr(struct page *page) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index d68edffbf142..46991ad3ddd5 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -80,6 +80,7 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller); +extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); diff --git a/ipc/util.c b/ipc/util.c index 3459a16a9df9..caec7b1bfaa3 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -403,12 +403,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) */ void *ipc_alloc(int size) { - void *out; - if (size > PAGE_SIZE) - out = vmalloc(size); - else - out = kmalloc(size, GFP_KERNEL); - return out; + return kvmalloc(size, GFP_KERNEL); } /** diff --git a/mm/nommu.c b/mm/nommu.c index 2d131b97a851..a80411d258fc 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -237,6 +237,11 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); +void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) +{ + return __vmalloc(size, flags, PAGE_KERNEL); +} + void *vmalloc_user(unsigned long size) { void *ret; diff --git a/mm/util.c b/mm/util.c index 656dc5e37a87..10a14a0ac3c2 100644 --- a/mm/util.c +++ b/mm/util.c @@ 
-329,6 +329,51 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); +/** + * kvmalloc_node - attempt to allocate physically contiguous memory, but upon + * failure, fall back to non-contiguous (vmalloc) allocation. + * @size: size of the request. + * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. + * @node: numa node to allocate from + * + * Uses kmalloc to get the memory but if the allocation fails then falls back + * to the vmalloc allocator. Use kvfree for freeing the memory. + * + * Reclaim modifiers - __GFP_NORETRY, __GFP_REPEAT and __GFP_NOFAIL are not supported + * + * Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people. + */ +void *kvmalloc_node(size_t size, gfp_t flags, int node) +{ + gfp_t kmalloc_flags = flags; + void *ret; + + /* + * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables) + * so the given set of flags has to be compatible. + */ + WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL); + + /* + * Make sure that larger requests are not too disruptive - no OOM + * killer and no allocation failure warnings as we have a fallback + */ + if (size > PAGE_SIZE) + kmalloc_flags |= __GFP_NORETRY | __GFP_NOWARN; + + ret = kmalloc_node(size, kmalloc_flags, node); + + /* + * It doesn't really make sense to fallback to vmalloc for sub page + * requests + */ + if (ret || size <= PAGE_SIZE) + return ret; + + return __vmalloc_node_flags(size, node, flags | __GFP_HIGHMEM); +} +EXPORT_SYMBOL(kvmalloc_node); + void kvfree(const void *addr) { if (is_vmalloc_addr(addr)) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b52aeed3f58e..33603239560e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1786,6 +1786,13 @@ fail: * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. + * + * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_REPEAT + * and __GFP_NOFAIL are not supported + * + * Any use of gfp flags outside of GFP_KERNEL should be consulted + * with mm people. 
+ * */ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, @@ -1802,7 +1809,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); -static inline void *__vmalloc_node_flags(unsigned long size, +void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) { return __vmalloc_node(size, 1, flags, PAGE_KERNEL, diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 41073f70eb41..be0b49897a67 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -98,7 +98,7 @@ static struct aa_loaddata *aa_simple_write_to_buffer(const char __user *userbuf, return ERR_PTR(-ESPIPE); /* freed by caller to simple_write_to_buffer */ - data = kvmalloc(sizeof(*data) + alloc_size); + data = kvmalloc(sizeof(*data) + alloc_size, GFP_KERNEL); if (data == NULL) return ERR_PTR(-ENOMEM); kref_init(&data->count); diff --git a/security/apparmor/include/lib.h b/security/apparmor/include/lib.h index 0291ff3902f9..550a700563b4 100644 --- a/security/apparmor/include/lib.h +++ b/security/apparmor/include/lib.h @@ -64,17 +64,6 @@ char *aa_split_fqname(char *args, char **ns_name); const char *aa_splitn_fqname(const char *fqname, size_t n, const char **ns_name, size_t *ns_len); void aa_info_message(const char *str); -void *__aa_kvmalloc(size_t size, gfp_t flags); - -static inline void *kvmalloc(size_t size) -{ - return __aa_kvmalloc(size, 0); -} - -static inline void *kvzalloc(size_t size) -{ - return __aa_kvmalloc(size, __GFP_ZERO); -} /** * aa_strneq - compare null terminated @str to a non null terminated substring diff --git a/security/apparmor/lib.c b/security/apparmor/lib.c index 32cafc12593e..7cd788a9445b 100644 --- a/security/apparmor/lib.c +++ b/security/apparmor/lib.c @@ -128,36 +128,6 @@ void aa_info_message(const char *str) printk(KERN_INFO "AppArmor: %s\n", str); } -/** - * __aa_kvmalloc - do allocation preferring kmalloc but falling back to vmalloc - * @size: how many bytes of memory are required - * @flags: the type of memory to allocate (see kmalloc). - * - * Return: allocated buffer or NULL if failed - * - * It is possible that policy being loaded from the user is larger than - * what can be allocated by kmalloc, in those cases fall back to vmalloc. 
- */ -void *__aa_kvmalloc(size_t size, gfp_t flags) -{ - void *buffer = NULL; - - if (size == 0) - return NULL; - - /* do not attempt kmalloc if we need more than 16 pages at once */ - if (size <= (16*PAGE_SIZE)) - buffer = kmalloc(size, flags | GFP_KERNEL | __GFP_NORETRY | - __GFP_NOWARN); - if (!buffer) { - if (flags & __GFP_ZERO) - buffer = vzalloc(size); - else - buffer = vmalloc(size); - } - return buffer; -} - /** * aa_policy_init - initialize a policy structure * @policy: policy to initialize (NOT NULL) diff --git a/security/apparmor/match.c b/security/apparmor/match.c index eb0efef746f5..960c913381e2 100644 --- a/security/apparmor/match.c +++ b/security/apparmor/match.c @@ -88,7 +88,7 @@ static struct table_header *unpack_table(char *blob, size_t bsize) if (bsize < tsize) goto out; - table = kvzalloc(tsize); + table = kvzalloc(tsize, GFP_KERNEL); if (table) { table->td_id = th.td_id; table->td_flags = th.td_flags; diff --git a/security/apparmor/policy_unpack.c b/security/apparmor/policy_unpack.c index 2e37c9c26bbd..f3422a91353c 100644 --- a/security/apparmor/policy_unpack.c +++ b/security/apparmor/policy_unpack.c @@ -487,7 +487,7 @@ fail: static void *kvmemdup(const void *src, size_t len) { - void *p = kvmalloc(len); + void *p = kvmalloc(len, GFP_KERNEL); if (p) memcpy(p, src, len); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 88257b311cb5..aca22d36be9c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -504,7 +504,7 @@ static struct kvm_memslots *kvm_alloc_memslots(void) int i; struct kvm_memslots *slots; - slots = kvm_kvzalloc(sizeof(struct kvm_memslots)); + slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); if (!slots) return NULL; @@ -689,18 +689,6 @@ out_err_no_disable: return ERR_PTR(r); } -/* - * Avoid using vmalloc for a small buffer. - * Should not be used when the size is statically known. - */ -void *kvm_kvzalloc(unsigned long size) -{ - if (size > PAGE_SIZE) - return vzalloc(size); - else - return kzalloc(size, GFP_KERNEL); -} - static void kvm_destroy_devices(struct kvm *kvm) { struct kvm_device *dev, *tmp; @@ -782,7 +770,7 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) { unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); - memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes); + memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL); if (!memslot->dirty_bitmap) return -ENOMEM; @@ -1008,7 +996,7 @@ int __kvm_set_memory_region(struct kvm *kvm, goto out_free; } - slots = kvm_kvzalloc(sizeof(struct kvm_memslots)); + slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); if (!slots) goto out_free; memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots)); -- cgit v1.2.3 From 1f5307b1e094bfffa83c65c40ac6e3415c108780 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 May 2017 15:57:12 -0700 Subject: mm, vmalloc: properly track vmalloc users __vmalloc_node_flags used to be static inline, but this was changed by "mm: introduce kv[mz]alloc helpers" because kvmalloc_node needs to use it as well and that code is outside of the vmalloc proper. I hadn't realized that this change would lead to a subtle bug, though. The function is also responsible for tracking the caller, and this caller is then printed by /proc/vmallocinfo. If __vmalloc_node_flags is not inline, then we get only direct users of __vmalloc_node_flags as callers (e.g. v[mz]alloc), which reduces the usefulness of this debugging feature considerably.
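(To see why inlining matters for caller tracking, here is a simplified sketch, not the exact kernel code; __do_alloc_tracked is a made-up stand-in for the real __vmalloc_node:)

extern void *__do_alloc_tracked(unsigned long size, const void *caller);

/*
 * Because the wrapper is static inline, __builtin_return_address(0)
 * is evaluated in each caller's own frame, so the recorded address is
 * the real call site rather than the wrapper itself.
 */
static inline void *tracked_alloc(unsigned long size)
{
	return __do_alloc_tracked(size, __builtin_return_address(0));
}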
It simply doesn't help to see that the given range belongs to vmalloc as a caller: 0xffffc90002c79000-0xffffc90002c7d000 16384 vmalloc+0x16/0x18 pages=3 vmalloc N0=3 0xffffc90002c81000-0xffffc90002c85000 16384 vmalloc+0x16/0x18 pages=3 vmalloc N1=3 0xffffc90002c8d000-0xffffc90002c91000 16384 vmalloc+0x16/0x18 pages=3 vmalloc N1=3 0xffffc90002c95000-0xffffc90002c99000 16384 vmalloc+0x16/0x18 pages=3 vmalloc N1=3 We really want to catch the _caller_ of the vmalloc function. Fix this issue by making __vmalloc_node_flags static inline again. Link: http://lkml.kernel.org/r/20170502134657.12381-1-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 19 +++++++++++++++++++ mm/vmalloc.c | 12 +----------- 2 files changed, 20 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 46991ad3ddd5..0328ce003992 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -6,6 +6,7 @@ #include <linux/list.h> #include <linux/llist.h> #include <asm/page.h> /* pgprot_t */ +#include <asm/pgtable.h> /* PAGE_KERNEL */ #include <linux/rbtree.h> struct vm_area_struct; /* vma defining user mapping in mm_types.h */ @@ -80,7 +81,25 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller); +#ifndef CONFIG_MMU extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); +#else +extern void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, + int node, const void *caller); + +/* + * We really want to have this inlined due to caller tracking. This + * function is used by the highlevel vmalloc apis and so we want to track + * their callers and inlining will achieve that. + */ +static inline void *__vmalloc_node_flags(unsigned long size, + int node, gfp_t flags) +{ + return __vmalloc_node(size, 1, flags, PAGE_KERNEL, + node, __builtin_return_address(0)); +} +#endif extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 33603239560e..717b1e8b942c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1649,9 +1649,6 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, const void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node) { @@ -1794,7 +1791,7 @@ fail: * with mm people. * */ -static void *__vmalloc_node(unsigned long size, unsigned long align, +void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, int node, const void *caller) { @@ -1809,13 +1806,6 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); -void *__vmalloc_node_flags(unsigned long size, - int node, gfp_t flags) -{ - return __vmalloc_node(size, 1, flags, PAGE_KERNEL, - node, __builtin_return_address(0)); -} - /** * vmalloc - allocate virtually contiguous memory * @size: allocation size -- cgit v1.2.3 From 752ade68cbd81d0321dfecc188f655a945551b25 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 May 2017 15:57:27 -0700 Subject: treewide: use kv[mz]alloc* rather than opencoded variants There are many code paths opencoding kvmalloc. Let's use the helper instead.
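(A typical conversion in this series looks like the following sketch; the helpers and names are hypothetical, but compare e.g. the inet_ehash_locks_alloc() hunk further below:)

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

/* Before: open-coded fallback; the vmalloc path multiplies unchecked. */
static unsigned long *alloc_table_old(size_t n)
{
	unsigned long *tbl;

	tbl = kmalloc_array(n, sizeof(*tbl), GFP_KERNEL | __GFP_NOWARN);
	if (!tbl)
		tbl = vmalloc(n * sizeof(*tbl));
	return tbl;
}

/* After: one conservative, overflow-checked helper; kvfree() either way. */
static unsigned long *alloc_table_new(size_t n)
{
	return kvmalloc_array(n, sizeof(unsigned long), GFP_KERNEL);
}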
The main difference to kvmalloc is that those users usually do not consider all the aspects of the memory allocator. E.g. allocation requests <= 32kB (with 4kB pages) basically never fail and will invoke the OOM killer to satisfy the allocation. This sounds too disruptive for something that has a reasonable fallback - vmalloc. On the other hand those requests might fall back to vmalloc even when the memory allocator would have succeeded after several more reclaim/compaction attempts. There is no guarantee something like that happens, though. This patch converts many of those places to kv[mz]alloc* helpers because they are more conservative. Link: http://lkml.kernel.org/r/20170306103327.2766-2-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Boris Ostrovsky # Xen bits Acked-by: Kees Cook Acked-by: Vlastimil Babka Acked-by: Andreas Dilger # Lustre Acked-by: Christian Borntraeger # KVM/s390 Acked-by: Dan Williams # nvdim Acked-by: David Sterba # btrfs Acked-by: Ilya Dryomov # Ceph Acked-by: Tariq Toukan # mlx4 Acked-by: Leon Romanovsky # mlx5 Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Herbert Xu Cc: Anton Vorontsov Cc: Colin Cross Cc: Tony Luck Cc: "Rafael J. Wysocki" Cc: Ben Skeggs Cc: Kent Overstreet Cc: Santosh Raspatur Cc: Hariprasad S Cc: Yishai Hadas Cc: Oleg Drokin Cc: "Yan, Zheng" Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Eric Dumazet Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kvm/kvm-s390.c | 10 ++----- crypto/lzo.c | 4 +-- drivers/acpi/apei/erst.c | 8 ++---- drivers/char/agp/generic.c | 8 +----- drivers/gpu/drm/nouveau/nouveau_gem.c | 4 +-- drivers/md/bcache/util.h | 12 ++------ drivers/net/ethernet/chelsio/cxgb3/cxgb3_defs.h | 3 -- drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c | 29 +++---------------- drivers/net/ethernet/chelsio/cxgb3/l2t.c | 8 +----- drivers/net/ethernet/chelsio/cxgb3/l2t.h | 1 - drivers/net/ethernet/chelsio/cxgb4/clip_tbl.c | 12 ++++---- drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 3 -- drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c | 10 +++---- drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 8 +++--- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 31 ++++---------------- drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c | 14 ++++----- drivers/net/ethernet/chelsio/cxgb4/l2t.c | 2 +- drivers/net/ethernet/chelsio/cxgb4/sched.c | 12 ++++---- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 9 ++---- drivers/net/ethernet/mellanox/mlx4/mr.c | 9 ++---- drivers/nvdimm/dimm_devs.c | 5 +--- .../staging/lustre/lnet/libcfs/linux/linux-mem.c | 11 +------- drivers/xen/evtchn.c | 14 +-------- fs/btrfs/ctree.c | 9 ++---- fs/btrfs/ioctl.c | 9 ++---- fs/btrfs/send.c | 27 ++++++------------ fs/ceph/file.c | 9 ++---- fs/select.c | 5 +--- fs/xattr.c | 27 ++++++------------ include/linux/mlx5/driver.h | 7 +---- include/linux/mm.h | 8 ++++++ lib/iov_iter.c | 5 +--- mm/frame_vector.c | 5 +--- net/ipv4/inet_hashtables.c | 6 +--- net/ipv4/tcp_metrics.c | 5 +--- net/mpls/af_mpls.c | 5 +--- net/netfilter/x_tables.c | 21 +++----------- net/netfilter/xt_recent.c | 5 +--- net/sched/sch_choke.c | 5 +--- net/sched/sch_fq_codel.c | 26 ++++------------- net/sched/sch_hhf.c | 33 ++++++---------------- net/sched/sch_netem.c | 6 +--- net/sched/sch_sfq.c | 6 +--- security/keys/keyctl.c | 22 ++++----------- 44 files changed, 128 insertions(+), 350 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d5c5c911821a..323297e55e80 ---
a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1166,10 +1166,7 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) return -EINVAL; - keys = kmalloc_array(args->count, sizeof(uint8_t), - GFP_KERNEL | __GFP_NOWARN); - if (!keys) - keys = vmalloc(sizeof(uint8_t) * args->count); + keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL); if (!keys) return -ENOMEM; @@ -1211,10 +1208,7 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) return -EINVAL; - keys = kmalloc_array(args->count, sizeof(uint8_t), - GFP_KERNEL | __GFP_NOWARN); - if (!keys) - keys = vmalloc(sizeof(uint8_t) * args->count); + keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL); if (!keys) return -ENOMEM; diff --git a/crypto/lzo.c b/crypto/lzo.c index 168df784da84..218567d717d6 100644 --- a/crypto/lzo.c +++ b/crypto/lzo.c @@ -32,9 +32,7 @@ static void *lzo_alloc_ctx(struct crypto_scomp *tfm) { void *ctx; - ctx = kmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL | __GFP_NOWARN); - if (!ctx) - ctx = vmalloc(LZO1X_MEM_COMPRESS); + ctx = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM); diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index 7207e5fc9d3d..2c462beee551 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -513,7 +513,7 @@ retry: if (i < erst_record_id_cache.len) goto retry; if (erst_record_id_cache.len >= erst_record_id_cache.size) { - int new_size, alloc_size; + int new_size; u64 *new_entries; new_size = erst_record_id_cache.size * 2; @@ -524,11 +524,7 @@ retry: pr_warn(FW_WARN "too many record IDs!\n"); return 0; } - alloc_size = new_size * sizeof(entries[0]); - if (alloc_size < PAGE_SIZE) - new_entries = kmalloc(alloc_size, GFP_KERNEL); - else - new_entries = vmalloc(alloc_size); + new_entries = kvmalloc(new_size * sizeof(entries[0]), GFP_KERNEL); if (!new_entries) return -ENOMEM; memcpy(new_entries, entries, diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c index f002fa5d1887..bdf418cac8ef 100644 --- a/drivers/char/agp/generic.c +++ b/drivers/char/agp/generic.c @@ -88,13 +88,7 @@ static int agp_get_key(void) void agp_alloc_page_array(size_t size, struct agp_memory *mem) { - mem->pages = NULL; - - if (size <= 2*PAGE_SIZE) - mem->pages = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (mem->pages == NULL) { - mem->pages = vmalloc(size); - } + mem->pages = kvmalloc(size, GFP_KERNEL); } EXPORT_SYMBOL(agp_alloc_page_array); diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c index ca5397beb357..2170534101ca 100644 --- a/drivers/gpu/drm/nouveau/nouveau_gem.c +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c @@ -568,9 +568,7 @@ u_memcpya(uint64_t user, unsigned nmemb, unsigned size) size *= nmemb; - mem = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!mem) - mem = vmalloc(size); + mem = kvmalloc(size, GFP_KERNEL); if (!mem) return ERR_PTR(-ENOMEM); diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 5d13930f0f22..cb8d2ccbb6c6 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -43,11 +43,7 @@ struct closure; (heap)->used = 0; \ (heap)->size = (_size); \ _bytes = (heap)->size * sizeof(*(heap)->data); \ - (heap)->data = NULL; \ - if (_bytes < KMALLOC_MAX_SIZE) \ - (heap)->data = kmalloc(_bytes, (gfp)); \ - if ((!(heap)->data) && ((gfp) & GFP_KERNEL)) \ - (heap)->data = 
vmalloc(_bytes); \ + (heap)->data = kvmalloc(_bytes, (gfp) & GFP_KERNEL); \ (heap)->data; \ }) @@ -136,12 +132,8 @@ do { \ \ (fifo)->mask = _allocated_size - 1; \ (fifo)->front = (fifo)->back = 0; \ - (fifo)->data = NULL; \ \ - if (_bytes < KMALLOC_MAX_SIZE) \ - (fifo)->data = kmalloc(_bytes, (gfp)); \ - if ((!(fifo)->data) && ((gfp) & GFP_KERNEL)) \ - (fifo)->data = vmalloc(_bytes); \ + (fifo)->data = kvmalloc(_bytes, (gfp) & GFP_KERNEL); \ (fifo)->data; \ }) diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_defs.h b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_defs.h index 920d918ed193..f04e81f33795 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_defs.h +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_defs.h @@ -41,9 +41,6 @@ #define VALIDATE_TID 1 -void *cxgb_alloc_mem(unsigned long size); -void cxgb_free_mem(void *addr); - /* * Map an ATID or STID to their entries in the corresponding TID tables. */ diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c index 76684dcb874c..fa81445e334c 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_offload.c @@ -1151,27 +1151,6 @@ static void cxgb_redirect(struct dst_entry *old, struct dst_entry *new, l2t_release(tdev, e); } -/* - * Allocate a chunk of memory using kmalloc or, if that fails, vmalloc. - * The allocated memory is cleared. - */ -void *cxgb_alloc_mem(unsigned long size) -{ - void *p = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); - - if (!p) - p = vzalloc(size); - return p; -} - -/* - * Free memory allocated through t3_alloc_mem(). - */ -void cxgb_free_mem(void *addr) -{ - kvfree(addr); -} - /* * Allocate and initialize the TID tables. Returns 0 on success. */ @@ -1182,7 +1161,7 @@ static int init_tid_tabs(struct tid_info *t, unsigned int ntids, unsigned long size = ntids * sizeof(*t->tid_tab) + natids * sizeof(*t->atid_tab) + nstids * sizeof(*t->stid_tab); - t->tid_tab = cxgb_alloc_mem(size); + t->tid_tab = kvzalloc(size, GFP_KERNEL); if (!t->tid_tab) return -ENOMEM; @@ -1218,7 +1197,7 @@ static int init_tid_tabs(struct tid_info *t, unsigned int ntids, static void free_tid_maps(struct tid_info *t) { - cxgb_free_mem(t->tid_tab); + kvfree(t->tid_tab); } static inline void add_adapter(struct adapter *adap) @@ -1293,7 +1272,7 @@ int cxgb3_offload_activate(struct adapter *adapter) return 0; out_free_l2t: - t3_free_l2t(l2td); + kvfree(l2td); out_free: kfree(t); return err; @@ -1302,7 +1281,7 @@ out_free: static void clean_l2_data(struct rcu_head *head) { struct l2t_data *d = container_of(head, struct l2t_data, rcu_head); - t3_free_l2t(d); + kvfree(d); } diff --git a/drivers/net/ethernet/chelsio/cxgb3/l2t.c b/drivers/net/ethernet/chelsio/cxgb3/l2t.c index 52063587e1e9..26264125865f 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/l2t.c +++ b/drivers/net/ethernet/chelsio/cxgb3/l2t.c @@ -444,7 +444,7 @@ struct l2t_data *t3_init_l2t(unsigned int l2t_capacity) struct l2t_data *d; int i, size = sizeof(*d) + l2t_capacity * sizeof(struct l2t_entry); - d = cxgb_alloc_mem(size); + d = kvzalloc(size, GFP_KERNEL); if (!d) return NULL; @@ -462,9 +462,3 @@ struct l2t_data *t3_init_l2t(unsigned int l2t_capacity) } return d; } - -void t3_free_l2t(struct l2t_data *d) -{ - cxgb_free_mem(d); -} - diff --git a/drivers/net/ethernet/chelsio/cxgb3/l2t.h b/drivers/net/ethernet/chelsio/cxgb3/l2t.h index 8cffcdfd5678..c2fd323c4078 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/l2t.h +++ b/drivers/net/ethernet/chelsio/cxgb3/l2t.h @@ -115,7 +115,6 
@@ int t3_l2t_send_slow(struct t3cdev *dev, struct sk_buff *skb, struct l2t_entry *e); void t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e); struct l2t_data *t3_init_l2t(unsigned int l2t_capacity); -void t3_free_l2t(struct l2t_data *d); int cxgb3_ofld_send(struct t3cdev *dev, struct sk_buff *skb); diff --git a/drivers/net/ethernet/chelsio/cxgb4/clip_tbl.c b/drivers/net/ethernet/chelsio/cxgb4/clip_tbl.c index 7ad43af6bde1..3103ef9b561d 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/clip_tbl.c +++ b/drivers/net/ethernet/chelsio/cxgb4/clip_tbl.c @@ -290,8 +290,8 @@ struct clip_tbl *t4_init_clip_tbl(unsigned int clipt_start, if (clipt_size < CLIPT_MIN_HASH_BUCKETS) return NULL; - ctbl = t4_alloc_mem(sizeof(*ctbl) + - clipt_size*sizeof(struct list_head)); + ctbl = kvzalloc(sizeof(*ctbl) + + clipt_size*sizeof(struct list_head), GFP_KERNEL); if (!ctbl) return NULL; @@ -305,9 +305,9 @@ struct clip_tbl *t4_init_clip_tbl(unsigned int clipt_start, for (i = 0; i < ctbl->clipt_size; ++i) INIT_LIST_HEAD(&ctbl->hash_list[i]); - cl_list = t4_alloc_mem(clipt_size*sizeof(struct clip_entry)); + cl_list = kvzalloc(clipt_size*sizeof(struct clip_entry), GFP_KERNEL); if (!cl_list) { - t4_free_mem(ctbl); + kvfree(ctbl); return NULL; } ctbl->cl_list = (void *)cl_list; @@ -326,8 +326,8 @@ void t4_cleanup_clip_tbl(struct adapter *adap) if (ctbl) { if (ctbl->cl_list) - t4_free_mem(ctbl->cl_list); - t4_free_mem(ctbl); + kvfree(ctbl->cl_list); + kvfree(ctbl); } } EXPORT_SYMBOL(t4_cleanup_clip_tbl); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index 163543b1ea0b..1d2be2dd19dd 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -1184,8 +1184,6 @@ extern const char cxgb4_driver_version[]; void t4_os_portmod_changed(const struct adapter *adap, int port_id); void t4_os_link_changed(struct adapter *adap, int port_id, int link_stat); -void *t4_alloc_mem(size_t size); - void t4_free_sge_resources(struct adapter *adap); void t4_free_ofld_rxqs(struct adapter *adap, int n, struct sge_ofld_rxq *q); irq_handler_t t4_intr_handler(struct adapter *adap); @@ -1557,7 +1555,6 @@ int t4_sched_params(struct adapter *adapter, int type, int level, int mode, int rateunit, int ratemode, int channel, int class, int minrate, int maxrate, int weight, int pktsize); void t4_sge_decode_idma_state(struct adapter *adapter, int state); -void t4_free_mem(void *addr); void t4_idma_monitor_init(struct adapter *adapter, struct sge_idma_monitor_state *idma); void t4_idma_monitor(struct adapter *adapter, diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c index f6e739da7bb7..1fa34b009891 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c @@ -2634,7 +2634,7 @@ static ssize_t mem_read(struct file *file, char __user *buf, size_t count, if (count > avail - pos) count = avail - pos; - data = t4_alloc_mem(count); + data = kvzalloc(count, GFP_KERNEL); if (!data) return -ENOMEM; @@ -2642,12 +2642,12 @@ static ssize_t mem_read(struct file *file, char __user *buf, size_t count, ret = t4_memory_rw(adap, 0, mem, pos, count, data, T4_MEMORY_READ); spin_unlock(&adap->win0_lock); if (ret) { - t4_free_mem(data); + kvfree(data); return ret; } ret = copy_to_user(buf, data, count); - t4_free_mem(data); + kvfree(data); if (ret) return -EFAULT; @@ -2753,7 +2753,7 @@ static ssize_t blocked_fl_read(struct file *filp, char 
__user *ubuf, adap->sge.egr_sz, adap->sge.blocked_fl); len += sprintf(buf + len, "\n"); size = simple_read_from_buffer(ubuf, count, ppos, buf, len); - t4_free_mem(buf); + kvfree(buf); return size; } @@ -2773,7 +2773,7 @@ static ssize_t blocked_fl_write(struct file *filp, const char __user *ubuf, return err; bitmap_copy(adap->sge.blocked_fl, t, adap->sge.egr_sz); - t4_free_mem(t); + kvfree(t); return count; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c index 02f80febeb91..0ba7866c8259 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c @@ -969,7 +969,7 @@ static int get_eeprom(struct net_device *dev, struct ethtool_eeprom *e, { int i, err = 0; struct adapter *adapter = netdev2adap(dev); - u8 *buf = t4_alloc_mem(EEPROMSIZE); + u8 *buf = kvzalloc(EEPROMSIZE, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -980,7 +980,7 @@ static int get_eeprom(struct net_device *dev, struct ethtool_eeprom *e, if (!err) memcpy(data, buf + e->offset, e->len); - t4_free_mem(buf); + kvfree(buf); return err; } @@ -1009,7 +1009,7 @@ static int set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, if (aligned_offset != eeprom->offset || aligned_len != eeprom->len) { /* RMW possibly needed for first or last words. */ - buf = t4_alloc_mem(aligned_len); + buf = kvzalloc(aligned_len, GFP_KERNEL); if (!buf) return -ENOMEM; err = eeprom_rd_phys(adapter, aligned_offset, (u32 *)buf); @@ -1037,7 +1037,7 @@ static int set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, err = t4_seeprom_wp(adapter, true); out: if (buf != data) - t4_free_mem(buf); + kvfree(buf); return err; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index c12c4a3b82b5..38a5c6764bb5 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -880,27 +880,6 @@ freeout: return err; } -/* - * Allocate a chunk of memory using kmalloc or, if that fails, vmalloc. - * The allocated memory is cleared. - */ -void *t4_alloc_mem(size_t size) -{ - void *p = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); - - if (!p) - p = vzalloc(size); - return p; -} - -/* - * Free memory allocated through alloc_mem(). 
- */ -void t4_free_mem(void *addr) -{ - kvfree(addr); -} - static u16 cxgb_select_queue(struct net_device *dev, struct sk_buff *skb, void *accel_priv, select_queue_fallback_t fallback) { @@ -1299,7 +1278,7 @@ static int tid_init(struct tid_info *t) max_ftids * sizeof(*t->ftid_tab) + ftid_bmap_size * sizeof(long); - t->tid_tab = t4_alloc_mem(size); + t->tid_tab = kvzalloc(size, GFP_KERNEL); if (!t->tid_tab) return -ENOMEM; @@ -3445,7 +3424,7 @@ static int adap_init0(struct adapter *adap) /* allocate memory to read the header of the firmware on the * card */ - card_fw = t4_alloc_mem(sizeof(*card_fw)); + card_fw = kvzalloc(sizeof(*card_fw), GFP_KERNEL); /* Get FW from from /lib/firmware/ */ ret = request_firmware(&fw, fw_info->fw_mod_name, @@ -3465,7 +3444,7 @@ static int adap_init0(struct adapter *adap) /* Cleaning up */ release_firmware(fw); - t4_free_mem(card_fw); + kvfree(card_fw); if (ret < 0) goto bye; @@ -4470,9 +4449,9 @@ static void free_some_resources(struct adapter *adapter) { unsigned int i; - t4_free_mem(adapter->l2t); + kvfree(adapter->l2t); t4_cleanup_sched(adapter); - t4_free_mem(adapter->tids.tid_tab); + kvfree(adapter->tids.tid_tab); cxgb4_cleanup_tc_u32(adapter); kfree(adapter->sge.egr_map); kfree(adapter->sge.ingr_map); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c index a1b19422b339..ef06ce8247ab 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c @@ -432,9 +432,9 @@ void cxgb4_cleanup_tc_u32(struct adapter *adap) for (i = 0; i < t->size; i++) { struct cxgb4_link *link = &t->table[i]; - t4_free_mem(link->tid_map); + kvfree(link->tid_map); } - t4_free_mem(adap->tc_u32); + kvfree(adap->tc_u32); } struct cxgb4_tc_u32_table *cxgb4_init_tc_u32(struct adapter *adap) @@ -446,8 +446,8 @@ struct cxgb4_tc_u32_table *cxgb4_init_tc_u32(struct adapter *adap) if (!max_tids) return NULL; - t = t4_alloc_mem(sizeof(*t) + - (max_tids * sizeof(struct cxgb4_link))); + t = kvzalloc(sizeof(*t) + + (max_tids * sizeof(struct cxgb4_link)), GFP_KERNEL); if (!t) return NULL; @@ -458,7 +458,7 @@ struct cxgb4_tc_u32_table *cxgb4_init_tc_u32(struct adapter *adap) unsigned int bmap_size; bmap_size = BITS_TO_LONGS(max_tids); - link->tid_map = t4_alloc_mem(sizeof(unsigned long) * bmap_size); + link->tid_map = kvzalloc(sizeof(unsigned long) * bmap_size, GFP_KERNEL); if (!link->tid_map) goto out_no_mem; bitmap_zero(link->tid_map, max_tids); @@ -471,11 +471,11 @@ out_no_mem: struct cxgb4_link *link = &t->table[i]; if (link->tid_map) - t4_free_mem(link->tid_map); + kvfree(link->tid_map); } if (t) - t4_free_mem(t); + kvfree(t); return NULL; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/l2t.c b/drivers/net/ethernet/chelsio/cxgb4/l2t.c index 7c8c5b9a3c22..6f3692db29af 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/l2t.c +++ b/drivers/net/ethernet/chelsio/cxgb4/l2t.c @@ -646,7 +646,7 @@ struct l2t_data *t4_init_l2t(unsigned int l2t_start, unsigned int l2t_end) if (l2t_size < L2T_MIN_HASH_BUCKETS) return NULL; - d = t4_alloc_mem(sizeof(*d) + l2t_size * sizeof(struct l2t_entry)); + d = kvzalloc(sizeof(*d) + l2t_size * sizeof(struct l2t_entry), GFP_KERNEL); if (!d) return NULL; diff --git a/drivers/net/ethernet/chelsio/cxgb4/sched.c b/drivers/net/ethernet/chelsio/cxgb4/sched.c index c9026352a842..02acff741f11 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sched.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sched.c @@ -177,7 +177,7 @@ static int t4_sched_queue_unbind(struct 
port_info *pi, struct ch_sched_queue *p) } list_del(&qe->list); - t4_free_mem(qe); + kvfree(qe); if (atomic_dec_and_test(&e->refcnt)) { e->state = SCHED_STATE_UNUSED; memset(&e->info, 0, sizeof(e->info)); @@ -201,7 +201,7 @@ static int t4_sched_queue_bind(struct port_info *pi, struct ch_sched_queue *p) if (p->queue < 0 || p->queue >= pi->nqsets) return -ERANGE; - qe = t4_alloc_mem(sizeof(struct sched_queue_entry)); + qe = kvzalloc(sizeof(struct sched_queue_entry), GFP_KERNEL); if (!qe) return -ENOMEM; @@ -211,7 +211,7 @@ static int t4_sched_queue_bind(struct port_info *pi, struct ch_sched_queue *p) /* Unbind queue from any existing class */ err = t4_sched_queue_unbind(pi, p); if (err) { - t4_free_mem(qe); + kvfree(qe); goto out; } @@ -224,7 +224,7 @@ static int t4_sched_queue_bind(struct port_info *pi, struct ch_sched_queue *p) spin_lock(&e->lock); err = t4_sched_bind_unbind_op(pi, (void *)qe, SCHED_QUEUE, true); if (err) { - t4_free_mem(qe); + kvfree(qe); spin_unlock(&e->lock); goto out; } @@ -512,7 +512,7 @@ struct sched_table *t4_init_sched(unsigned int sched_size) struct sched_table *s; unsigned int i; - s = t4_alloc_mem(sizeof(*s) + sched_size * sizeof(struct sched_class)); + s = kvzalloc(sizeof(*s) + sched_size * sizeof(struct sched_class), GFP_KERNEL); if (!s) return NULL; @@ -548,6 +548,6 @@ void t4_cleanup_sched(struct adapter *adap) t4_sched_class_free(pi, e); write_unlock(&s->rw_lock); } - t4_free_mem(s); + kvfree(s); } } diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 3ba89bc43d74..6ffd1849a604 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -70,13 +70,10 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, ring->full_size = ring->size - HEADROOM - MAX_DESC_TXBBS; tmp = size * sizeof(struct mlx4_en_tx_info); - ring->tx_info = kmalloc_node(tmp, GFP_KERNEL | __GFP_NOWARN, node); + ring->tx_info = kvmalloc_node(tmp, GFP_KERNEL, node); if (!ring->tx_info) { - ring->tx_info = vmalloc(tmp); - if (!ring->tx_info) { - err = -ENOMEM; - goto err_ring; - } + err = -ENOMEM; + goto err_ring; } en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n", diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c index db65f72879e9..ce852ca22a96 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mr.c +++ b/drivers/net/ethernet/mellanox/mlx4/mr.c @@ -115,12 +115,9 @@ static int mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order) for (i = 0; i <= buddy->max_order; ++i) { s = BITS_TO_LONGS(1 << (buddy->max_order - i)); - buddy->bits[i] = kcalloc(s, sizeof (long), GFP_KERNEL | __GFP_NOWARN); - if (!buddy->bits[i]) { - buddy->bits[i] = vzalloc(s * sizeof(long)); - if (!buddy->bits[i]) - goto err_out_free; - } + buddy->bits[i] = kvmalloc_array(s, sizeof(long), GFP_KERNEL | __GFP_ZERO); + if (!buddy->bits[i]) + goto err_out_free; } set_bit(0, buddy->bits[buddy->max_order]); diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c index fac1e9fbd11d..9852a3355509 100644 --- a/drivers/nvdimm/dimm_devs.c +++ b/drivers/nvdimm/dimm_devs.c @@ -106,10 +106,7 @@ int nvdimm_init_config_data(struct nvdimm_drvdata *ndd) return -ENXIO; } - ndd->data = kmalloc(ndd->nsarea.config_size, GFP_KERNEL); - if (!ndd->data) - ndd->data = vmalloc(ndd->nsarea.config_size); - + ndd->data = kvmalloc(ndd->nsarea.config_size, GFP_KERNEL); if (!ndd->data) return -ENOMEM; diff --git a/drivers/staging/lustre/lnet/libcfs/linux/linux-mem.c 
b/drivers/staging/lustre/lnet/libcfs/linux/linux-mem.c index a6a76a681ea9..8f638267e704 100644 --- a/drivers/staging/lustre/lnet/libcfs/linux/linux-mem.c +++ b/drivers/staging/lustre/lnet/libcfs/linux/linux-mem.c @@ -45,15 +45,6 @@ EXPORT_SYMBOL(libcfs_kvzalloc); void *libcfs_kvzalloc_cpt(struct cfs_cpt_table *cptab, int cpt, size_t size, gfp_t flags) { - void *ret; - - ret = kzalloc_node(size, flags | __GFP_NOWARN, - cfs_cpt_spread_node(cptab, cpt)); - if (!ret) { - WARN_ON(!(flags & (__GFP_FS | __GFP_HIGH))); - ret = vmalloc_node(size, cfs_cpt_spread_node(cptab, cpt)); - } - - return ret; + return kvzalloc_node(size, flags, cfs_cpt_spread_node(cptab, cpt)); } EXPORT_SYMBOL(libcfs_kvzalloc_cpt); diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index 6890897a6f30..10f1ef582659 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -87,18 +87,6 @@ struct user_evtchn { bool enabled; }; -static evtchn_port_t *evtchn_alloc_ring(unsigned int size) -{ - evtchn_port_t *ring; - size_t s = size * sizeof(*ring); - - ring = kmalloc(s, GFP_KERNEL); - if (!ring) - ring = vmalloc(s); - - return ring; -} - static void evtchn_free_ring(evtchn_port_t *ring) { kvfree(ring); @@ -334,7 +322,7 @@ static int evtchn_resize_ring(struct per_user_data *u) else new_size = 2 * u->ring_size; - new_ring = evtchn_alloc_ring(new_size); + new_ring = kvmalloc(new_size * sizeof(*new_ring), GFP_KERNEL); if (!new_ring) return -ENOMEM; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7dc8844037e0..1c3b6c54d5ee 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5392,13 +5392,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } - tmp_buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN); + tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); if (!tmp_buf) { - tmp_buf = vmalloc(fs_info->nodesize); - if (!tmp_buf) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } left_path->search_commit_root = 1; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dabfc7ac48a6..922a66fce401 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3539,12 +3539,9 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u64 last_dest_end = destoff; ret = -ENOMEM; - buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN); - if (!buf) { - buf = vmalloc(fs_info->nodesize); - if (!buf) - return ret; - } + buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); + if (!buf) + return ret; path = btrfs_alloc_path(); if (!path) { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a60d5bfb8a49..3f645cd67b54 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6360,22 +6360,16 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->clone_roots_cnt = arg->clone_sources_count; sctx->send_max_size = BTRFS_SEND_BUF_SIZE; - sctx->send_buf = kmalloc(sctx->send_max_size, GFP_KERNEL | __GFP_NOWARN); + sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); if (!sctx->send_buf) { - sctx->send_buf = vmalloc(sctx->send_max_size); - if (!sctx->send_buf) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } - sctx->read_buf = kmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL | __GFP_NOWARN); + sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL); if (!sctx->read_buf) { - sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); - if (!sctx->read_buf) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } sctx->pending_dir_moves = RB_ROOT; @@ -6396,13 +6390,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) alloc_size = arg->clone_sources_count 
* sizeof(*arg->clone_sources); if (arg->clone_sources_count) { - clone_sources_tmp = kmalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); + clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL); if (!clone_sources_tmp) { - clone_sources_tmp = vmalloc(alloc_size); - if (!clone_sources_tmp) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } ret = copy_from_user(clone_sources_tmp, arg->clone_sources, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 26cc95421cca..18c045e2ead6 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -74,12 +74,9 @@ dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes, align = (unsigned long)(it->iov->iov_base + it->iov_offset) & (PAGE_SIZE - 1); npages = calc_pages_for(align, nbytes); - pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL); - if (!pages) { - pages = vmalloc(sizeof(*pages) * npages); - if (!pages) - return ERR_PTR(-ENOMEM); - } + pages = kvmalloc(sizeof(*pages) * npages, GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); for (idx = 0; idx < npages; ) { size_t start; diff --git a/fs/select.c b/fs/select.c index bd4b2ccfd346..d6c652a31e99 100644 --- a/fs/select.c +++ b/fs/select.c @@ -633,10 +633,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, goto out_nofds; alloc_size = 6 * size; - bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN); - if (!bits && alloc_size > PAGE_SIZE) - bits = vmalloc(alloc_size); - + bits = kvmalloc(alloc_size, GFP_KERNEL); if (!bits) goto out_nofds; } diff --git a/fs/xattr.c b/fs/xattr.c index 94f49a082dd2..464c94bf65f9 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -431,12 +431,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, if (size) { if (size > XATTR_SIZE_MAX) return -E2BIG; - kvalue = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!kvalue) { - kvalue = vmalloc(size); - if (!kvalue) - return -ENOMEM; - } + kvalue = kvmalloc(size, GFP_KERNEL); + if (!kvalue) + return -ENOMEM; if (copy_from_user(kvalue, value, size)) { error = -EFAULT; goto out; @@ -528,12 +525,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, if (size) { if (size > XATTR_SIZE_MAX) size = XATTR_SIZE_MAX; - kvalue = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!kvalue) { - kvalue = vzalloc(size); - if (!kvalue) - return -ENOMEM; - } + kvalue = kvzalloc(size, GFP_KERNEL); + if (!kvalue) + return -ENOMEM; } error = vfs_getxattr(d, kname, kvalue, size); @@ -611,12 +605,9 @@ listxattr(struct dentry *d, char __user *list, size_t size) if (size) { if (size > XATTR_LIST_MAX) size = XATTR_LIST_MAX; - klist = kmalloc(size, __GFP_NOWARN | GFP_KERNEL); - if (!klist) { - klist = vmalloc(size); - if (!klist) - return -ENOMEM; - } + klist = kvmalloc(size, GFP_KERNEL); + if (!klist) + return -ENOMEM; } error = vfs_listxattr(d, klist, size); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 3fece51dcf13..18fc65b84b79 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -892,12 +892,7 @@ static inline u16 cmdif_rev(struct mlx5_core_dev *dev) static inline void *mlx5_vzalloc(unsigned long size) { - void *rtn; - - rtn = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!rtn) - rtn = vzalloc(size); - return rtn; + return kvzalloc(size, GFP_KERNEL); } static inline u32 mlx5_base_mkey(const u32 key) diff --git a/include/linux/mm.h b/include/linux/mm.h index 08e2849d27ca..7cb17c6b97de 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -532,6 +532,14 @@ static inline void *kvzalloc(size_t size, gfp_t flags) return 
kvmalloc(size, flags | __GFP_ZERO); } +static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags) +{ + if (size != 0 && n > SIZE_MAX / size) + return NULL; + + return kvmalloc(n * size, flags); +} + extern void kvfree(const void *addr); static inline atomic_t *compound_mapcount_ptr(struct page *page) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 4952311422c1..ae82d9cea553 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1028,10 +1028,7 @@ EXPORT_SYMBOL(iov_iter_get_pages); static struct page **get_pages_array(size_t n) { - struct page **p = kmalloc(n * sizeof(struct page *), GFP_KERNEL); - if (!p) - p = vmalloc(n * sizeof(struct page *)); - return p; + return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL); } static ssize_t pipe_get_pages_alloc(struct iov_iter *i, diff --git a/mm/frame_vector.c b/mm/frame_vector.c index db77dcb38afd..72ebec18629c 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -200,10 +200,7 @@ struct frame_vector *frame_vector_create(unsigned int nr_frames) * Avoid higher order allocations, use vmalloc instead. It should * be rare anyway. */ - if (size <= PAGE_SIZE) - vec = kmalloc(size, GFP_KERNEL); - else - vec = vmalloc(size); + vec = kvmalloc(size, GFP_KERNEL); if (!vec) return NULL; vec->nr_allocated = nr_frames; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 8bea74298173..e9a59d2d91d4 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -678,11 +678,7 @@ int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) /* no more locks than number of hash buckets */ nblocks = min(nblocks, hashinfo->ehash_mask + 1); - hashinfo->ehash_locks = kmalloc_array(nblocks, locksz, - GFP_KERNEL | __GFP_NOWARN); - if (!hashinfo->ehash_locks) - hashinfo->ehash_locks = vmalloc(nblocks * locksz); - + hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL); if (!hashinfo->ehash_locks) return -ENOMEM; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 9d0d4f39e42b..653bbd67e3a3 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1011,10 +1011,7 @@ static int __net_init tcp_net_metrics_init(struct net *net) tcp_metrics_hash_log = order_base_2(slots); size = sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log; - tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!tcp_metrics_hash) - tcp_metrics_hash = vzalloc(size); - + tcp_metrics_hash = kvzalloc(size, GFP_KERNEL); if (!tcp_metrics_hash) return -ENOMEM; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 088e2b459d0f..257ec66009da 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -2005,10 +2005,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) unsigned index; if (size) { - labels = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); - if (!labels) - labels = vzalloc(size); - + labels = kvzalloc(size, GFP_KERNEL); if (!labels) goto nolabels; } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index f134d384852f..3d0584665b5d 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -763,17 +763,8 @@ EXPORT_SYMBOL(xt_check_entry_offsets); */ unsigned int *xt_alloc_entry_offsets(unsigned int size) { - unsigned int *off; + return kvmalloc_array(size, sizeof(unsigned int), GFP_KERNEL | __GFP_ZERO); - off = kcalloc(size, sizeof(unsigned int), GFP_KERNEL | __GFP_NOWARN); - - if (off) - return off; - - if (size < (SIZE_MAX / sizeof(unsigned int))) - off = vmalloc(size * sizeof(unsigned int)); - - return off; } 
EXPORT_SYMBOL(xt_alloc_entry_offsets); @@ -1116,7 +1107,7 @@ static int xt_jumpstack_alloc(struct xt_table_info *i) size = sizeof(void **) * nr_cpu_ids; if (size > PAGE_SIZE) - i->jumpstack = vzalloc(size); + i->jumpstack = kvzalloc(size, GFP_KERNEL); else i->jumpstack = kzalloc(size, GFP_KERNEL); if (i->jumpstack == NULL) @@ -1138,12 +1129,8 @@ static int xt_jumpstack_alloc(struct xt_table_info *i) */ size = sizeof(void *) * i->stacksize * 2u; for_each_possible_cpu(cpu) { - if (size > PAGE_SIZE) - i->jumpstack[cpu] = vmalloc_node(size, - cpu_to_node(cpu)); - else - i->jumpstack[cpu] = kmalloc_node(size, - GFP_KERNEL, cpu_to_node(cpu)); + i->jumpstack[cpu] = kvmalloc_node(size, GFP_KERNEL, + cpu_to_node(cpu)); if (i->jumpstack[cpu] == NULL) /* * Freeing will be done later on by the callers. The diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 37d581a31cff..3f6c4fa78bdb 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -388,10 +388,7 @@ static int recent_mt_check(const struct xt_mtchk_param *par, } sz = sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size; - if (sz <= PAGE_SIZE) - t = kzalloc(sz, GFP_KERNEL); - else - t = vzalloc(sz); + t = kvzalloc(sz, GFP_KERNEL); if (t == NULL) { ret = -ENOMEM; goto out; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index d00f4c7c2f3a..b30a2c70bd48 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -376,10 +376,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) if (mask != q->tab_mask) { struct sk_buff **ntab; - ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), - GFP_KERNEL | __GFP_NOWARN); - if (!ntab) - ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *)); + ntab = kvmalloc_array((mask + 1), sizeof(struct sk_buff *), GFP_KERNEL | __GFP_ZERO); if (!ntab) return -ENOMEM; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 18bbb5476c83..9201abce928c 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -446,27 +446,13 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) return 0; } -static void *fq_codel_zalloc(size_t sz) -{ - void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); - - if (!ptr) - ptr = vzalloc(sz); - return ptr; -} - -static void fq_codel_free(void *addr) -{ - kvfree(addr); -} - static void fq_codel_destroy(struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); tcf_destroy_chain(&q->filter_list); - fq_codel_free(q->backlogs); - fq_codel_free(q->flows); + kvfree(q->backlogs); + kvfree(q->flows); } static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) @@ -493,13 +479,13 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) } if (!q->flows) { - q->flows = fq_codel_zalloc(q->flows_cnt * - sizeof(struct fq_codel_flow)); + q->flows = kvzalloc(q->flows_cnt * + sizeof(struct fq_codel_flow), GFP_KERNEL); if (!q->flows) return -ENOMEM; - q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32)); + q->backlogs = kvzalloc(q->flows_cnt * sizeof(u32), GFP_KERNEL); if (!q->backlogs) { - fq_codel_free(q->flows); + kvfree(q->flows); return -ENOMEM; } for (i = 0; i < q->flows_cnt; i++) { diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index c19d346e6c5a..51d3ba682af9 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -467,29 +467,14 @@ static void hhf_reset(struct Qdisc *sch) rtnl_kfree_skbs(skb, skb); } -static void *hhf_zalloc(size_t sz) -{ - void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); - - if (!ptr) - ptr = vzalloc(sz); - - return ptr; -} - -static 
void hhf_free(void *addr) -{ - kvfree(addr); -} - static void hhf_destroy(struct Qdisc *sch) { int i; struct hhf_sched_data *q = qdisc_priv(sch); for (i = 0; i < HHF_ARRAYS_CNT; i++) { - hhf_free(q->hhf_arrays[i]); - hhf_free(q->hhf_valid_bits[i]); + kvfree(q->hhf_arrays[i]); + kvfree(q->hhf_valid_bits[i]); } for (i = 0; i < HH_FLOWS_CNT; i++) { @@ -503,7 +488,7 @@ static void hhf_destroy(struct Qdisc *sch) kfree(flow); } } - hhf_free(q->hh_flows); + kvfree(q->hh_flows); } static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { @@ -609,8 +594,8 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt) if (!q->hh_flows) { /* Initialize heavy-hitter flow table. */ - q->hh_flows = hhf_zalloc(HH_FLOWS_CNT * - sizeof(struct list_head)); + q->hh_flows = kvzalloc(HH_FLOWS_CNT * + sizeof(struct list_head), GFP_KERNEL); if (!q->hh_flows) return -ENOMEM; for (i = 0; i < HH_FLOWS_CNT; i++) @@ -624,8 +609,8 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt) /* Initialize heavy-hitter filter arrays. */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { - q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN * - sizeof(u32)); + q->hhf_arrays[i] = kvzalloc(HHF_ARRAYS_LEN * + sizeof(u32), GFP_KERNEL); if (!q->hhf_arrays[i]) { /* Note: hhf_destroy() will be called * by our caller. @@ -637,8 +622,8 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt) /* Initialize valid bits of heavy-hitter filter arrays. */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { - q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN / - BITS_PER_BYTE); + q->hhf_valid_bits[i] = kvzalloc(HHF_ARRAYS_LEN / + BITS_PER_BYTE, GFP_KERNEL); if (!q->hhf_valid_bits[i]) { /* Note: hhf_destroy() will be called * by our caller. diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index f0ce4780f395..1b3dd6190e93 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -702,15 +702,11 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) spinlock_t *root_lock; struct disttable *d; int i; - size_t s; if (n > NETEM_DIST_MAX) return -EINVAL; - s = sizeof(struct disttable) + n * sizeof(s16); - d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN); - if (!d) - d = vmalloc(s); + d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL); if (!d) return -ENOMEM; diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index b00e02c139de..332d94be6e1c 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -685,11 +685,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) static void *sfq_alloc(size_t sz) { - void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN); - - if (!ptr) - ptr = vmalloc(sz); - return ptr; + return kvmalloc(sz, GFP_KERNEL); } static void sfq_free(void *addr) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 82a9e1851108..447a7d5cee0f 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -101,14 +101,9 @@ SYSCALL_DEFINE5(add_key, const char __user *, _type, if (_payload) { ret = -ENOMEM; - payload = kmalloc(plen, GFP_KERNEL | __GFP_NOWARN); - if (!payload) { - if (plen <= PAGE_SIZE) - goto error2; - payload = vmalloc(plen); - if (!payload) - goto error2; - } + payload = kvmalloc(plen, GFP_KERNEL); + if (!payload) + goto error2; ret = -EFAULT; if (copy_from_user(payload, _payload, plen) != 0) @@ -1071,14 +1066,9 @@ long keyctl_instantiate_key_common(key_serial_t id, if (from) { ret = -ENOMEM; - payload = kmalloc(plen, GFP_KERNEL); - if (!payload) { - if (plen <= PAGE_SIZE) - goto error; - payload = vmalloc(plen); - if (!payload) - goto error; - } + payload = 
kvmalloc(plen, GFP_KERNEL); + if (!payload) + goto error; ret = -EFAULT; if (!copy_from_iter_full(payload, plen, from)) -- cgit v1.2.3 From ad61dd303a0f2439bb104349e2d2ec91a3010ce0 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 8 May 2017 15:57:50 -0700 Subject: scripts/spelling.txt: add regsiter -> register spelling mistake This typo is quite common. Fix it and add it to the spelling file so that checkpatch catches it earlier. Link: http://lkml.kernel.org/r/20170317011131.6881-2-sboyd@codeaurora.org Signed-off-by: Stephen Boyd Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/kernel/unwind.c | 2 +- arch/arm/kernel/kgdb.c | 2 +- arch/arm/mach-ixp4xx/common-pci.c | 4 ++-- arch/m68k/ifpsp060/src/ilsp.S | 2 +- arch/m68k/ifpsp060/src/isp.S | 2 +- arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c | 2 +- arch/mips/include/asm/octeon/cvmx-helper-rgmii.h | 2 +- arch/parisc/kernel/entry.S | 2 +- arch/powerpc/mm/icswx.c | 2 +- drivers/acpi/cppc_acpi.c | 2 +- drivers/clk/qcom/common.c | 2 +- drivers/cpufreq/sti-cpufreq.c | 4 ++-- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 2 +- drivers/infiniband/hw/hns/hns_roce_mr.c | 2 +- drivers/net/can/rcar/rcar_canfd.c | 2 +- drivers/net/ethernet/amd/amd8111e.h | 4 ++-- drivers/net/ethernet/atheros/atl1c/atl1c_hw.c | 2 +- drivers/net/ethernet/intel/igb/e1000_phy.c | 2 +- drivers/scsi/isci/registers.h | 4 ++-- drivers/scsi/mpt3sas/mpt3sas_base.h | 2 +- include/linux/bcma/bcma_driver_pci.h | 2 +- include/linux/ftrace.h | 2 +- include/uapi/linux/ipmi.h | 2 +- scripts/spelling.txt | 1 + sound/soc/soc-core.c | 2 +- 25 files changed, 29 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/arch/arc/kernel/unwind.c b/arch/arc/kernel/unwind.c index b6e4f7a7419b..333daab7def0 100644 --- a/arch/arc/kernel/unwind.c +++ b/arch/arc/kernel/unwind.c @@ -845,7 +845,7 @@ static int processCFI(const u8 *start, const u8 *end, unsigned long targetLoc, * state->dataAlign; break; case DW_CFA_def_cfa_register: - unw_debug("cfa_def_cfa_regsiter: "); + unw_debug("cfa_def_cfa_register: "); state->cfa.reg = get_uleb128(&ptr.p8, end); break; /*todo case DW_CFA_def_cfa_expression: */ diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c index 9232caee7060..1bb4c40a3135 100644 --- a/arch/arm/kernel/kgdb.c +++ b/arch/arm/kernel/kgdb.c @@ -269,7 +269,7 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) /* * Register our undef instruction hooks with ARM undef core. - * We regsiter a hook specifically looking for the KGB break inst + * We register a hook specifically looking for the KGB break inst * and we handle the normal undef case within the do_undefinstr * handler. */ diff --git a/arch/arm/mach-ixp4xx/common-pci.c b/arch/arm/mach-ixp4xx/common-pci.c index 4977296f0c78..bcf3df59f71b 100644 --- a/arch/arm/mach-ixp4xx/common-pci.c +++ b/arch/arm/mach-ixp4xx/common-pci.c @@ -43,14 +43,14 @@ int (*ixp4xx_pci_read)(u32 addr, u32 cmd, u32* data); /* - * Base address for PCI regsiter region + * Base address for PCI register region */ unsigned long ixp4xx_pci_reg_base = 0; /* * PCI cfg an I/O routines are done by programming a * command/byte enable register, and then read/writing - * the data from a data regsiter. We need to ensure + * the data from a data register. We need to ensure * these transactions are atomic or we will end up * with corrupt data on the bus or in a driver. 
*/ diff --git a/arch/m68k/ifpsp060/src/ilsp.S b/arch/m68k/ifpsp060/src/ilsp.S index 970abaf3303e..dd5b2c357e95 100644 --- a/arch/m68k/ifpsp060/src/ilsp.S +++ b/arch/m68k/ifpsp060/src/ilsp.S @@ -776,7 +776,7 @@ muls64_zero: # ALGORITHM *********************************************************** # # In the interest of simplicity, all operands are converted to # # longword size whether the operation is byte, word, or long. The # -# bounds are sign extended accordingly. If Rn is a data regsiter, Rn is # +# bounds are sign extended accordingly. If Rn is a data register, Rn is # # also sign extended. If Rn is an address register, it need not be sign # # extended since the full register is always used. # # The condition codes are set correctly before the final "rts". # diff --git a/arch/m68k/ifpsp060/src/isp.S b/arch/m68k/ifpsp060/src/isp.S index b865c1a052ba..29a9f8629b9d 100644 --- a/arch/m68k/ifpsp060/src/isp.S +++ b/arch/m68k/ifpsp060/src/isp.S @@ -1876,7 +1876,7 @@ movp_read_err: # word, or longword sized operands. Then, in the interest of # # simplicity, all operands are converted to longword size whether the # # operation is byte, word, or long. The bounds are sign extended # -# accordingly. If Rn is a data regsiter, Rn is also sign extended. If # +# accordingly. If Rn is a data register, Rn is also sign extended. If # # Rn is an address register, it need not be sign extended since the # # full register is always used. # # The comparisons are made and the condition codes calculated. # diff --git a/arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c b/arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c index ba4753c23b03..d18ed5af62f4 100644 --- a/arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c +++ b/arch/mips/cavium-octeon/executive/cvmx-helper-rgmii.c @@ -152,7 +152,7 @@ static int __cvmx_helper_errata_asx_pass1(int interface, int port, } /** - * Configure all of the ASX, GMX, and PKO regsiters required + * Configure all of the ASX, GMX, and PKO registers required * to get RGMII to function on the supplied interface. * * @interface: PKO Interface to configure (0 or 1) diff --git a/arch/mips/include/asm/octeon/cvmx-helper-rgmii.h b/arch/mips/include/asm/octeon/cvmx-helper-rgmii.h index f89775be7654..f7a95d7de140 100644 --- a/arch/mips/include/asm/octeon/cvmx-helper-rgmii.h +++ b/arch/mips/include/asm/octeon/cvmx-helper-rgmii.h @@ -55,7 +55,7 @@ extern int __cvmx_helper_rgmii_probe(int interface); extern void cvmx_helper_rgmii_internal_loopback(int port); /** - * Configure all of the ASX, GMX, and PKO regsiters required + * Configure all of the ASX, GMX, and PKO registers required * to get RGMII to function on the supplied interface. * * @interface: PKO Interface to configure (0 or 1) diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index ad4cb1613c57..a4fd296c958e 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -1369,7 +1369,7 @@ nadtlb_nullify: /* When there is no translation for the probe address then we - must nullify the insn and return zero in the target regsiter. + must nullify the insn and return zero in the target register. This will indicate to the calling code that it does not have write/read privileges to this address. 
diff --git a/arch/powerpc/mm/icswx.c b/arch/powerpc/mm/icswx.c index 915412e4d5ba..1fa794d7d59f 100644 --- a/arch/powerpc/mm/icswx.c +++ b/arch/powerpc/mm/icswx.c @@ -186,7 +186,7 @@ static u32 acop_get_inst(struct pt_regs *regs) } /** - * @regs: regsiters at time of interrupt + * @regs: registers at time of interrupt * @address: storage address * @error_code: Fault code, usually the DSISR or ESR depending on * processor type diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 6cbe6036da99..e5b47f032d9a 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -95,7 +95,7 @@ static DEFINE_PER_CPU(struct cpc_desc *, cpc_desc_ptr); /* pcc mapped address + header size + offset within PCC subspace */ #define GET_PCC_VADDR(offs) (pcc_data.pcc_comm_addr + 0x8 + (offs)) -/* Check if a CPC regsiter is in PCC */ +/* Check if a CPC register is in PCC */ #define CPC_IN_PCC(cpc) ((cpc)->type == ACPI_TYPE_BUFFER && \ (cpc)->cpc_entry.reg.space_id == \ ACPI_ADR_SPACE_PLATFORM_COMM) diff --git a/drivers/clk/qcom/common.c b/drivers/clk/qcom/common.c index 03f9d316f969..d523991c945f 100644 --- a/drivers/clk/qcom/common.c +++ b/drivers/clk/qcom/common.c @@ -128,7 +128,7 @@ static void qcom_cc_gdsc_unregister(void *data) /* * Backwards compatibility with old DTs. Register a pass-through factor 1/1 - * clock to translate 'path' clk into 'name' clk and regsiter the 'path' + * clock to translate 'path' clk into 'name' clk and register the 'path' * clk as a fixed rate clock if it isn't present. */ static int _qcom_cc_register_board_clk(struct device *dev, const char *path, diff --git a/drivers/cpufreq/sti-cpufreq.c b/drivers/cpufreq/sti-cpufreq.c index a7db9011d5fe..d2d0430d09d4 100644 --- a/drivers/cpufreq/sti-cpufreq.c +++ b/drivers/cpufreq/sti-cpufreq.c @@ -236,7 +236,7 @@ use_defaults: return 0; } -static int sti_cpufreq_fetch_syscon_regsiters(void) +static int sti_cpufreq_fetch_syscon_registers(void) { struct device *dev = ddata.cpu; struct device_node *np = dev->of_node; @@ -275,7 +275,7 @@ static int sti_cpufreq_init(void) goto skip_voltage_scaling; } - ret = sti_cpufreq_fetch_syscon_regsiters(); + ret = sti_cpufreq_fetch_syscon_registers(); if (ret) goto skip_voltage_scaling; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index 157adf381c18..37d5d29597a4 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -1721,7 +1721,7 @@ int hns_roce_v1_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, roce_set_field(mpt_entry->mpt_byte_64, MPT_BYTE_64_L_KEY_IDX_H_M, MPT_BYTE_64_L_KEY_IDX_H_S, mtpt_idx >> MTPT_IDX_SHIFT); - /* DMA memory regsiter */ + /* DMA memory register */ if (mr->type == MR_TYPE_DMA) return 0; diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index dc5c97c8f070..80fc01ffd8bd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -205,7 +205,7 @@ int hns_roce_mtt_init(struct hns_roce_dev *hr_dev, int npages, int page_shift, return 0; } - /* Note: if page_shift is zero, FAST memory regsiter */ + /* Note: if page_shift is zero, FAST memory register */ mtt->page_shift = page_shift; /* Compute MTT entry necessary */ diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c index 4ef07d97156d..602c19e23f05 100644 --- a/drivers/net/can/rcar/rcar_canfd.c +++ b/drivers/net/can/rcar/rcar_canfd.c @@ -413,7 +413,7 @@ /* RSCFDnRPGACCr */ #define 
RCANFD_C_RPGACC(r) (0x1900 + (0x04 * (r)))
-/* CAN FD mode specific regsiter map */
+/* CAN FD mode specific register map */
 /* RSCFDnCFDCmXXX -> RCANFD_F_XXX(m) */
 #define RCANFD_F_DCFG(m) (0x0500 + (0x20 * (m)))
diff --git a/drivers/net/ethernet/amd/amd8111e.h b/drivers/net/ethernet/amd/amd8111e.h
index 7cdb18512407..2a57b46fd6a6 100644
--- a/drivers/net/ethernet/amd/amd8111e.h
+++ b/drivers/net/ethernet/amd/amd8111e.h
@@ -48,7 +48,7 @@ eg., if the value 10011010b is written into the least significant byte of a comm
 /* 32 bit registers */
 #define ASF_STAT 0x00 /* ASF status register */
-#define CHIPID 0x04 /* Chip ID regsiter */
+#define CHIPID 0x04 /* Chip ID register */
 #define MIB_DATA 0x10 /* MIB data register */
 #define MIB_ADDR 0x14 /* MIB address register */
 #define STAT0 0x30 /* Status0 register */
@@ -648,7 +648,7 @@ typedef enum {
 /* driver ioctl parameters */
 #define AMD8111E_REG_DUMP_LEN 13*sizeof(u32)
-/* amd8111e desriptor format */
+/* amd8111e descriptor format */
 struct amd8111e_tx_dr{
diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_hw.c b/drivers/net/ethernet/atheros/atl1c/atl1c_hw.c
index a8b80c56ac25..73efdb05a490 100644
--- a/drivers/net/ethernet/atheros/atl1c/atl1c_hw.c
+++ b/drivers/net/ethernet/atheros/atl1c/atl1c_hw.c
@@ -307,7 +307,7 @@ void atl1c_start_phy_polling(struct atl1c_hw *hw, u16 clk_sel)
 /*
  * atl1c_read_phy_core
- * core function to read register in PHY via MDIO control regsiter.
+ * core function to read register in PHY via MDIO control register.
  * ext: extension register (see IEEE 802.3)
  * dev: device address (see IEEE 802.3 DEVAD, PRTAD is fixed to 0)
  * reg: reg to read
diff --git a/drivers/net/ethernet/intel/igb/e1000_phy.c b/drivers/net/ethernet/intel/igb/e1000_phy.c
index 68812d783f33..413025bdcb50 100644
--- a/drivers/net/ethernet/intel/igb/e1000_phy.c
+++ b/drivers/net/ethernet/intel/igb/e1000_phy.c
@@ -127,7 +127,7 @@ out:
  * @offset: register offset to be read
  * @data: pointer to the read data
  *
- * Reads the MDI control regsiter in the PHY at offset and stores the
+ * Reads the MDI control register in the PHY at offset and stores the
  * information read to data.
 **/
 s32 igb_read_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 *data)
diff --git a/drivers/scsi/isci/registers.h b/drivers/scsi/isci/registers.h
index 97f3ceb8d724..63468cfe3e4a 100644
--- a/drivers/scsi/isci/registers.h
+++ b/drivers/scsi/isci/registers.h
@@ -652,7 +652,7 @@ struct scu_iit_entry {
 /*
- * TODO: Where is the SAS_LNKTOV regsiter?
+ * TODO: Where is the SAS_LNKTOV register?
  * TODO: Where is the SAS_PHYTOV register?
  */
 #define SCU_SAS_TRANSMIT_IDENTIFICATION_SMP_TARGET_SHIFT (1)
@@ -1827,7 +1827,7 @@ struct scu_peg_registers {
 };
 /**
- * struct scu_registers - SCU regsiters including both PEG registers if we turn
+ * struct scu_registers - SCU registers including both PEG registers if we turn
  * on that compile option. All of these registers are in the memory mapped
  * space returned from BAR1.
* diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.h b/drivers/scsi/mpt3sas/mpt3sas_base.h index 8981806fb13f..099ab4ca7edf 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.h +++ b/drivers/scsi/mpt3sas/mpt3sas_base.h @@ -1421,7 +1421,7 @@ void mpt3sas_ctl_add_to_event_log(struct MPT3SAS_ADAPTER *ioc, Mpi2EventNotificationReply_t *mpi_reply); void mpt3sas_enable_diag_buffer(struct MPT3SAS_ADAPTER *ioc, - u8 bits_to_regsiter); + u8 bits_to_register); int mpt3sas_send_diag_release(struct MPT3SAS_ADAPTER *ioc, u8 buffer_type, u8 *issue_reset); diff --git a/include/linux/bcma/bcma_driver_pci.h b/include/linux/bcma/bcma_driver_pci.h index 9657f11d48a7..bca6a5e4ca3d 100644 --- a/include/linux/bcma/bcma_driver_pci.h +++ b/include/linux/bcma/bcma_driver_pci.h @@ -80,7 +80,7 @@ struct pci_dev; #define BCMA_CORE_PCI_MDIODATA_DEV_TX 0x1e /* SERDES TX Dev */ #define BCMA_CORE_PCI_MDIODATA_DEV_RX 0x1f /* SERDES RX Dev */ #define BCMA_CORE_PCI_PCIEIND_ADDR 0x0130 /* indirect access to the internal register */ -#define BCMA_CORE_PCI_PCIEIND_DATA 0x0134 /* Data to/from the internal regsiter */ +#define BCMA_CORE_PCI_PCIEIND_DATA 0x0134 /* Data to/from the internal register */ #define BCMA_CORE_PCI_CLKREQENCTRL 0x0138 /* >= rev 6, Clkreq rdma control */ #define BCMA_CORE_PCI_PCICFG0 0x0400 /* PCI config space 0 (rev >= 8) */ #define BCMA_CORE_PCI_PCICFG1 0x0500 /* PCI config space 1 (rev >= 8) */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 6d2a63e4ea52..473f088aabea 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -72,7 +72,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); * CONTROL, SAVE_REGS, SAVE_REGS_IF_SUPPORTED, RECURSION_SAFE, STUB and * IPMODIFY are a kind of attribute flags which can be set only before * registering the ftrace_ops, and can not be modified while registered. - * Changing those attribute flags after regsitering ftrace_ops will + * Changing those attribute flags after registering ftrace_ops will * cause unexpected results. * * ENABLED - set/unset when ftrace_ops is registered/unregistered diff --git a/include/uapi/linux/ipmi.h b/include/uapi/linux/ipmi.h index 7b26a62e5707..b9095a27a08a 100644 --- a/include/uapi/linux/ipmi.h +++ b/include/uapi/linux/ipmi.h @@ -355,7 +355,7 @@ struct ipmi_cmdspec { #define IPMICTL_REGISTER_FOR_CMD _IOR(IPMI_IOC_MAGIC, 14, \ struct ipmi_cmdspec) /* - * Unregister a regsitered command. error values: + * Unregister a registered command. error values: * - EFAULT - an address supplied was invalid. * - ENOENT - The netfn/cmd was not found registered for this user. */ diff --git a/scripts/spelling.txt b/scripts/spelling.txt index d778d7b42a78..7b6d25a86202 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -891,6 +891,7 @@ registerd||registered registeresd||registered registes||registers registraration||registration +regsiter||register regster||register regualar||regular reguator||regulator diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c index 525f2f397b4c..aae099c0e502 100644 --- a/sound/soc/soc-core.c +++ b/sound/soc/soc-core.c @@ -936,7 +936,7 @@ static struct snd_soc_component *soc_find_component( * * @dlc: name of the DAI and optional component info to match * - * This function will search all regsitered components and their DAIs to + * This function will search all registered components and their DAIs to * find the DAI of the same name. The component's of_node and name * should also match if being specified. 
* -- cgit v1.2.3 From 8ac1ed791401790968fd00ca63ca4fa814677199 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 8 May 2017 15:57:56 -0700 Subject: treewide: spelling: correct diffrent[iate] and banlance typos Add these misspellings to scripts/spelling.txt too Link: http://lkml.kernel.org/r/962aace119675e5fe87be2a88ddac1a5486f8e60.1490931810.git.joe@perches.com Signed-off-by: Joe Perches Acked-by: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/media/dvb-frontends/drx39xyj/drx_dap_fasi.h | 2 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c | 2 +- drivers/net/ethernet/qlogic/qed/qed_int.c | 2 +- drivers/net/ethernet/qlogic/qed/qed_main.c | 2 +- drivers/net/ethernet/qlogic/qed/qed_sriov.c | 2 +- include/linux/mlx4/device.h | 2 +- scripts/spelling.txt | 3 +++ 7 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/media/dvb-frontends/drx39xyj/drx_dap_fasi.h b/drivers/media/dvb-frontends/drx39xyj/drx_dap_fasi.h index 354ec07eae87..23ae72468025 100644 --- a/drivers/media/dvb-frontends/drx39xyj/drx_dap_fasi.h +++ b/drivers/media/dvb-frontends/drx39xyj/drx_dap_fasi.h @@ -70,7 +70,7 @@ * (3) both long and short but short preferred and long only when necesarry * * These modes must be selected compile time via compile switches. -* Compile switch settings for the diffrent modes: +* Compile switch settings for the different modes: * (1) DRXDAPFASI_LONG_ADDR_ALLOWED=0, DRXDAPFASI_SHORT_ADDR_ALLOWED=1 * (2) DRXDAPFASI_LONG_ADDR_ALLOWED=1, DRXDAPFASI_SHORT_ADDR_ALLOWED=0 * (3) DRXDAPFASI_LONG_ADDR_ALLOWED=1, DRXDAPFASI_SHORT_ADDR_ALLOWED=1 diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c index cea6bdcde33f..8baf9d3eb4b1 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c @@ -1591,7 +1591,7 @@ static int __bnx2x_vlan_mac_execute_step(struct bnx2x *bp, if (rc != 0) { __bnx2x_vlan_mac_h_pend(bp, o, *ramrod_flags); - /* Calling function should not diffrentiate between this case + /* Calling function should not differentiate between this case * and the case in which there is already a pending ramrod */ rc = 1; diff --git a/drivers/net/ethernet/qlogic/qed/qed_int.c b/drivers/net/ethernet/qlogic/qed/qed_int.c index 0ed24d6e6c65..40f057edeafc 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_int.c +++ b/drivers/net/ethernet/qlogic/qed/qed_int.c @@ -3058,7 +3058,7 @@ int qed_int_igu_read_cam(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) /* There's a possibility the igu_sb_cnt_iov doesn't properly reflect * the number of VF SBs [especially for first VF on engine, as we can't - * diffrentiate between empty entries and its entries]. + * differentiate between empty entries and its entries]. * Since we don't really support more SBs than VFs today, prevent any * such configuration by sanitizing the number of SBs to equal the * number of VFs. 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index b7ad36b91e12..c67ff1411799 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -978,7 +978,7 @@ static int qed_slowpath_start(struct qed_dev *cdev, if (rc) goto err2; - /* First Dword used to diffrentiate between various sources */ + /* First Dword used to differentiate between various sources */ data = cdev->firmware->data + sizeof(u32); qed_dbg_pf_init(cdev); diff --git a/drivers/net/ethernet/qlogic/qed/qed_sriov.c b/drivers/net/ethernet/qlogic/qed/qed_sriov.c index d5df29f787c5..f5ed54d611ec 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sriov.c +++ b/drivers/net/ethernet/qlogic/qed/qed_sriov.c @@ -625,7 +625,7 @@ int qed_iov_hw_info(struct qed_hwfn *p_hwfn) * - If !ARI, VFs would start on next device. * so offset - (256 - pf_id) would provide the number. * Utilize the fact that (256 - pf_id) is achieved only by later - * to diffrentiate between the two. + * to differentiate between the two. */ if (p_hwfn->cdev->p_iov_info->offset < (256 - p_hwfn->abs_pf_id)) { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 74b765ce48ab..d5bed0875d30 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -108,7 +108,7 @@ enum { MLX4_MFUNC_EQE_MASK = (MLX4_MFUNC_MAX_EQES - 1) }; -/* Driver supports 3 diffrent device methods to manage traffic steering: +/* Driver supports 3 different device methods to manage traffic steering: * -device managed - High level API for ib and eth flow steering. FW is * managing flow steering tables. * - B0 steering mode - Common low level API for ib and (if supported) eth. diff --git a/scripts/spelling.txt b/scripts/spelling.txt index aeca2c25de32..eb38f49d4b75 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -179,6 +179,7 @@ bakup||backup baloon||balloon baloons||balloons bandwith||bandwidth +banlance||balance batery||battery beacuse||because becasue||because @@ -375,6 +376,8 @@ dictionnary||dictionary didnt||didn't diferent||different differrence||difference +diffrent||different +diffrentiate||differentiate difinition||definition diplay||display direectly||directly -- cgit v1.2.3 From 2d0bde57f3527ffac9279b4c8ba61060ba395b1a Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Mon, 8 May 2017 15:58:26 -0700 Subject: include/linux/filter.h: use set_memory.h header set_memory_* functions have moved to set_memory.h. Switch to this explicitly. Link: http://lkml.kernel.org/r/1488920133-27229-11-git-send-email-labbott@redhat.com Signed-off-by: Laura Abbott Acked-by: Daniel Borkmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/filter.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 9a7786db14fa..56197f82af45 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -19,7 +19,9 @@ #include -#include +#ifdef CONFIG_ARCH_HAS_SET_MEMORY +#include +#endif #include #include -- cgit v1.2.3 From ec48c940da6cb96c4be6638d0f2efade24d5242a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 8 May 2017 15:58:50 -0700 Subject: kref: remove WARN_ON for NULL release functions The kref functions check for NULL release functions. This WARN_ON seems rather pointless. We will eventually release and then just crash nicely. It is also somewhat expensive because these functions are inlined in a lot of places. 
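For context, a minimal sketch of the calling pattern this relies on (struct foo and foo_release are invented names for illustration, not part of this patch): every kref_put() caller supplies a real release function, so a NULL pointer would now oops on the final put instead of warning on every put.

#include <linux/kref.h>
#include <linux/slab.h>

struct foo {
	struct kref ref;
};

/* Runs exactly once, when the last reference is dropped. */
static void foo_release(struct kref *kref)
{
	struct foo *f = container_of(kref, struct foo, ref);

	kfree(f);
}

static void foo_put(struct foo *f)
{
	/* With the WARN_ON gone, a NULL release would crash here on the final put. */
	kref_put(&f->ref, foo_release);
}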
Removing the WARN_ONs saves around 2.3k in this kernel (likely more in others with more drivers)

   text    data     bss     dec     hex filename
9083992 5367600 11116544 25568136 1862388 vmlinux-before-load-avg
9070166 5367600 11116544 25554310 185ed86 vmlinux-load-avg

Link: http://lkml.kernel.org/r/20170315021431.13107-5-andi@firstfloor.org
Signed-off-by: Andi Kleen
Acked-by: Greg Kroah-Hartman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/kref.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kref.h b/include/linux/kref.h
index f4156f88f557..29220724bf1c 100644
--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -66,8 +66,6 @@ static inline void kref_get(struct kref *kref)
  */
 static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref))
 {
-	WARN_ON(release == NULL);
-
 	if (refcount_dec_and_test(&kref->refcount)) {
 		release(kref);
 		return 1;
@@ -79,8 +77,6 @@ static inline int kref_put_mutex(struct kref *kref,
 				 void (*release)(struct kref *kref),
 				 struct mutex *lock)
 {
-	WARN_ON(release == NULL);
-
 	if (refcount_dec_and_mutex_lock(&kref->refcount, lock)) {
 		release(kref);
 		return 1;
@@ -92,8 +88,6 @@ static inline int kref_put_lock(struct kref *kref,
 				void (*release)(struct kref *kref),
 				spinlock_t *lock)
 {
-	WARN_ON(release == NULL);
-
 	if (refcount_dec_and_lock(&kref->refcount, lock)) {
 		release(kref);
 		return 1;
-- cgit v1.2.3

From f44a2920c84af809883ecbbd08d47fb5fe47c8ad Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Mon, 8 May 2017 15:58:56 -0700
Subject: include/linux/uaccess.h: remove expensive WARN_ON in pagefault_disabled_dec

pagefault_disabled_dec is frequently used inline, and it has a WARN_ON for underflow that expands to about 6.5k of extra code. The warning doesn't seem to be useful enough to be worth so much code, so remove it. If it were needed, it could be made dependent on some debug kernel option.

Saves ~6.5k in my kernel

   text    data     bss     dec     hex filename
9039417 5367568 11116544 25523529 1857549 vmlinux-before-pf
9032805 5367568 11116544 25516917 1855b75 vmlinux-pf

Link: http://lkml.kernel.org/r/20170315021431.13107-8-andi@firstfloor.org
Signed-off-by: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/uaccess.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index e0cbfb09e60f..201418d5e15c 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -203,7 +203,6 @@ static __always_inline void pagefault_disabled_inc(void)
 static __always_inline void pagefault_disabled_dec(void)
 {
 	current->pagefault_disabled--;
-	WARN_ON(current->pagefault_disabled < 0);
 }

 /*
-- cgit v1.2.3

From c718a97514e4d77c97a35734b728aaf541a0621b Mon Sep 17 00:00:00 2001
From: Tetsuo Handa
Date: Mon, 8 May 2017 15:58:59 -0700
Subject: fs: remove set but not checked AOP_FLAG_UNINTERRUPTIBLE flag

Commit afddba49d18f ("fs: introduce write_begin, write_end, and perform_write aops") introduced the AOP_FLAG_UNINTERRUPTIBLE flag, which was checked in pagecache_write_begin(), but that check was removed by 4e02ed4b4a2f ("fs: remove prepare_write/commit_write"). Between these two commits, commit d9414774dc0c ("cifs: Convert cifs to new aops.") added a check in cifs_write_begin(), but that check was soon removed by commit a98ee8c1c707 ("[CIFS] fix regression in cifs_write_begin/cifs_write_end"). Therefore, the AOP_FLAG_UNINTERRUPTIBLE flag is checked nowhere. Let's remove this flag. This patch has no functionality changes.
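As a hedged sketch of what a caller looks like after this change (zero_one_page() is an invented helper that assumes the range stays within a single page): the flags argument now carries only AOP_FLAG_CONT_EXPAND or AOP_FLAG_NOFS, and plain callers simply pass 0.

#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>

/* Invented example: zero a short range that fits in one page. */
static int zero_one_page(struct file *file, struct address_space *mapping,
			 loff_t pos, unsigned int len)
{
	struct page *page;
	void *fsdata;
	int err;

	/* Callers used to pass AOP_FLAG_UNINTERRUPTIBLE here; now 0. */
	err = pagecache_write_begin(file, mapping, pos, len, 0,
				    &page, &fsdata);
	if (err)
		return err;

	zero_user(page, pos & (PAGE_SIZE - 1), len);

	err = pagecache_write_end(file, mapping, pos, len, len,
				  page, fsdata);
	return err < 0 ? err : 0;
}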
Link: http://lkml.kernel.org/r/1489294781-53494-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Reviewed-by: Jeff Layton Reviewed-by: Christoph Hellwig Cc: Nick Piggin Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/vfs.txt | 3 +-- fs/buffer.c | 13 +++++-------- fs/exofs/dir.c | 3 +-- fs/hfs/extent.c | 4 ++-- fs/hfsplus/extents.c | 5 ++--- fs/iomap.c | 13 +++---------- fs/namei.c | 2 +- include/linux/fs.h | 5 ++--- mm/filemap.c | 6 ------ 9 files changed, 17 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 94dd27ef4a76..f42b90687d40 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -694,8 +694,7 @@ struct address_space_operations { write_end: After a successful write_begin, and data copy, write_end must be called. len is the original len passed to write_begin, and copied - is the amount that was able to be copied (copied == len is always true - if write_begin was called with the AOP_FLAG_UNINTERRUPTIBLE flag). + is the amount that was able to be copied. The filesystem must take care of unlocking the page and releasing it refcount, and updating i_size. diff --git a/fs/buffer.c b/fs/buffer.c index 9196f2a270da..c3c7455efa3f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2379,8 +2379,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) goto out; err = pagecache_write_begin(NULL, mapping, size, 0, - AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND, - &page, &fsdata); + AOP_FLAG_CONT_EXPAND, &page, &fsdata); if (err) goto out; @@ -2415,9 +2414,8 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, } len = PAGE_SIZE - zerofrom; - err = pagecache_write_begin(file, mapping, curpos, len, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); + err = pagecache_write_begin(file, mapping, curpos, len, 0, + &page, &fsdata); if (err) goto out; zero_user(page, zerofrom, len); @@ -2449,9 +2447,8 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, } len = offset - zerofrom; - err = pagecache_write_begin(file, mapping, curpos, len, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); + err = pagecache_write_begin(file, mapping, curpos, len, 0, + &page, &fsdata); if (err) goto out; zero_user(page, zerofrom, len); diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 42f9a0a0c4ca..8eeb694332fe 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -405,8 +405,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de, int err; lock_page(page); - err = exofs_write_begin(NULL, page->mapping, pos, len, - AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); + err = exofs_write_begin(NULL, page->mapping, pos, len, 0, &page, NULL); if (err) EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n", err); diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index e33a0d36a93e..5d0182654580 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -485,8 +485,8 @@ void hfs_file_truncate(struct inode *inode) /* XXX: Can use generic_cont_expand? 
*/ size = inode->i_size - 1; - res = pagecache_write_begin(NULL, mapping, size+1, 0, - AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); + res = pagecache_write_begin(NULL, mapping, size+1, 0, 0, + &page, &fsdata); if (!res) { res = pagecache_write_end(NULL, mapping, size+1, 0, 0, page, fsdata); diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index feca524ce2a5..a3eb640b4f8f 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -545,9 +545,8 @@ void hfsplus_file_truncate(struct inode *inode) void *fsdata; loff_t size = inode->i_size; - res = pagecache_write_begin(NULL, mapping, size, 0, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); + res = pagecache_write_begin(NULL, mapping, size, 0, 0, + &page, &fsdata); if (res) return; res = pagecache_write_end(NULL, mapping, size, diff --git a/fs/iomap.c b/fs/iomap.c index 1faabe09b8fd..4b10892967a5 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -158,12 +158,6 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, ssize_t written = 0; unsigned int flags = AOP_FLAG_NOFS; - /* - * Copies from kernel address space cannot fail (NFSD is a big user). - */ - if (!iter_is_iovec(i)) - flags |= AOP_FLAG_UNINTERRUPTIBLE; - do { struct page *page; unsigned long offset; /* Offset into pagecache page */ @@ -291,8 +285,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, return PTR_ERR(rpage); status = iomap_write_begin(inode, pos, bytes, - AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE, - &page, iomap); + AOP_FLAG_NOFS, &page, iomap); put_page(rpage); if (unlikely(status)) return status; @@ -343,8 +336,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, struct page *page; int status; - status = iomap_write_begin(inode, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap); + status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page, + iomap); if (status) return status; diff --git a/fs/namei.c b/fs/namei.c index 9a7f8bd748d8..7286f87ce863 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4766,7 +4766,7 @@ int __page_symlink(struct inode *inode, const char *symname, int len, int nofs) struct page *page; void *fsdata; int err; - unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE; + unsigned int flags = 0; if (nofs) flags |= AOP_FLAG_NOFS; diff --git a/include/linux/fs.h b/include/linux/fs.h index 5d62d2c47939..249dad4e8d26 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -250,9 +250,8 @@ enum positive_aop_returns { AOP_TRUNCATED_PAGE = 0x80001, }; -#define AOP_FLAG_UNINTERRUPTIBLE 0x0001 /* will not do a short write */ -#define AOP_FLAG_CONT_EXPAND 0x0002 /* called from cont_expand */ -#define AOP_FLAG_NOFS 0x0004 /* used by filesystem to direct +#define AOP_FLAG_CONT_EXPAND 0x0001 /* called from cont_expand */ +#define AOP_FLAG_NOFS 0x0002 /* used by filesystem to direct * helper code (eg buffer layer) * to clear GFP_FS from alloc */ diff --git a/mm/filemap.c b/mm/filemap.c index 681da61080bc..b7b973b47d8d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2791,12 +2791,6 @@ ssize_t generic_perform_write(struct file *file, ssize_t written = 0; unsigned int flags = 0; - /* - * Copies from kernel address space cannot fail (NFSD is a big user). 
- */
-	if (!iter_is_iovec(i))
-		flags |= AOP_FLAG_UNINTERRUPTIBLE;
-
 	do {
 		struct page *page;
 		unsigned long offset;	/* Offset into pagecache page */
-- cgit v1.2.3

From bfe1c566453a0979c0b3cd3728d0de962272f034 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani
Date: Mon, 8 May 2017 15:59:37 -0700
Subject: time: delete CURRENT_TIME_SEC and CURRENT_TIME

All uses of the CURRENT_TIME_SEC and CURRENT_TIME macros have been replaced by other time functions. These macros are also not y2038 safe. All their use cases can be fulfilled by y2038-safe ktime_get_* variants.

Link: http://lkml.kernel.org/r/1491613030-11599-12-git-send-email-deepa.kernel@gmail.com
Signed-off-by: Deepa Dinamani
Reviewed-by: Arnd Bergmann
Acked-by: John Stultz
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/time.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index 23f0f5ce3090..c0543f5f25de 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -151,9 +151,6 @@ static inline bool timespec_inject_offset_valid(const struct timespec *ts)
 	return true;
 }

-#define CURRENT_TIME		(current_kernel_time())
-#define CURRENT_TIME_SEC	((struct timespec) { get_seconds(), 0 })
-
 /* Some architectures do not supply their own clocksource.
  * This is mainly the case in architectures that get their
  * inter-tick times by reading the counter on their interval
-- cgit v1.2.3

From 499118e966f1d2150bd66647c8932343c4e9a0b8 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka
Date: Mon, 8 May 2017 15:59:50 -0700
Subject: mm: introduce memalloc_noreclaim_{save,restore}

The previous patch ("mm: prevent potential recursive reclaim due to clearing PF_MEMALLOC") has shown that simply setting and clearing PF_MEMALLOC in current->flags can result in wrongly clearing a pre-existing PF_MEMALLOC flag and potentially lead to recursive reclaim. Let's introduce helpers that support proper nesting by saving the previous state of the flag, similar to the existing memalloc_noio_* and memalloc_nofs_* helpers. Convert existing setting/clearing of PF_MEMALLOC within mm to the new helpers. There are no known issues with the converted code, but the change makes it more robust.
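A minimal usage sketch (editorial illustration only; some_reclaim_path() is an invented caller, not code from this patch) of why the save/restore pair nests safely where a bare set-and-clear of PF_MEMALLOC would not:

#include <linux/sched/mm.h>

static void some_reclaim_path(void)
{
	unsigned int noreclaim_flag;

	noreclaim_flag = memalloc_noreclaim_save();
	/*
	 * PF_MEMALLOC is set here. If an outer caller had already set
	 * it, the restore below leaves it set; clearing the bit
	 * unconditionally would wrongly drop the outer caller's flag.
	 */
	/* ... allocation work that must not recurse into reclaim ... */
	memalloc_noreclaim_restore(noreclaim_flag);
}

Link: http://lkml.kernel.org/r/20170405074700.29871-3-vbabka@suse.cz
Signed-off-by: Vlastimil Babka
Suggested-by: Michal Hocko
Acked-by: Michal Hocko
Acked-by: Hillf Danton
Cc: Mel Gorman
Cc: Johannes Weiner
Cc: Andrey Ryabinin
Cc: Boris Brezillon
Cc: Chris Leech
Cc: "David S.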
Miller" Cc: Eric Dumazet Cc: Josef Bacik Cc: Lee Duncan Cc: Michal Hocko Cc: Richard Weinberger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 12 ++++++++++++ mm/page_alloc.c | 11 ++++++----- mm/vmscan.c | 17 +++++++++++------ 3 files changed, 29 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 9daabe138c99..2b24a6974847 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -191,4 +191,16 @@ static inline void memalloc_nofs_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; } +static inline unsigned int memalloc_noreclaim_save(void) +{ + unsigned int flags = current->flags & PF_MEMALLOC; + current->flags |= PF_MEMALLOC; + return flags; +} + +static inline void memalloc_noreclaim_restore(unsigned int flags) +{ + current->flags = (current->flags & ~PF_MEMALLOC) | flags; +} + #endif /* _LINUX_SCHED_MM_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1daf509722c7..f9e450c6b6e4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3283,15 +3283,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, enum compact_priority prio, enum compact_result *compact_result) { struct page *page; - unsigned int noreclaim_flag = current->flags & PF_MEMALLOC; + unsigned int noreclaim_flag; if (!order) return NULL; - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, prio); - current->flags = (current->flags & ~PF_MEMALLOC) | noreclaim_flag; + memalloc_noreclaim_restore(noreclaim_flag); if (*compact_result <= COMPACT_INACTIVE) return NULL; @@ -3438,12 +3438,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, { struct reclaim_state reclaim_state; int progress; + unsigned int noreclaim_flag; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; @@ -3453,7 +3454,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); - current->flags &= ~PF_MEMALLOC; + memalloc_noreclaim_restore(noreclaim_flag); cond_resched(); diff --git a/mm/vmscan.c b/mm/vmscan.c index 4e7ed65842af..2f45c0520f43 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3036,6 +3036,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, struct zonelist *zonelist; unsigned long nr_reclaimed; int nid; + unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | @@ -3062,9 +3063,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.gfp_mask, sc.reclaim_idx); - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); - current->flags &= ~PF_MEMALLOC; + memalloc_noreclaim_restore(noreclaim_flag); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -3589,8 +3590,9 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; unsigned long nr_reclaimed; + unsigned int noreclaim_flag; - p->flags |= PF_MEMALLOC; + noreclaim_flag 
= memalloc_noreclaim_save(); lockdep_set_current_reclaim_state(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3599,7 +3601,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) p->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); - p->flags &= ~PF_MEMALLOC; + memalloc_noreclaim_restore(noreclaim_flag); return nr_reclaimed; } @@ -3764,6 +3766,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in struct task_struct *p = current; struct reclaim_state reclaim_state; int classzone_idx = gfp_zone(gfp_mask); + unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), @@ -3781,7 +3784,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in * and we also need to be able to write out pages for RECLAIM_WRITE * and RECLAIM_UNMAP. */ - p->flags |= PF_MEMALLOC | PF_SWAPWRITE; + noreclaim_flag = memalloc_noreclaim_save(); + p->flags |= PF_SWAPWRITE; lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3797,7 +3801,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in } p->reclaim_state = NULL; - current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); + current->flags &= ~PF_SWAPWRITE; + memalloc_noreclaim_restore(noreclaim_flag); lockdep_clear_current_reclaim_state(); return sc.nr_reclaimed >= nr_pages; } -- cgit v1.2.3 From 497d72d80a789501501cccabdad6b145f9e31371 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 8 May 2017 20:38:40 +0200 Subject: KVM: Add kvm_vcpu_get_idx to get vcpu index in kvm->vcpus There are occasional needs to use the index of vcpu in the kvm->vcpus array to map something related to a VCPU. For example, unlike the vcpu->vcpu_id, the vcpu index is guaranteed to not be sparse across all vcpus which is useful when allocating a memory area for each vcpu. Signed-off-by: Christoffer Dall Reviewed-by: Eric Auger --- include/linux/kvm_host.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c14ad9809da..12eb26d665e8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -490,6 +490,17 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) return NULL; } +static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu *tmp; + int idx; + + kvm_for_each_vcpu(idx, tmp, vcpu->kvm) + if (tmp == vcpu) + return idx; + BUG(); +} + #define kvm_for_each_memslot(memslot, slots) \ for (memslot = &slots->memslots[0]; \ memslot < slots->memslots + KVM_MEM_SLOTS_NUM && memslot->npages;\ -- cgit v1.2.3 From ea47dd191d543f81e0912b5dc0471b48346b016e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 19 Apr 2017 05:12:18 +1000 Subject: of/fdt: introduce of_scan_flat_dt_subnodes and of_get_flat_dt_phandle Introduce primitives for FDT parsing. These will be used for powerpc cpufeatures node scanning, which has quite complex structure but should be processed early. 
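A sketch of the intended use (count_subnodes() and count_children() are invented names for illustration; the real consumer is the powerpc cpufeatures scan mentioned above):

#include <linux/of_fdt.h>

/* Invented callback: called once per subnode; non-zero stops the scan. */
static int __init count_subnodes(unsigned long node, const char *uname,
				 void *data)
{
	int *count = data;

	(*count)++;
	return 0;
}

static int __init count_children(unsigned long parent)
{
	int count = 0;

	of_scan_flat_dt_subnodes(parent, count_subnodes, &count);
	return count;
}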
Cc: devicetree@vger.kernel.org
Acked-by: Rob Herring
Signed-off-by: Nicholas Piggin
Signed-off-by: Michael Ellerman
---
 drivers/of/fdt.c       | 38 ++++++++++++++++++++++++++++++++++++++
 include/linux/of_fdt.h |  6 ++++++
 2 files changed, 44 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index e5ce4b59e162..961ca97072a9 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -753,6 +753,36 @@ int __init of_scan_flat_dt(int (*it)(unsigned long node,
 	return rc;
 }

+/**
+ * of_scan_flat_dt_subnodes - scan sub-nodes of a node and call a callback on each.
+ * @it: callback function
+ * @data: context data pointer
+ *
+ * This function is used to scan sub-nodes of a node.
+ */
+int __init of_scan_flat_dt_subnodes(unsigned long parent,
+				    int (*it)(unsigned long node,
+					      const char *uname,
+					      void *data),
+				    void *data)
+{
+	const void *blob = initial_boot_params;
+	int node;
+
+	fdt_for_each_subnode(node, blob, parent) {
+		const char *pathp;
+		int rc;
+
+		pathp = fdt_get_name(blob, node, NULL);
+		if (*pathp == '/')
+			pathp = kbasename(pathp);
+		rc = it(node, pathp, data);
+		if (rc)
+			return rc;
+	}
+	return 0;
+}
+
 /**
  * of_get_flat_dt_subnode_by_name - get the subnode by given name
  *
@@ -812,6 +842,14 @@ int __init of_flat_dt_match(unsigned long node, const char *const *compat)
 	return of_fdt_match(initial_boot_params, node, compat);
 }

+/**
+ * of_get_flat_dt_phandle - Given a node in the flat blob, return the phandle
+ */
+uint32_t __init of_get_flat_dt_phandle(unsigned long node)
+{
+	return fdt_get_phandle(initial_boot_params, node);
+}
+
 struct fdt_scan_status {
 	const char *name;
 	int namelen;
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index 271b3fdf0070..1dfbfd0d8040 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -54,6 +54,11 @@ extern char __dtb_end[];
 extern int of_scan_flat_dt(int (*it)(unsigned long node,
 				     const char *uname, int depth,
 				     void *data),
 			   void *data);
+extern int of_scan_flat_dt_subnodes(unsigned long node,
+				    int (*it)(unsigned long node,
+					      const char *uname,
+					      void *data),
+				    void *data);
 extern int of_get_flat_dt_subnode_by_name(unsigned long node,
 					  const char *uname);
 extern const void *of_get_flat_dt_prop(unsigned long node, const char *name,
@@ -62,6 +67,7 @@ extern int of_flat_dt_is_compatible(unsigned long node, const char *name);
 extern int of_flat_dt_match(unsigned long node, const char *const *matches);
 extern unsigned long of_get_flat_dt_root(void);
 extern int of_get_flat_dt_size(void);
+extern uint32_t of_get_flat_dt_phandle(unsigned long node);
 extern int early_init_dt_scan_chosen(unsigned long node, const char *uname,
 				     int depth, void *data);
-- cgit v1.2.3

From 9ea762a5ae45235eb3e5ec9c05c33cf37db78d70 Mon Sep 17 00:00:00 2001
From: Cornelia Huck
Date: Thu, 30 Mar 2017 13:13:33 +0200
Subject: virtio: virtio_driver doc

Add comments for the virtio_driver members that were not documented.

Signed-off-by: Cornelia Huck
Signed-off-by: Michael S. Tsirkin
---
 include/linux/virtio.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index ed04753278d4..28b0e965360f 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -165,9 +165,13 @@ int virtio_device_restore(struct virtio_device *dev);
  * @feature_table_legacy: same as feature_table but when working in legacy mode.
  * @feature_table_size_legacy: number of entries in feature table legacy array.
  * @probe: the function to call when a device is found. Returns 0 or -errno.
+ * @scan: optional function to call after successful probe; intended
+ *    for virtio-scsi to invoke a scan.
  * @remove: the function to call when a device is removed.
  * @config_changed: optional function to call when the device configuration
  *    changes; may be called in interrupt context.
+ * @freeze: optional function to call during suspend/hibernation.
+ * @restore: optional function to call on resume.
  */
 struct virtio_driver {
 	struct device_driver driver;
-- cgit v1.2.3

From fb9de9704775d6190c204f4ddf8da4cfdac26be1 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin"
Date: Fri, 7 Apr 2017 08:25:09 +0300
Subject: ptr_ring: batch ring zeroing

A known weakness in ptr_ring design is that it does not handle well the situation when the ring is almost full: as entries are consumed they are immediately used again by the producer, so consumer and producer are writing to a shared cache line.

To fix this, add batching to consume calls: as entries are consumed do not write NULL into the ring until we get a multiple (in the current implementation 2x) of cache lines away from the producer. At that point, write them all out. We do the write out in the reverse order to keep the producer from sharing a cache line with the consumer for as long as possible. Writeout also triggers when the ring wraps around - there's no special reason to do this but it helps keep the code a bit simpler.

What should we do if getting away from the producer by 2 cache lines would mean we are keeping the ring more than half empty? Maybe we should reduce the batching in this case; the current patch simply reduces the batching.

Notes:
- it is no longer true that a call to consume guarantees that the following call to produce will succeed. No users seem to assume that.
- batching can also in theory reduce the signalling rate: users that would previously send interrupts to the producer to wake it up after consuming each entry would now only need to do this once in a batch. Doing this would be easy by returning a flag to the caller. No users seem to do signalling on consume yet, so this was not implemented.
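For illustration only (drain_and_free() and free_fn are invented names, not part of this patch): the consumer-facing API is unchanged, since the batched invalidation happens inside __ptr_ring_discard_one(), so a plain consume loop now dirties the producer's cache line only once per batch.

#include <linux/ptr_ring.h>

/* Editorial sketch: drain a ring, freeing every entry. */
static void drain_and_free(struct ptr_ring *r, void (*free_fn)(void *))
{
	void *ptr;

	while ((ptr = ptr_ring_consume(r)) != NULL)
		free_fn(ptr);	/* entries are NULLed out in batches */
}

Signed-off-by: Michael S.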
Tsirkin Reviewed-by: Jesper Dangaard Brouer Acked-by: Jason Wang --- include/linux/ptr_ring.h | 63 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 6c70444da3b9..6b2e0dd88569 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -34,11 +34,13 @@ struct ptr_ring { int producer ____cacheline_aligned_in_smp; spinlock_t producer_lock; - int consumer ____cacheline_aligned_in_smp; + int consumer_head ____cacheline_aligned_in_smp; /* next valid entry */ + int consumer_tail; /* next entry to invalidate */ spinlock_t consumer_lock; /* Shared consumer/producer data */ /* Read-only by both the producer and the consumer */ int size ____cacheline_aligned_in_smp; /* max entries in queue */ + int batch; /* number of entries to consume in a batch */ void **queue; }; @@ -170,7 +172,7 @@ static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr) static inline void *__ptr_ring_peek(struct ptr_ring *r) { if (likely(r->size)) - return r->queue[r->consumer]; + return r->queue[r->consumer_head]; return NULL; } @@ -231,9 +233,38 @@ static inline bool ptr_ring_empty_bh(struct ptr_ring *r) /* Must only be called after __ptr_ring_peek returned !NULL */ static inline void __ptr_ring_discard_one(struct ptr_ring *r) { - r->queue[r->consumer++] = NULL; - if (unlikely(r->consumer >= r->size)) - r->consumer = 0; + /* Fundamentally, what we want to do is update consumer + * index and zero out the entry so producer can reuse it. + * Doing it naively at each consume would be as simple as: + * r->queue[r->consumer++] = NULL; + * if (unlikely(r->consumer >= r->size)) + * r->consumer = 0; + * but that is suboptimal when the ring is full as producer is writing + * out new entries in the same cache line. Defer these updates until a + * batch of entries has been consumed. + */ + int head = r->consumer_head++; + + /* Once we have processed enough entries invalidate them in + * the ring all at once so producer can reuse their space in the ring. + * We also do this when we reach end of the ring - not mandatory + * but helps keep the implementation simple. + */ + if (unlikely(r->consumer_head - r->consumer_tail >= r->batch || + r->consumer_head >= r->size)) { + /* Zero out entries in the reverse order: this way we touch the + * cache line that producer might currently be reading the last; + * producer won't make progress and touch other cache lines + * besides the first one until we write out all entries. + */ + while (likely(head >= r->consumer_tail)) + r->queue[head--] = NULL; + r->consumer_tail = r->consumer_head; + } + if (unlikely(r->consumer_head >= r->size)) { + r->consumer_head = 0; + r->consumer_tail = 0; + } } static inline void *__ptr_ring_consume(struct ptr_ring *r) @@ -345,14 +376,27 @@ static inline void **__ptr_ring_init_queue_alloc(int size, gfp_t gfp) return kzalloc(ALIGN(size * sizeof(void *), SMP_CACHE_BYTES), gfp); } +static inline void __ptr_ring_set_size(struct ptr_ring *r, int size) +{ + r->size = size; + r->batch = SMP_CACHE_BYTES * 2 / sizeof(*(r->queue)); + /* We need to set batch at least to 1 to make logic + * in __ptr_ring_discard_one work correctly. + * Batching too much (because ring is small) would cause a lot of + * burstiness. Needs tuning, for now disable batching. 
+ */ + if (r->batch > r->size / 2 || !r->batch) + r->batch = 1; +} + static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp) { r->queue = __ptr_ring_init_queue_alloc(size, gfp); if (!r->queue) return -ENOMEM; - r->size = size; - r->producer = r->consumer = 0; + __ptr_ring_set_size(r, size); + r->producer = r->consumer_head = r->consumer_tail = 0; spin_lock_init(&r->producer_lock); spin_lock_init(&r->consumer_lock); @@ -373,9 +417,10 @@ static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue, else if (destroy) destroy(ptr); - r->size = size; + __ptr_ring_set_size(r, size); r->producer = producer; - r->consumer = 0; + r->consumer_head = 0; + r->consumer_tail = 0; old = r->queue; r->queue = queue; -- cgit v1.2.3 From 3ae3d67ba705c754a3c91ac009f9ce73a0e7286a Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 10 May 2017 15:01:30 -0600 Subject: libnvdimm: add an atomic vs process context flag to rw_bytes nsio_rw_bytes can clear media errors, but this cannot be done while we are in an atomic context due to locking within ACPI. From the BTT, ->rw_bytes may be called either from atomic or process context depending on whether the calls happen during initialization or during IO. During init, we want to ensure error clearing happens, and the flag marking process context allows nsio_rw_bytes to do that. When called during IO, we're in atomic context, and error clearing can be skipped. Cc: Dan Williams Signed-off-by: Vishal Verma Signed-off-by: Dan Williams --- drivers/nvdimm/blk.c | 3 ++- drivers/nvdimm/btt.c | 67 +++++++++++++++++++++++++---------------------- drivers/nvdimm/btt_devs.c | 2 +- drivers/nvdimm/claim.c | 6 +++-- drivers/nvdimm/nd.h | 1 + drivers/nvdimm/pfn_devs.c | 4 +-- include/linux/nd.h | 12 +++++---- 7 files changed, 53 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 9faaa9694d87..822198a75e96 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -218,7 +218,8 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio) } static int nsblk_rw_bytes(struct nd_namespace_common *ndns, - resource_size_t offset, void *iobuf, size_t n, int rw) + resource_size_t offset, void *iobuf, size_t n, int rw, + unsigned long flags) { struct nd_namespace_blk *nsblk = to_nd_namespace_blk(&ndns->dev); struct nd_blk_region *ndbr = to_ndbr(nsblk); diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 368795aad5c9..aa977cd4869d 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -32,25 +32,25 @@ enum log_ent_request { }; static int arena_read_bytes(struct arena_info *arena, resource_size_t offset, - void *buf, size_t n) + void *buf, size_t n, unsigned long flags) { struct nd_btt *nd_btt = arena->nd_btt; struct nd_namespace_common *ndns = nd_btt->ndns; /* arena offsets are 4K from the base of the device */ offset += SZ_4K; - return nvdimm_read_bytes(ndns, offset, buf, n); + return nvdimm_read_bytes(ndns, offset, buf, n, flags); } static int arena_write_bytes(struct arena_info *arena, resource_size_t offset, - void *buf, size_t n) + void *buf, size_t n, unsigned long flags) { struct nd_btt *nd_btt = arena->nd_btt; struct nd_namespace_common *ndns = nd_btt->ndns; /* arena offsets are 4K from the base of the device */ offset += SZ_4K; - return nvdimm_write_bytes(ndns, offset, buf, n); + return nvdimm_write_bytes(ndns, offset, buf, n, flags); } static int btt_info_write(struct arena_info *arena, struct btt_sb *super) @@ -58,19 +58,19 @@ static int 
btt_info_write(struct arena_info *arena, struct btt_sb *super) int ret; ret = arena_write_bytes(arena, arena->info2off, super, - sizeof(struct btt_sb)); + sizeof(struct btt_sb), 0); if (ret) return ret; return arena_write_bytes(arena, arena->infooff, super, - sizeof(struct btt_sb)); + sizeof(struct btt_sb), 0); } static int btt_info_read(struct arena_info *arena, struct btt_sb *super) { WARN_ON(!super); return arena_read_bytes(arena, arena->infooff, super, - sizeof(struct btt_sb)); + sizeof(struct btt_sb), 0); } /* @@ -79,16 +79,17 @@ static int btt_info_read(struct arena_info *arena, struct btt_sb *super) * mapping is in little-endian * mapping contains 'E' and 'Z' flags as desired */ -static int __btt_map_write(struct arena_info *arena, u32 lba, __le32 mapping) +static int __btt_map_write(struct arena_info *arena, u32 lba, __le32 mapping, + unsigned long flags) { u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE); WARN_ON(lba >= arena->external_nlba); - return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE); + return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE, flags); } static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping, - u32 z_flag, u32 e_flag) + u32 z_flag, u32 e_flag, unsigned long rwb_flags) { u32 ze; __le32 mapping_le; @@ -127,11 +128,11 @@ static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping, } mapping_le = cpu_to_le32(mapping); - return __btt_map_write(arena, lba, mapping_le); + return __btt_map_write(arena, lba, mapping_le, rwb_flags); } static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping, - int *trim, int *error) + int *trim, int *error, unsigned long rwb_flags) { int ret; __le32 in; @@ -140,7 +141,7 @@ static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping, WARN_ON(lba >= arena->external_nlba); - ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE); + ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE, rwb_flags); if (ret) return ret; @@ -189,7 +190,7 @@ static int btt_log_read_pair(struct arena_info *arena, u32 lane, WARN_ON(!ent); return arena_read_bytes(arena, arena->logoff + (2 * lane * LOG_ENT_SIZE), ent, - 2 * LOG_ENT_SIZE); + 2 * LOG_ENT_SIZE, 0); } static struct dentry *debugfs_root; @@ -335,7 +336,7 @@ static int btt_log_read(struct arena_info *arena, u32 lane, * btt_flog_write is the wrapper for updating the freelist elements */ static int __btt_log_write(struct arena_info *arena, u32 lane, - u32 sub, struct log_entry *ent) + u32 sub, struct log_entry *ent, unsigned long flags) { int ret; /* @@ -350,13 +351,13 @@ static int __btt_log_write(struct arena_info *arena, u32 lane, void *src = ent; /* split the 16B write into atomic, durable halves */ - ret = arena_write_bytes(arena, ns_off, src, log_half); + ret = arena_write_bytes(arena, ns_off, src, log_half, flags); if (ret) return ret; ns_off += log_half; src += log_half; - return arena_write_bytes(arena, ns_off, src, log_half); + return arena_write_bytes(arena, ns_off, src, log_half, flags); } static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub, @@ -364,7 +365,7 @@ static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub, { int ret; - ret = __btt_log_write(arena, lane, sub, ent); + ret = __btt_log_write(arena, lane, sub, ent, NVDIMM_IO_ATOMIC); if (ret) return ret; @@ -397,7 +398,7 @@ static int btt_map_init(struct arena_info *arena) size_t size = min(mapsize, chunk_size); ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf, - size); + size, 0); if (ret) goto free; @@ 
-428,10 +429,10 @@ static int btt_log_init(struct arena_info *arena) log.old_map = cpu_to_le32(arena->external_nlba + i); log.new_map = cpu_to_le32(arena->external_nlba + i); log.seq = cpu_to_le32(LOG_SEQ_INIT); - ret = __btt_log_write(arena, i, 0, &log); + ret = __btt_log_write(arena, i, 0, &log, 0); if (ret) return ret; - ret = __btt_log_write(arena, i, 1, &zerolog); + ret = __btt_log_write(arena, i, 1, &zerolog, 0); if (ret) return ret; } @@ -470,7 +471,7 @@ static int btt_freelist_init(struct arena_info *arena) /* Check if map recovery is needed */ ret = btt_map_read(arena, le32_to_cpu(log_new.lba), &map_entry, - NULL, NULL); + NULL, NULL, 0); if (ret) return ret; if ((le32_to_cpu(log_new.new_map) != map_entry) && @@ -480,7 +481,7 @@ static int btt_freelist_init(struct arena_info *arena) * to complete the map write. So fix up the map. */ ret = btt_map_write(arena, le32_to_cpu(log_new.lba), - le32_to_cpu(log_new.new_map), 0, 0); + le32_to_cpu(log_new.new_map), 0, 0, 0); if (ret) return ret; } @@ -875,7 +876,7 @@ static int btt_data_read(struct arena_info *arena, struct page *page, u64 nsoff = to_namespace_offset(arena, lba); void *mem = kmap_atomic(page); - ret = arena_read_bytes(arena, nsoff, mem + off, len); + ret = arena_read_bytes(arena, nsoff, mem + off, len, NVDIMM_IO_ATOMIC); kunmap_atomic(mem); return ret; @@ -888,7 +889,7 @@ static int btt_data_write(struct arena_info *arena, u32 lba, u64 nsoff = to_namespace_offset(arena, lba); void *mem = kmap_atomic(page); - ret = arena_write_bytes(arena, nsoff, mem + off, len); + ret = arena_write_bytes(arena, nsoff, mem + off, len, NVDIMM_IO_ATOMIC); kunmap_atomic(mem); return ret; @@ -931,10 +932,12 @@ static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip, mem = kmap_atomic(bv.bv_page); if (rw) ret = arena_write_bytes(arena, meta_nsoff, - mem + bv.bv_offset, cur_len); + mem + bv.bv_offset, cur_len, + NVDIMM_IO_ATOMIC); else ret = arena_read_bytes(arena, meta_nsoff, - mem + bv.bv_offset, cur_len); + mem + bv.bv_offset, cur_len, + NVDIMM_IO_ATOMIC); kunmap_atomic(mem); if (ret) @@ -976,7 +979,8 @@ static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip, cur_len = min(btt->sector_size, len); - ret = btt_map_read(arena, premap, &postmap, &t_flag, &e_flag); + ret = btt_map_read(arena, premap, &postmap, &t_flag, &e_flag, + NVDIMM_IO_ATOMIC); if (ret) goto out_lane; @@ -1006,7 +1010,7 @@ static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip, barrier(); ret = btt_map_read(arena, premap, &new_map, &t_flag, - &e_flag); + &e_flag, NVDIMM_IO_ATOMIC); if (ret) goto out_rtt; @@ -1093,7 +1097,8 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip, } lock_map(arena, premap); - ret = btt_map_read(arena, premap, &old_postmap, NULL, NULL); + ret = btt_map_read(arena, premap, &old_postmap, NULL, NULL, + NVDIMM_IO_ATOMIC); if (ret) goto out_map; if (old_postmap >= arena->internal_nlba) { @@ -1110,7 +1115,7 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip, if (ret) goto out_map; - ret = btt_map_write(arena, premap, new_postmap, 0, 0); + ret = btt_map_write(arena, premap, new_postmap, 0, 0, 0); if (ret) goto out_map; diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c index 4b76af2b8715..ae00dc0d9791 100644 --- a/drivers/nvdimm/btt_devs.c +++ b/drivers/nvdimm/btt_devs.c @@ -273,7 +273,7 @@ static int __nd_btt_probe(struct nd_btt *nd_btt, if (!btt_sb || !ndns || !nd_btt) return -ENODEV; - if (nvdimm_read_bytes(ndns, SZ_4K, btt_sb, 
sizeof(*btt_sb))) + if (nvdimm_read_bytes(ndns, SZ_4K, btt_sb, sizeof(*btt_sb), 0)) return -ENXIO; if (nvdimm_namespace_capacity(ndns) < SZ_16M) diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c index 93d128da1c92..7ceb5fa4f2a1 100644 --- a/drivers/nvdimm/claim.c +++ b/drivers/nvdimm/claim.c @@ -228,7 +228,8 @@ u64 nd_sb_checksum(struct nd_gen_sb *nd_gen_sb) EXPORT_SYMBOL(nd_sb_checksum); static int nsio_rw_bytes(struct nd_namespace_common *ndns, - resource_size_t offset, void *buf, size_t size, int rw) + resource_size_t offset, void *buf, size_t size, int rw, + unsigned long flags) { struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512); @@ -259,7 +260,8 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns, * work around this collision. */ if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512) - && (!ndns->claim || !is_nd_btt(ndns->claim))) { + && !(flags & NVDIMM_IO_ATOMIC) + && !ndns->claim) { long cleared; cleared = nvdimm_clear_poison(&ndns->dev, diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index 77d032192bf7..03852d738eec 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -31,6 +31,7 @@ enum { ND_MAX_LANES = 256, SECTOR_SHIFT = 9, INT_LBASIZE_ALIGNMENT = 64, + NVDIMM_IO_ATOMIC = 1, }; struct nd_poison { diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 335c8175410b..a6c403600d19 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -357,7 +357,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) if (!is_nd_pmem(nd_pfn->dev.parent)) return -ENODEV; - if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb))) + if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb), 0)) return -ENXIO; if (memcmp(pfn_sb->signature, sig, PFN_SIG_LEN) != 0) @@ -662,7 +662,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb); pfn_sb->checksum = cpu_to_le64(checksum); - return nvdimm_write_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb)); + return nvdimm_write_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb), 0); } /* diff --git a/include/linux/nd.h b/include/linux/nd.h index fa66aeed441a..194b8e002ea7 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -48,7 +48,7 @@ struct nd_namespace_common { struct device dev; struct device *claim; int (*rw_bytes)(struct nd_namespace_common *, resource_size_t offset, - void *buf, size_t size, int rw); + void *buf, size_t size, int rw, unsigned long flags); }; static inline struct nd_namespace_common *to_ndns(struct device *dev) @@ -134,9 +134,10 @@ static inline struct nd_namespace_blk *to_nd_namespace_blk(const struct device * * @buf is up-to-date upon return from this routine. */ static inline int nvdimm_read_bytes(struct nd_namespace_common *ndns, - resource_size_t offset, void *buf, size_t size) + resource_size_t offset, void *buf, size_t size, + unsigned long flags) { - return ndns->rw_bytes(ndns, offset, buf, size, READ); + return ndns->rw_bytes(ndns, offset, buf, size, READ, flags); } /** @@ -152,9 +153,10 @@ static inline int nvdimm_read_bytes(struct nd_namespace_common *ndns, * to media is handled internal to the @ndns driver, if at all. 
*/ static inline int nvdimm_write_bytes(struct nd_namespace_common *ndns, - resource_size_t offset, void *buf, size_t size) + resource_size_t offset, void *buf, size_t size, + unsigned long flags) { - return ndns->rw_bytes(ndns, offset, buf, size, WRITE); + return ndns->rw_bytes(ndns, offset, buf, size, WRITE, flags); } #define MODULE_ALIAS_ND_DEVICE(type) \ -- cgit v1.2.3 From df3ed932394488e57e72dd0e73c224d1804fdc8f Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 11 May 2017 10:15:10 -0500 Subject: Partially Revert "of: fix sparse warnings in fdt, irq, reserved mem, and resolver code" A change to function pointers that was meant to address a sparse warning turned out to cause hundreds of new gcc-7 warnings: include/linux/of_irq.h:11:13: error: type qualifiers ignored on function return type [-Werror=ignored-qualifiers] drivers/of/of_reserved_mem.c: In function '__reserved_mem_init_node': drivers/of/of_reserved_mem.c:200:7: error: type qualifiers ignored on function return type [-Werror=ignored-qualifiers] int const (*initfn)(struct reserved_mem *rmem) = i->data; Turns out the sparse warnings were spurious and have been fixed in upstream sparse since 0.5.0 in commit "sparse: treat function pointers as pointers to const data". This partially reverts commit 17a70355ea576843a7ac851f1db26872a50b2850. Fixes: 17a70355ea57 ("of: fix sparse warnings in fdt, irq, reserved mem, and resolver code") Reported-by: Arnd Bergmann Signed-off-by: Rob Herring --- drivers/of/of_reserved_mem.c | 2 +- include/linux/of_irq.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 4dec07ea510f..d507c3569a88 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -197,7 +197,7 @@ static int __init __reserved_mem_init_node(struct reserved_mem *rmem) const struct of_device_id *i; for (i = __reservedmem_of_table; i < &__rmem_of_table_sentinel; i++) { - int const (*initfn)(struct reserved_mem *rmem) = i->data; + reservedmem_of_init_fn initfn = i->data; const char *compat = i->compatible; if (!of_flat_dt_is_compatible(rmem->fdt_node, compat)) diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index ec6b11deb773..1e0deb8e8494 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -8,7 +8,7 @@ #include #include -typedef int const (*of_irq_init_cb_t)(struct device_node *, struct device_node *); +typedef int (*of_irq_init_cb_t)(struct device_node *, struct device_node *); /* * Workarounds only applied to 32bit powermac machines -- cgit v1.2.3 From d1174416747d790d750742d0514915deeed93acf Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 10 May 2017 11:22:52 -0700 Subject: bpf: Track alignment of register values in the verifier. Currently if we add only constant values to pointers we can fully validate the alignment, and properly check if we need to reject the program on !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS architectures. However, once an unknown value is introduced we only allow byte sized memory accesses which is too restrictive. Add logic to track the known minimum alignment of register values, and propagate this state into registers containing pointers. The most common paradigm that makes use of this new logic is computing the transport header using the IP header length field. For example: struct ethhdr *ep = skb->data; struct iphdr *iph = (struct iphdr *) (ep + 1); struct tcphdr *th; ... 
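/* Note (illustrative comment, not part of the original example):
 * iph->ihl is the IP header length in 32-bit words, so the "n * 4"
 * computed next is a byte count and always a multiple of 4 -- this
 * is what gives the register holding it a min_align of 4. */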
n = iph->ihl; th = ((void *)iph + (n * 4)); port = th->dest; The existing code will reject the load of th->dest because it cannot validate that the alignment is at least 2 once "n * 4" is added to the packet pointer. In the new code, the register holding "n * 4" will have a reg->min_align value of 4, because any value multiplied by 4 will be at least 4-byte aligned. (Actually, the eBPF code emitted by the compiler in this case is most likely to use a shift left by 2, but the end result is identical.) At the critical addition: th = ((void *)iph + (n * 4)); The register holding 'th' will start with a reg->off value of 14. The pointer addition will transform that reg into something that looks like: reg->aux_off = 14 reg->aux_off_align = 4 Next, the verifier will look at the th->dest load, and it will see a load offset of 2, and first check: if (reg->aux_off_align % size) which will pass because aux_off_align is 4. reg_off will be computed: reg_off = reg->off; ... reg_off += reg->aux_off; plus we have off==2, and it will thus check: if ((NET_IP_ALIGN + reg_off + off) % size != 0) which evaluates to: if ((NET_IP_ALIGN + 14 + 2) % size != 0) On strict alignment architectures, NET_IP_ALIGN is 2, thus: if ((2 + 14 + 2) % size != 0) which passes. These pointer transformations and checks work regardless of whether the constant offset or the variable with known alignment is added first to the pointer register. Signed-off-by: David S. Miller Acked-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 3 ++ kernel/bpf/verifier.c | 108 +++++++++++++++++++++++++++++++++++-------- 2 files changed, 92 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5efb4db44e1e..7c6a51924afc 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -40,6 +40,9 @@ struct bpf_reg_state { */ s64 min_value; u64 max_value; + u32 min_align; + u32 aux_off; + u32 aux_off_align; }; enum bpf_stack_slot_type { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c5b56c92f8e2..cc7b626fa447 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -241,6 +241,12 @@ static void print_verifier_state(struct bpf_verifier_state *state) if (reg->max_value != BPF_REGISTER_MAX_RANGE) verbose(",max_value=%llu", (unsigned long long)reg->max_value); + if (reg->min_align) + verbose(",min_align=%u", reg->min_align); + if (reg->aux_off) + verbose(",aux_off=%u", reg->aux_off); + if (reg->aux_off_align) + verbose(",aux_off_align=%u", reg->aux_off_align); } for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] == STACK_SPILL) @@ -466,6 +472,9 @@ static void init_reg_state(struct bpf_reg_state *regs) regs[i].imm = 0; regs[i].min_value = BPF_REGISTER_MIN_RANGE; regs[i].max_value = BPF_REGISTER_MAX_RANGE; + regs[i].min_align = 0; + regs[i].aux_off = 0; + regs[i].aux_off_align = 0; } /* frame pointer */ @@ -492,6 +501,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) { regs[regno].min_value = BPF_REGISTER_MIN_RANGE; regs[regno].max_value = BPF_REGISTER_MAX_RANGE; + regs[regno].min_align = 0; } static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs, @@ -779,17 +789,28 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) } static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, - int off, int size) + int off, int size, bool strict) { - if (reg->id && size != 1) { - verbose("Unknown alignment.
Only byte-sized access allowed in packet access.\n"); - return -EACCES; + int reg_off; + + /* Byte size accesses are always allowed. */ + if (!strict || size == 1) + return 0; + + reg_off = reg->off; + if (reg->id) { + if (reg->aux_off_align % size) { + verbose("Packet access is only %u byte aligned, %d byte access not allowed\n", + reg->aux_off_align, size); + return -EACCES; + } + reg_off += reg->aux_off; } /* skb->data is NET_IP_ALIGN-ed */ - if ((NET_IP_ALIGN + reg->off + off) % size != 0) { + if ((NET_IP_ALIGN + reg_off + off) % size != 0) { verbose("misaligned packet access off %d+%d+%d size %d\n", - NET_IP_ALIGN, reg->off, off, size); + NET_IP_ALIGN, reg_off, off, size); return -EACCES; } @@ -797,9 +818,9 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, } static int check_val_ptr_alignment(const struct bpf_reg_state *reg, - int size) + int size, bool strict) { - if (size != 1) { + if (strict && size != 1) { verbose("Unknown alignment. Only byte-sized access allowed in value access.\n"); return -EACCES; } @@ -810,13 +831,16 @@ static int check_val_ptr_alignment(const struct bpf_reg_state *reg, static int check_ptr_alignment(const struct bpf_reg_state *reg, int off, int size) { + bool strict = false; + + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) + strict = true; + switch (reg->type) { case PTR_TO_PACKET: - return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 : - check_pkt_ptr_alignment(reg, off, size); + return check_pkt_ptr_alignment(reg, off, size, strict); case PTR_TO_MAP_VALUE_ADJ: - return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 : - check_val_ptr_alignment(reg, size); + return check_val_ptr_alignment(reg, size, strict); default: if (off % size != 0) { verbose("misaligned access off %d size %d\n", @@ -883,6 +907,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, value_regno); /* note that reg.[id|off|range] == 0 */ state->regs[value_regno].type = reg_type; + state->regs[value_regno].aux_off = 0; + state->regs[value_regno].aux_off_align = 0; } } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { @@ -1455,6 +1481,8 @@ add_imm: */ dst_reg->off += imm; } else { + bool had_id; + if (src_reg->type == PTR_TO_PACKET) { /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */ tmp_reg = *dst_reg; /* save r7 state */ @@ -1488,14 +1516,23 @@ add_imm: src_reg->imm); return -EACCES; } + + had_id = (dst_reg->id != 0); + /* dst_reg stays as pkt_ptr type and since some positive * integer value was added to the pointer, increment its 'id' */ dst_reg->id = ++env->id_gen; - /* something was added to pkt_ptr, set range and off to zero */ + /* something was added to pkt_ptr, set range to zero */ + dst_reg->aux_off = dst_reg->off; dst_reg->off = 0; dst_reg->range = 0; + if (had_id) + dst_reg->aux_off_align = min(dst_reg->aux_off_align, + src_reg->min_align); + else + dst_reg->aux_off_align = src_reg->min_align; } return 0; } @@ -1669,6 +1706,13 @@ static void check_reg_overflow(struct bpf_reg_state *reg) reg->min_value = BPF_REGISTER_MIN_RANGE; } +static u32 calc_align(u32 imm) +{ + if (!imm) + return 1U << 31; + return imm - ((imm - 1) & imm); +} + static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { @@ -1676,8 +1720,10 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, s64 min_val = BPF_REGISTER_MIN_RANGE; u64 max_val = BPF_REGISTER_MAX_RANGE; u8 opcode = BPF_OP(insn->code); + u32 dst_align, src_align; dst_reg = ®s[insn->dst_reg]; + src_align = 0; if 
(BPF_SRC(insn->code) == BPF_X) { check_reg_overflow(®s[insn->src_reg]); min_val = regs[insn->src_reg].min_value; @@ -1693,12 +1739,18 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, regs[insn->src_reg].type != UNKNOWN_VALUE) { min_val = BPF_REGISTER_MIN_RANGE; max_val = BPF_REGISTER_MAX_RANGE; + src_align = 0; + } else { + src_align = regs[insn->src_reg].min_align; } } else if (insn->imm < BPF_REGISTER_MAX_RANGE && (s64)insn->imm > BPF_REGISTER_MIN_RANGE) { min_val = max_val = insn->imm; + src_align = calc_align(insn->imm); } + dst_align = dst_reg->min_align; + /* We don't know anything about what was done to this register, mark it * as unknown. */ @@ -1723,18 +1775,21 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_reg->min_value += min_val; if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value += max_val; + dst_reg->min_align = min(src_align, dst_align); break; case BPF_SUB: if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) dst_reg->min_value -= min_val; if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value -= max_val; + dst_reg->min_align = min(src_align, dst_align); break; case BPF_MUL: if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) dst_reg->min_value *= min_val; if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value *= max_val; + dst_reg->min_align = max(src_align, dst_align); break; case BPF_AND: /* Disallow AND'ing of negative numbers, ain't nobody got time @@ -1746,17 +1801,23 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, else dst_reg->min_value = 0; dst_reg->max_value = max_val; + dst_reg->min_align = max(src_align, dst_align); break; case BPF_LSH: /* Gotta have special overflow logic here, if we're shifting * more than MAX_RANGE then just assume we have an invalid * range. */ - if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) + if (min_val > ilog2(BPF_REGISTER_MAX_RANGE)) { dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value <<= min_val; - + dst_reg->min_align = 1; + } else { + if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) + dst_reg->min_value <<= min_val; + if (!dst_reg->min_align) + dst_reg->min_align = 1; + dst_reg->min_align <<= min_val; + } if (max_val > ilog2(BPF_REGISTER_MAX_RANGE)) dst_reg->max_value = BPF_REGISTER_MAX_RANGE; else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) @@ -1766,11 +1827,19 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* RSH by a negative number is undefined, and the BPF_RSH is an * unsigned shift, so make the appropriate casts. */ - if (min_val < 0 || dst_reg->min_value < 0) + if (min_val < 0 || dst_reg->min_value < 0) { dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - else + } else { dst_reg->min_value = (u64)(dst_reg->min_value) >> min_val; + } + if (min_val < 0) { + dst_reg->min_align = 1; + } else { + dst_reg->min_align >>= (u64) min_val; + if (!dst_reg->min_align) + dst_reg->min_align = 1; + } if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) dst_reg->max_value >>= max_val; break; @@ -1872,6 +1941,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) regs[insn->dst_reg].imm = insn->imm; regs[insn->dst_reg].max_value = insn->imm; regs[insn->dst_reg].min_value = insn->imm; + regs[insn->dst_reg].min_align = calc_align(insn->imm); } } else if (opcode > BPF_END) { -- cgit v1.2.3 From e07b98d9bffe410019dfcf62c3428d4a96c56a2c Mon Sep 17 00:00:00 2001 From: "David S. 
Miller" Date: Wed, 10 May 2017 11:38:07 -0700 Subject: bpf: Add strict alignment flag for BPF_PROG_LOAD. Add a new field, "prog_flags", and an initial flag value BPF_F_STRICT_ALIGNMENT. When set, the verifier will enforce strict pointer alignment regardless of the setting of CONFIG_EFFICIENT_UNALIGNED_ACCESS. The verifier, in this mode, will also use a fixed value of "2" in place of NET_IP_ALIGN. This facilitates test cases that will exercise and validate this part of the verifier even when run on architectures where alignment doesn't matter. Signed-off-by: David S. Miller Acked-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 1 + include/uapi/linux/bpf.h | 8 ++++++++ kernel/bpf/syscall.c | 5 ++++- kernel/bpf/verifier.c | 23 +++++++++++++++++------ tools/build/feature/test-bpf.c | 1 + tools/include/uapi/linux/bpf.h | 11 +++++++++-- 6 files changed, 40 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7c6a51924afc..d5093b52b485 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -90,6 +90,7 @@ struct bpf_verifier_env { struct bpf_prog *prog; /* eBPF program being verified */ struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ int stack_size; /* number of states to be processed */ + bool strict_alignment; /* perform strict pointer alignment checks */ struct bpf_verifier_state cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 945a1f5f63c5..94dfa9def355 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -132,6 +132,13 @@ enum bpf_attach_type { */ #define BPF_F_ALLOW_OVERRIDE (1U << 0) +/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the + * verifier will perform strict alignment checking as if the kernel + * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set, + * and NET_IP_ALIGN defined to 2. 
+ */ +#define BPF_F_STRICT_ALIGNMENT (1U << 0) + #define BPF_PSEUDO_MAP_FD 1 /* flags for BPF_MAP_UPDATE_ELEM command */ @@ -177,6 +184,7 @@ union bpf_attr { __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ + __u32 prog_flags; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fd2411fd6914..265a0d854e33 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -783,7 +783,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) EXPORT_SYMBOL_GPL(bpf_prog_get_type); /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD kern_version +#define BPF_PROG_LOAD_LAST_FIELD prog_flags static int bpf_prog_load(union bpf_attr *attr) { @@ -796,6 +796,9 @@ static int bpf_prog_load(union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; + if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT) + return -EINVAL; + /* copy eBPF program license from user space */ if (strncpy_from_user(license, u64_to_user_ptr(attr->license), sizeof(license) - 1) < 0) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ff2bfe1d656a..e74fb1b87855 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -791,6 +791,7 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, int off, int size, bool strict) { + int ip_align; int reg_off; /* Byte size accesses are always allowed. */ @@ -807,10 +808,14 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, reg_off += reg->aux_off; } - /* skb->data is NET_IP_ALIGN-ed */ - if ((NET_IP_ALIGN + reg_off + off) % size != 0) { + /* skb->data is NET_IP_ALIGN-ed, but for strict alignment checking + * we force this to 2 which is universally what architectures use + * when they don't set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. + */ + ip_align = strict ? 
2 : NET_IP_ALIGN; + if ((ip_align + reg_off + off) % size != 0) { verbose("misaligned packet access off %d+%d+%d size %d\n", - NET_IP_ALIGN, reg_off, off, size); + ip_align, reg_off, off, size); return -EACCES; } @@ -828,10 +833,11 @@ static int check_val_ptr_alignment(const struct bpf_reg_state *reg, return 0; } -static int check_ptr_alignment(const struct bpf_reg_state *reg, +static int check_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int off, int size) { - bool strict = false; + bool strict = env->strict_alignment; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) strict = true; @@ -873,7 +879,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off, if (size < 0) return size; - err = check_ptr_alignment(reg, off, size); + err = check_ptr_alignment(env, reg, off, size); if (err) return err; @@ -3568,6 +3574,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) } else { log_level = 0; } + if (attr->prog_flags & BPF_F_STRICT_ALIGNMENT) + env->strict_alignment = true; + else + env->strict_alignment = false; ret = replace_map_fd_with_map_ptr(env); if (ret < 0) @@ -3673,6 +3683,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, mutex_lock(&bpf_verifier_lock); log_level = 0; + env->strict_alignment = false; env->explored_states = kcalloc(env->prog->len, sizeof(struct bpf_verifier_state_list *), diff --git a/tools/build/feature/test-bpf.c b/tools/build/feature/test-bpf.c index ebc6dceddb58..7598361ef1f1 100644 --- a/tools/build/feature/test-bpf.c +++ b/tools/build/feature/test-bpf.c @@ -29,6 +29,7 @@ int main(void) attr.log_size = 0; attr.log_level = 0; attr.kern_version = 0; + attr.prog_flags = 0; /* * Test existence of __NR_bpf and BPF_PROG_LOAD. diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e553529929f6..94dfa9def355 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -132,6 +132,13 @@ enum bpf_attach_type { */ #define BPF_F_ALLOW_OVERRIDE (1U << 0) +/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the + * verifier will perform strict alignment checking as if the kernel + * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set, + * and NET_IP_ALIGN defined to 2. + */ +#define BPF_F_STRICT_ALIGNMENT (1U << 0) + #define BPF_PSEUDO_MAP_FD 1 /* flags for BPF_MAP_UPDATE_ELEM command */ @@ -177,6 +184,7 @@ union bpf_attr { __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ + __u32 prog_flags; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -481,8 +489,7 @@ union bpf_attr { * u32 bpf_get_socket_uid(skb) * Get the owner uid of the socket stored inside sk_buff. * @skb: pointer to skb - * Return: uid of the socket owner on success or 0 if the socket pointer - * inside sk_buff is NULL + * Return: uid of the socket owner on success or overflowuid if failed. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ -- cgit v1.2.3 From d67b9cd28c1d7f82c2e5e727731ea7c89b23a0a8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 12 May 2017 01:04:46 +0200 Subject: xdp: refine xdp api with regards to generic xdp While working on the iproute2 generic XDP frontend, I noticed that as of right now it's possible to have native *and* generic XDP programs loaded both at the same time for the case when a driver supports native XDP. 
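As a stand-alone illustration of the policy this change settles on (a user-space sketch with invented types, not kernel code; the real logic lives in dev_change_xdp_fd() as changed below): attaching one flavor while the other is active now fails instead of silently stacking:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_dev {
	bool native_attached;	/* program at the driver (ndo_xdp) hook */
	bool generic_attached;	/* program at the generic XDP hook */
};

/* Models the new one-flavor-at-a-time attach policy. */
static int xdp_attach(struct fake_dev *d, bool generic)
{
	bool other = generic ? d->native_attached : d->generic_attached;

	if (other)
		return -EEXIST;	/* the other flavor is already loaded */
	if (generic)
		d->generic_attached = true;
	else
		d->native_attached = true;
	return 0;
}

int main(void)
{
	struct fake_dev d = { false, false };

	printf("%d\n", xdp_attach(&d, false));	/* 0: native attach succeeds */
	printf("%d\n", xdp_attach(&d, true));	/* -EEXIST: generic is refused */
	return 0;
}

In this model, switching flavors requires detaching the existing program first.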
The intended model for generic XDP from b5cdae3291f7 ("net: Generic XDP") is, however, that only one out of the two can be present at once, which is also indicated as such in the XDP netlink dump part. The main rationale for generic XDP is to ease accessibility (in case a driver does not yet have XDP support) and to generically provide a semantic model as an example for driver developers wanting to add XDP support. The generic XDP option for an XDP-aware driver can still be useful for comparing and testing both implementations. However, it is not intended to have a second XDP processing stage or layer with exactly the same functionality as the first, native stage. The only reason would be to have a partial fallback for future XDP features that are not yet supported in the native implementation, and we probably shouldn't strive for such a fallback anyway; instead, we should encourage native feature support in the first place. Given there's currently no such fallback issue or use case, let's not go there yet if we don't need to. Therefore, change the semantics for loading XDP and bail out if the user tries to load a generic XDP program when a native one is present, and vice versa. Another alternative to bailing out would be to handle the transition from one flavor to the other gracefully, but that would require bringing the device down, exchanging both types of programs, and bringing it up again in order to avoid a tiny window where a packet could hit both hooks. Given this complicates the logic for just a debugging feature in the native case, I went with the simpler variant. For the dump, remove IFLA_XDP_FLAGS, which was added with b5cdae3291f7, and reuse IFLA_XDP_ATTACHED for indicating the mode. Dumping all or just a subset of the flags that were used for loading the XDP prog is suboptimal in the long run, since not all flags are useful for dumping, and if we start to reuse the same flag definitions for load and dump, we'll waste bit space. What we really want is just to dump the mode for now. Current IFLA_XDP_ATTACHED semantics are: nothing was installed (0), or a program is running at the native driver layer (1). Thus, add a mode that says that a program is running at the generic XDP layer (2). Applications will handle this fine in that older binaries will just indicate that something is attached at the XDP layer; effectively this is similar to the IFLA_XDP_FLAGS attr that we would have had, modulo the redundancy. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 8 +++++-- include/uapi/linux/if_link.h | 7 ++++++ net/core/dev.c | 55 +++++++++++++++++++++++++++++--------------- net/core/rtnetlink.c | 40 +++++++++++++++----------------- 4 files changed, 67 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9c23bd2efb56..3f39d27decf4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3296,11 +3296,15 @@ int dev_get_phys_port_id(struct net_device *dev, int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len); int dev_change_proto_down(struct net_device *dev, bool proto_down); -int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, - int fd, u32 flags); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); + +typedef int (*xdp_op_t)(struct net_device *dev, struct netdev_xdp *xdp); +int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, + int fd, u32 flags); +bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op); + int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); bool is_skb_forwardable(const struct net_device *dev, diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 549ac8a98b44..15ac20382aba 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -894,6 +894,13 @@ enum { XDP_FLAGS_SKB_MODE | \ XDP_FLAGS_DRV_MODE) +/* These are stored into IFLA_XDP_ATTACHED on dump. */ +enum { + XDP_ATTACHED_NONE = 0, + XDP_ATTACHED_DRV, + XDP_ATTACHED_SKB, +}; + enum { IFLA_XDP_UNSPEC, IFLA_XDP_FD, diff --git a/net/core/dev.c b/net/core/dev.c index e56cb71351d4..fca407b4a6ea 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6852,6 +6852,32 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); +bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op) +{ + struct netdev_xdp xdp; + + memset(&xdp, 0, sizeof(xdp)); + xdp.command = XDP_QUERY_PROG; + + /* Query must always succeed. 
*/ + WARN_ON(xdp_op(dev, &xdp) < 0); + return xdp.prog_attached; +} + +static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op, + struct netlink_ext_ack *extack, + struct bpf_prog *prog) +{ + struct netdev_xdp xdp; + + memset(&xdp, 0, sizeof(xdp)); + xdp.command = XDP_SETUP_PROG; + xdp.extack = extack; + xdp.prog = prog; + + return xdp_op(dev, &xdp); +} + /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device @@ -6864,43 +6890,34 @@ EXPORT_SYMBOL(dev_change_proto_down); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, u32 flags) { - int (*xdp_op)(struct net_device *dev, struct netdev_xdp *xdp); const struct net_device_ops *ops = dev->netdev_ops; struct bpf_prog *prog = NULL; - struct netdev_xdp xdp; + xdp_op_t xdp_op, xdp_chk; int err; ASSERT_RTNL(); - xdp_op = ops->ndo_xdp; + xdp_op = xdp_chk = ops->ndo_xdp; if (!xdp_op && (flags & XDP_FLAGS_DRV_MODE)) return -EOPNOTSUPP; if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE)) xdp_op = generic_xdp_install; + if (xdp_op == xdp_chk) + xdp_chk = generic_xdp_install; if (fd >= 0) { - if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) { - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_QUERY_PROG; - - err = xdp_op(dev, &xdp); - if (err < 0) - return err; - if (xdp.prog_attached) - return -EBUSY; - } + if (xdp_chk && __dev_xdp_attached(dev, xdp_chk)) + return -EEXIST; + if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && + __dev_xdp_attached(dev, xdp_op)) + return -EBUSY; prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); if (IS_ERR(prog)) return PTR_ERR(prog); } - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_SETUP_PROG; - xdp.extack = extack; - xdp.prog = prog; - - err = xdp_op(dev, &xdp); + err = dev_xdp_install(dev, xdp_op, extack, prog); if (err < 0 && prog) bpf_prog_put(prog); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index dda9f1636356..d7f82c3450b1 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -899,8 +899,7 @@ static size_t rtnl_port_size(const struct net_device *dev, static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ - nla_total_size(1) + /* XDP_ATTACHED */ - nla_total_size(4); /* XDP_FLAGS */ + nla_total_size(1); /* XDP_ATTACHED */ return xdp_size; } @@ -1247,37 +1246,34 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) return 0; } +static u8 rtnl_xdp_attached_mode(struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + ASSERT_RTNL(); + + if (rcu_access_pointer(dev->xdp_prog)) + return XDP_ATTACHED_SKB; + if (ops->ndo_xdp && __dev_xdp_attached(dev, ops->ndo_xdp)) + return XDP_ATTACHED_DRV; + + return XDP_ATTACHED_NONE; +} + static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *xdp; - u32 xdp_flags = 0; - u8 val = 0; int err; xdp = nla_nest_start(skb, IFLA_XDP); if (!xdp) return -EMSGSIZE; - if (rcu_access_pointer(dev->xdp_prog)) { - xdp_flags = XDP_FLAGS_SKB_MODE; - val = 1; - } else if (dev->netdev_ops->ndo_xdp) { - struct netdev_xdp xdp_op = {}; - - xdp_op.command = XDP_QUERY_PROG; - err = dev->netdev_ops->ndo_xdp(dev, &xdp_op); - if (err) - goto err_cancel; - val = xdp_op.prog_attached; - } - err = nla_put_u8(skb, IFLA_XDP_ATTACHED, val); + + err = nla_put_u8(skb, IFLA_XDP_ATTACHED, + rtnl_xdp_attached_mode(dev)); if (err) goto err_cancel; - if (xdp_flags) { - err = nla_put_u32(skb, IFLA_XDP_FLAGS, xdp_flags); - if (err) - goto err_cancel; - } nla_nest_end(skb, xdp); return 0; -- cgit v1.2.3 From 
572e0ca9b909339fbe017aaff1a225efb6db3b61 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Fri, 12 May 2017 15:46:29 -0700 Subject: time: delete current_fs_time() All uses of the current_fs_time() function have been replaced by other time interfaces. And, its use cases can be fulfilled by current_time() or ktime_get_* variants. Link: http://lkml.kernel.org/r/1491613030-11599-13-git-send-email-deepa.kernel@gmail.com Signed-off-by: Deepa Dinamani Reviewed-by: Arnd Bergmann Cc: John Stultz Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 - kernel/time/time.c | 14 -------------- 2 files changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0ad325ed71e8..803e5a9b2654 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1431,7 +1431,6 @@ static inline void i_gid_write(struct inode *inode, gid_t gid) inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid); } -extern struct timespec current_fs_time(struct super_block *sb); extern struct timespec current_time(struct inode *inode); /* diff --git a/kernel/time/time.c b/kernel/time/time.c index 6574bba44b55..49c73c6ed648 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -230,20 +230,6 @@ SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; } -/** - * current_fs_time - Return FS time - * @sb: Superblock. - * - * Return the current time truncated to the time granularity supported by - * the fs. - */ -struct timespec current_fs_time(struct super_block *sb) -{ - struct timespec now = current_kernel_time(); - return timespec_trunc(now, sb->s_time_gran); -} -EXPORT_SYMBOL(current_fs_time); - /* * Convert jiffies to milliseconds and back. * -- cgit v1.2.3 From 8594a21cf7a8318baedbedc3fcd2967a17ddeec0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 12 May 2017 15:46:41 -0700 Subject: mm, vmalloc: fix vmalloc users tracking properly Commit 1f5307b1e094 ("mm, vmalloc: properly track vmalloc users") has pulled asm/pgtable.h include dependency to linux/vmalloc.h and that turned out to be a bad idea for some architectures. E.g. m68k fails with In file included from arch/m68k/include/asm/pgtable_mm.h:145:0, from arch/m68k/include/asm/pgtable.h:4, from include/linux/vmalloc.h:9, from arch/m68k/kernel/module.c:9: arch/m68k/include/asm/mcf_pgtable.h: In function 'nocache_page': >> arch/m68k/include/asm/mcf_pgtable.h:339:43: error: 'init_mm' undeclared (first use in this function) #define pgd_offset_k(address) pgd_offset(&init_mm, address) as spotted by kernel build bot. nios2 fails for other reason In file included from include/asm-generic/io.h:767:0, from arch/nios2/include/asm/io.h:61, from include/linux/io.h:25, from arch/nios2/include/asm/pgtable.h:18, from include/linux/mm.h:70, from include/linux/pid_namespace.h:6, from include/linux/ptrace.h:9, from arch/nios2/include/uapi/asm/elf.h:23, from arch/nios2/include/asm/elf.h:22, from include/linux/elf.h:4, from include/linux/module.h:15, from init/main.c:16: include/linux/vmalloc.h: In function '__vmalloc_node_flags': include/linux/vmalloc.h:99:40: error: 'PAGE_KERNEL' undeclared (first use in this function); did you mean 'GFP_KERNEL'? which is due to the newly added #include , which on nios2 includes and thus and which again includes . Tweaking that around just turns out a bigger headache than necessary. This patch reverts 1f5307b1e094 and reimplements the original fix in a different way. 
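For intuition before the details that follow, here is a minimal user-space sketch of the caller-tracking pattern at stake, with made-up names rather than the kernel's; the whole point is where __builtin_return_address(0) gets evaluated:

#include <stdio.h>

/* Out-of-line worker: records which call site asked for memory. */
static void *alloc_tracked(unsigned long size, const void *caller)
{
	printf("alloc %lu bytes for caller %p\n", size, caller);
	return NULL;	/* the actual allocation is elided in this sketch */
}

/* Inline wrapper: the caller is captured at the inlined call site,
 * which is why the vmalloc* wrappers want to stay static inline. */
static inline void *alloc_inline(unsigned long size)
{
	return alloc_tracked(size, __builtin_return_address(0));
}

/* Exported-style variant: the caller is passed in explicitly, which
 * is what __vmalloc_node_flags_caller() provides for kvmalloc_node(). */
static void *alloc_caller(unsigned long size, const void *caller)
{
	return alloc_tracked(size, caller);
}

int main(void)
{
	alloc_inline(64);
	alloc_caller(128, __builtin_return_address(0));
	return 0;
}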
__vmalloc_node_flags can stay static inline which will cover vmalloc* functions. We only have one external user (kvmalloc_node) and we can export __vmalloc_node_flags_caller and provide the caller directly. This is much simpler and it doesn't really need any games with header files. [akpm@linux-foundation.org: coding-style fixes] [mhocko@kernel.org: revert old comment] Link: http://lkml.kernel.org/r/20170509211054.GB16325@dhcp22.suse.cz Fixes: 1f5307b1e094 ("mm, vmalloc: properly track vmalloc users") Link: http://lkml.kernel.org/r/20170509153702.GR6481@dhcp22.suse.cz Signed-off-by: Michal Hocko Cc: Tobias Klauser Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 21 ++++++--------------- mm/util.c | 3 ++- mm/vmalloc.c | 19 ++++++++++++++++++- 3 files changed, 26 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 0328ce003992..2d92dd002abd 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -6,7 +6,6 @@ #include #include #include /* pgprot_t */ -#include /* PAGE_KERNEL */ #include struct vm_area_struct; /* vma defining user mapping in mm_types.h */ @@ -83,22 +82,14 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, const void *caller); #ifndef CONFIG_MMU extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); -#else -extern void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, const void *caller); - -/* - * We really want to have this inlined due to caller tracking. This - * function is used by the highlevel vmalloc apis and so we want to track - * their callers and inlining will achieve that. - */ -static inline void *__vmalloc_node_flags(unsigned long size, - int node, gfp_t flags) +static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, + gfp_t flags, void *caller) { - return __vmalloc_node(size, 1, flags, PAGE_KERNEL, - node, __builtin_return_address(0)); + return __vmalloc_node_flags(size, node, flags); } +#else +extern void *__vmalloc_node_flags_caller(unsigned long size, + int node, gfp_t flags, void *caller); #endif extern void vfree(const void *addr); diff --git a/mm/util.c b/mm/util.c index 718154debc87..464df3489903 100644 --- a/mm/util.c +++ b/mm/util.c @@ -382,7 +382,8 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (ret || size <= PAGE_SIZE) return ret; - return __vmalloc_node_flags(size, node, flags); + return __vmalloc_node_flags_caller(size, node, flags, + __builtin_return_address(0)); } EXPORT_SYMBOL(kvmalloc_node); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 194c22eccb9d..34a1c3e46ed7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1649,6 +1649,9 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); +static void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, + int node, const void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node) { @@ -1791,7 +1794,7 @@ fail: * with mm people. 
* */ -void *__vmalloc_node(unsigned long size, unsigned long align, +static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, int node, const void *caller) { @@ -1806,6 +1809,20 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); +static inline void *__vmalloc_node_flags(unsigned long size, + int node, gfp_t flags) +{ + return __vmalloc_node(size, 1, flags, PAGE_KERNEL, + node, __builtin_return_address(0)); +} + + +void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, + void *caller) +{ + return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller); +} + /** * vmalloc - allocate virtually contiguous memory * @size: allocation size -- cgit v1.2.3 From 4636e70bb0a8b871998b6841a2e4b205cf2bc863 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 12 May 2017 15:46:47 -0700 Subject: dax: prevent invalidation of mapped DAX entries Patch series "mm,dax: Fix data corruption due to mmap inconsistency", v4. This series fixes data corruption that can happen for DAX mounts when page faults race with write(2), and as a result page tables get out of sync with block mappings in the filesystem, and thus data seen through mmap is different from data seen through read(2). The series passes testing with the t_mmap_stale test program from Ross and also other mmap-related tests on a DAX filesystem. This patch (of 4): dax_invalidate_mapping_entry() currently removes DAX exceptional entries only if they are clean and unlocked. This is done via: invalidate_mapping_pages() invalidate_exceptional_entry() dax_invalidate_mapping_entry() However, for page cache pages removed in invalidate_mapping_pages() there is an additional criterion: the page must not be mapped. This is noted in the comments above invalidate_mapping_pages() and is checked in invalidate_inode_page(). For DAX entries this means that we can end up in a situation where a DAX exceptional entry, either a huge zero page or a regular DAX entry, could end up mapped but without an associated radix tree entry. This is inconsistent with the rest of the DAX code and with what happens in the page cache case. We aren't able to unmap the DAX exceptional entry because, according to its comments, invalidate_mapping_pages() isn't allowed to block, and unmap_mapping_range() takes a write lock on the mapping->i_mmap_rwsem. Since we essentially never have unmapped DAX entries to evict from the radix tree, just remove dax_invalidate_mapping_entry(). Fixes: c6dcf52c23d2 ("mm: Invalidate DAX radix tree entries only if appropriate") Link: http://lkml.kernel.org/r/20170510085419.27601-2-jack@suse.cz Signed-off-by: Ross Zwisler Signed-off-by: Jan Kara Reported-by: Jan Kara Cc: Dan Williams Cc: [4.10+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 29 ----------------------------- include/linux/dax.h | 1 - mm/truncate.c | 9 +++------ 3 files changed, 3 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index 66d79067eedf..38deebb8c86e 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -460,35 +460,6 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) return ret; } -/* - * Invalidate exceptional DAX entry if easily possible. This handles DAX - * entries for invalidate_inode_pages() so we evict the entry only if we can - * do so without blocking.
- */ -int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index) -{ - int ret = 0; - void *entry, **slot; - struct radix_tree_root *page_tree = &mapping->page_tree; - - spin_lock_irq(&mapping->tree_lock); - entry = __radix_tree_lookup(page_tree, index, NULL, &slot); - if (!entry || !radix_tree_exceptional_entry(entry) || - slot_locked(mapping, slot)) - goto out; - if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || - radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) - goto out; - radix_tree_delete(page_tree, index); - mapping->nrexceptional--; - ret = 1; -out: - spin_unlock_irq(&mapping->tree_lock); - if (ret) - dax_wake_mapping_entry_waiter(mapping, index, entry, true); - return ret; -} - /* * Invalidate exceptional DAX entry if it is clean. */ diff --git a/include/linux/dax.h b/include/linux/dax.h index d3158e74a59e..d1236d16ef00 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -63,7 +63,6 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, const struct iomap_ops *ops); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); -int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); void dax_wake_mapping_entry_waiter(struct address_space *mapping, diff --git a/mm/truncate.c b/mm/truncate.c index 83a059e8cd1d..706cff171a15 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -67,17 +67,14 @@ static void truncate_exceptional_entry(struct address_space *mapping, /* * Invalidate exceptional entry if easily possible. This handles exceptional - * entries for invalidate_inode_pages() so for DAX it evicts only unlocked and - * clean entries. + * entries for invalidate_inode_pages(). */ static int invalidate_exceptional_entry(struct address_space *mapping, pgoff_t index, void *entry) { - /* Handled by shmem itself */ - if (shmem_mapping(mapping)) + /* Handled by shmem itself, or for DAX we do nothing. */ + if (shmem_mapping(mapping) || dax_mapping(mapping)) return 1; - if (dax_mapping(mapping)) - return dax_invalidate_mapping_entry(mapping, index); clear_shadow_entry(mapping, index, entry); return 1; } -- cgit v1.2.3 From f5705aa8cfed142d980ecac12bee0d81b756479e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 13 May 2017 16:31:05 -0700 Subject: dax, xfs, ext4: compile out iomap-dax paths in the FS_DAX=n case Tetsuo reports: fs/built-in.o: In function `xfs_file_iomap_end': xfs_iomap.c:(.text+0xe0ef9): undefined reference to `put_dax' fs/built-in.o: In function `xfs_file_iomap_begin': xfs_iomap.c:(.text+0xe1a7f): undefined reference to `dax_get_by_host' make: *** [vmlinux] Error 1 $ grep DAX .config CONFIG_DAX=m # CONFIG_DEV_DAX is not set # CONFIG_FS_DAX is not set When FS_DAX=n we can/must throw away the dax code in filesystems. Implement 'fs_' versions of dax_get_by_host() and put_dax() that are nops in the FS_DAX=n case. Cc: Cc: Cc: Jan Kara Cc: "Theodore Ts'o" Cc: "Darrick J. 
Wong" Cc: Ross Zwisler Tested-by: Tony Luck Fixes: ef51042472f5 ("block, dax: move 'select DAX' from BLOCK to FS_DAX") Reported-by: Tetsuo Handa Signed-off-by: Dan Williams --- fs/ext2/inode.c | 4 ++-- fs/ext4/inode.c | 4 ++-- fs/xfs/xfs_iomap.c | 4 ++-- include/linux/dax.h | 34 +++++++++++++++++++++++++++------- 4 files changed, 33 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 26d77f9f8c12..2dcbd5698884 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -817,7 +817,7 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->bdev = bdev; iomap->offset = (u64)first_block << blkbits; if (blk_queue_dax(bdev->bd_queue)) - iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); else iomap->dax_dev = NULL; @@ -841,7 +841,7 @@ static int ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { - put_dax(iomap->dax_dev); + fs_put_dax(iomap->dax_dev); if (iomap->type == IOMAP_MAPPED && written < length && (flags & IOMAP_WRITE)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5834c4d76be8..1bd0bfa547f6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3412,7 +3412,7 @@ retry: bdev = inode->i_sb->s_bdev; iomap->bdev = bdev; if (blk_queue_dax(bdev->bd_queue)) - iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); else iomap->dax_dev = NULL; iomap->offset = first_block << blkbits; @@ -3447,7 +3447,7 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, int blkbits = inode->i_blkbits; bool truncate = false; - put_dax(iomap->dax_dev); + fs_put_dax(iomap->dax_dev); if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) return 0; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index a63f61c256bd..94e5bdf7304c 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1068,7 +1068,7 @@ xfs_file_iomap_begin( /* optionally associate a dax device with the iomap bdev */ bdev = iomap->bdev; if (blk_queue_dax(bdev->bd_queue)) - iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); else iomap->dax_dev = NULL; @@ -1149,7 +1149,7 @@ xfs_file_iomap_end( unsigned flags, struct iomap *iomap) { - put_dax(iomap->dax_dev); + fs_put_dax(iomap->dax_dev); if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, length, written, iomap); diff --git a/include/linux/dax.h b/include/linux/dax.h index 00ebac854bb7..5ec1f6c47716 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -18,6 +18,20 @@ struct dax_operations { void **, pfn_t *); }; +#if IS_ENABLED(CONFIG_DAX) +struct dax_device *dax_get_by_host(const char *host); +void put_dax(struct dax_device *dax_dev); +#else +static inline struct dax_device *dax_get_by_host(const char *host) +{ + return NULL; +} + +static inline void put_dax(struct dax_device *dax_dev) +{ +} +#endif + int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); #if IS_ENABLED(CONFIG_FS_DAX) int __bdev_dax_supported(struct super_block *sb, int blocksize); @@ -25,23 +39,29 @@ static inline int bdev_dax_supported(struct super_block *sb, int blocksize) { return __bdev_dax_supported(sb, blocksize); } + +static inline struct dax_device *fs_dax_get_by_host(const char *host) +{ + return dax_get_by_host(host); +} + 
+static inline void fs_put_dax(struct dax_device *dax_dev) +{ + put_dax(dax_dev); +} + #else static inline int bdev_dax_supported(struct super_block *sb, int blocksize) { return -EOPNOTSUPP; } -#endif -#if IS_ENABLED(CONFIG_DAX) -struct dax_device *dax_get_by_host(const char *host); -void put_dax(struct dax_device *dax_dev); -#else -static inline struct dax_device *dax_get_by_host(const char *host) +static inline struct dax_device *fs_dax_get_by_host(const char *host) { return NULL; } -static inline void put_dax(struct dax_device *dax_dev) +static inline void fs_put_dax(struct dax_device *dax_dev) { } #endif -- cgit v1.2.3 From 508541146af18e43072e41a31aa62fac2b01aac1 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 25 Apr 2017 10:39:57 +0300 Subject: net/mlx5: Use underlay QPN from the root name space Root flow table is dynamically changed by the underlying flow steering layer, and IPoIB/ULPs have no idea what will be the root flow table in the future, hence we need a dynamic infrastructure to move Underlay QPs with the root flow table. Fixes: b3ba51498bdd ("net/mlx5: Refactor create flow table method to accept underlay QP") Signed-off-by: Erez Shitrit Signed-off-by: Maor Gottlieb Signed-off-by: Yishai Hadas Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_fs.c | 5 ++--- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 9 +++----- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h | 3 ++- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 25 +++++++++++++++++++---- drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/ipoib.c | 7 +++++-- include/linux/mlx5/fs.h | 4 +++- 8 files changed, 38 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 0099a3e397bc..2fd044b23875 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -1003,7 +1003,7 @@ int mlx5e_create_direct_tirs(struct mlx5e_priv *priv); void mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv); void mlx5e_destroy_rqt(struct mlx5e_priv *priv, struct mlx5e_rqt *rqt); -int mlx5e_create_ttc_table(struct mlx5e_priv *priv, u32 underlay_qpn); +int mlx5e_create_ttc_table(struct mlx5e_priv *priv); void mlx5e_destroy_ttc_table(struct mlx5e_priv *priv); int mlx5e_create_tis(struct mlx5_core_dev *mdev, int tc, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c index 576d6787b484..53ed58320a24 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -800,7 +800,7 @@ void mlx5e_destroy_ttc_table(struct mlx5e_priv *priv) mlx5e_destroy_flow_table(&ttc->ft); } -int mlx5e_create_ttc_table(struct mlx5e_priv *priv, u32 underlay_qpn) +int mlx5e_create_ttc_table(struct mlx5e_priv *priv) { struct mlx5e_ttc_table *ttc = &priv->fs.ttc; struct mlx5_flow_table_attr ft_attr = {}; @@ -810,7 +810,6 @@ int mlx5e_create_ttc_table(struct mlx5e_priv *priv, u32 underlay_qpn) ft_attr.max_fte = MLX5E_TTC_TABLE_SIZE; ft_attr.level = MLX5E_TTC_FT_LEVEL; ft_attr.prio = MLX5E_NIC_PRIO; - ft_attr.underlay_qpn = underlay_qpn; ft->t = mlx5_create_flow_table(priv->fs.ns, &ft_attr); if (IS_ERR(ft->t)) { @@ -1147,7 +1146,7 @@ int mlx5e_create_flow_steering(struct mlx5e_priv *priv) priv->netdev->hw_features &= ~NETIF_F_NTUPLE; } - err = 
mlx5e_create_ttc_table(priv, 0); + err = mlx5e_create_ttc_table(priv); if (err) { netdev_err(priv->netdev, "Failed to create ttc table, err=%d\n", err); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 19e3d2fc2099..fcec7bedd3cd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -40,28 +40,25 @@ #include "eswitch.h" int mlx5_cmd_update_root_ft(struct mlx5_core_dev *dev, - struct mlx5_flow_table *ft) + struct mlx5_flow_table *ft, u32 underlay_qpn) { u32 in[MLX5_ST_SZ_DW(set_flow_table_root_in)] = {0}; u32 out[MLX5_ST_SZ_DW(set_flow_table_root_out)] = {0}; if ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) && - ft->underlay_qpn == 0) + underlay_qpn == 0) return 0; MLX5_SET(set_flow_table_root_in, in, opcode, MLX5_CMD_OP_SET_FLOW_TABLE_ROOT); MLX5_SET(set_flow_table_root_in, in, table_type, ft->type); MLX5_SET(set_flow_table_root_in, in, table_id, ft->id); + MLX5_SET(set_flow_table_root_in, in, underlay_qpn, underlay_qpn); if (ft->vport) { MLX5_SET(set_flow_table_root_in, in, vport_number, ft->vport); MLX5_SET(set_flow_table_root_in, in, other_vport, 1); } - if ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) && - ft->underlay_qpn != 0) - MLX5_SET(set_flow_table_root_in, in, underlay_qpn, ft->underlay_qpn); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h index 8fad80688536..0f98a7cf4877 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.h @@ -71,7 +71,8 @@ int mlx5_cmd_delete_fte(struct mlx5_core_dev *dev, unsigned int index); int mlx5_cmd_update_root_ft(struct mlx5_core_dev *dev, - struct mlx5_flow_table *ft); + struct mlx5_flow_table *ft, + u32 underlay_qpn); int mlx5_cmd_fc_alloc(struct mlx5_core_dev *dev, u16 *id); int mlx5_cmd_fc_free(struct mlx5_core_dev *dev, u16 id); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index b8a176503d38..0e487e8ca634 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -650,7 +650,7 @@ static int update_root_ft_create(struct mlx5_flow_table *ft, struct fs_prio if (ft->level >= min_level) return 0; - err = mlx5_cmd_update_root_ft(root->dev, ft); + err = mlx5_cmd_update_root_ft(root->dev, ft, root->underlay_qpn); if (err) mlx5_core_warn(root->dev, "Update root flow table of id=%u failed\n", ft->id); @@ -818,8 +818,6 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa goto unlock_root; } - ft->underlay_qpn = ft_attr->underlay_qpn; - tree_init_node(&ft->node, 1, del_flow_table); log_table_sz = ft->max_fte ? 
ilog2(ft->max_fte) : 0; next_ft = find_next_chained_ft(fs_prio); @@ -1489,7 +1487,8 @@ static int update_root_ft_destroy(struct mlx5_flow_table *ft) new_root_ft = find_next_ft(ft); if (new_root_ft) { - int err = mlx5_cmd_update_root_ft(root->dev, new_root_ft); + int err = mlx5_cmd_update_root_ft(root->dev, new_root_ft, + root->underlay_qpn); if (err) { mlx5_core_warn(root->dev, "Update root flow table of id=%u failed\n", @@ -2062,3 +2061,21 @@ err: mlx5_cleanup_fs(dev); return err; } + +int mlx5_fs_add_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn) +{ + struct mlx5_flow_root_namespace *root = dev->priv.steering->root_ns; + + root->underlay_qpn = underlay_qpn; + return 0; +} +EXPORT_SYMBOL(mlx5_fs_add_rx_underlay_qpn); + +int mlx5_fs_remove_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn) +{ + struct mlx5_flow_root_namespace *root = dev->priv.steering->root_ns; + + root->underlay_qpn = 0; + return 0; +} +EXPORT_SYMBOL(mlx5_fs_remove_rx_underlay_qpn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 81eafc7b9dd9..990acee6fb09 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -118,7 +118,6 @@ struct mlx5_flow_table { /* FWD rules that point on this flow table */ struct list_head fwd_rules; u32 flags; - u32 underlay_qpn; }; struct mlx5_fc_cache { @@ -195,6 +194,7 @@ struct mlx5_flow_root_namespace { struct mlx5_flow_table *root_ft; /* Should be held when chaining flow tables */ struct mutex chain_lock; + u32 underlay_qpn; }; int mlx5_init_fc_stats(struct mlx5_core_dev *dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib.c index 56bff3540954..cc1858752e70 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib.c @@ -160,6 +160,8 @@ out: static void mlx5i_destroy_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp) { + mlx5_fs_remove_rx_underlay_qpn(mdev, qp->qpn); + mlx5_core_destroy_qp(mdev, qp); } @@ -174,6 +176,8 @@ static int mlx5i_init_tx(struct mlx5e_priv *priv) return err; } + mlx5_fs_add_rx_underlay_qpn(priv->mdev, ipriv->qp.qpn); + err = mlx5e_create_tis(priv->mdev, 0 /* tc */, ipriv->qp.qpn, &priv->tisn[0]); if (err) { mlx5_core_warn(priv->mdev, "create tis failed, %d\n", err); @@ -193,7 +197,6 @@ static void mlx5i_cleanup_tx(struct mlx5e_priv *priv) static int mlx5i_create_flow_steering(struct mlx5e_priv *priv) { - struct mlx5i_priv *ipriv = priv->ppriv; int err; priv->fs.ns = mlx5_get_flow_namespace(priv->mdev, @@ -209,7 +212,7 @@ static int mlx5i_create_flow_steering(struct mlx5e_priv *priv) priv->netdev->hw_features &= ~NETIF_F_NTUPLE; } - err = mlx5e_create_ttc_table(priv, ipriv->qp.qpn); + err = mlx5e_create_ttc_table(priv); if (err) { netdev_err(priv->netdev, "Failed to create ttc table, err=%d\n", err); diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 1b166d2e19c5..b25e7baa273e 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -109,7 +109,6 @@ struct mlx5_flow_table_attr { int max_fte; u32 level; u32 flags; - u32 underlay_qpn; }; struct mlx5_flow_table * @@ -167,4 +166,7 @@ struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging); void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter); void mlx5_fc_query_cached(struct mlx5_fc *counter, u64 *bytes, u64 *packets, u64 *lastuse); +int 
mlx5_fs_add_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn); +int mlx5_fs_remove_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn); + #endif -- cgit v1.2.3 From 324318f0248c31be8a08984146e7e4dd7cdd091d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 9 May 2017 16:17:37 -0400 Subject: netfilter: xtables: zero padding in data_to_user When looking up an iptables rule, the iptables binary compares the aligned match and target data (XT_ALIGN). In some cases this can exceed the actual data size to include padding bytes. Before commit f77bc5b23fb1 ("iptables: use match, target and data copy_to_user helpers") the malloc()ed bytes were overwritten by the kernel with kzalloced contents, zeroing the padding and making the comparison succeed. After this patch, the kernel copies and clears only data, leaving the padding bytes undefined. Extend the clear operation from data size to aligned data size to include the padding bytes, if any. Padding bytes can be observed in both match and target, and the bug triggered, by issuing a rule with match icmp and target ACCEPT: iptables -t mangle -A INPUT -i lo -p icmp --icmp-type 1 -j ACCEPT iptables -t mangle -D INPUT -i lo -p icmp --icmp-type 1 -j ACCEPT Fixes: f77bc5b23fb1 ("iptables: use match, target and data copy_to_user helpers") Reported-by: Paul Moore Reported-by: Richard Guy Briggs Signed-off-by: Willem de Bruijn Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 2 +- net/bridge/netfilter/ebtables.c | 9 ++++++--- net/netfilter/x_tables.c | 9 ++++++--- 3 files changed, 13 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index be378cf47fcc..b3044c2c62cb 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -294,7 +294,7 @@ int xt_match_to_user(const struct xt_entry_match *m, int xt_target_to_user(const struct xt_entry_target *t, struct xt_entry_target __user *u); int xt_data_to_user(void __user *dst, const void *src, - int usersize, int size); + int usersize, int size, int aligned_size); void *xt_copy_counters_from_user(const void __user *user, unsigned int len, struct xt_counters_info *info, bool compat); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 9ec0c9f908fa..9c6e619f452b 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1373,7 +1373,8 @@ static inline int ebt_obj_to_user(char __user *um, const char *_name, strlcpy(name, _name, sizeof(name)); if (copy_to_user(um, name, EBT_FUNCTION_MAXNAMELEN) || put_user(datasize, (int __user *)(um + EBT_FUNCTION_MAXNAMELEN)) || - xt_data_to_user(um + entrysize, data, usersize, datasize)) + xt_data_to_user(um + entrysize, data, usersize, datasize, + XT_ALIGN(datasize))) return -EFAULT; return 0; @@ -1658,7 +1659,8 @@ static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr, if (match->compat_to_user(cm->data, m->data)) return -EFAULT; } else { - if (xt_data_to_user(cm->data, m->data, match->usersize, msize)) + if (xt_data_to_user(cm->data, m->data, match->usersize, msize, + COMPAT_XT_ALIGN(msize))) return -EFAULT; } @@ -1687,7 +1689,8 @@ static int compat_target_to_user(struct ebt_entry_target *t, if (target->compat_to_user(cm->data, t->data)) return -EFAULT; } else { - if (xt_data_to_user(cm->data, t->data, target->usersize, tsize)) + if (xt_data_to_user(cm->data, t->data, target->usersize, tsize, + COMPAT_XT_ALIGN(tsize))) return 
-EFAULT; } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 8876b7da6884..d17769599c10 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -283,12 +283,13 @@ static int xt_obj_to_user(u16 __user *psize, u16 size, &U->u.user.revision, K->u.kernel.TYPE->revision) int xt_data_to_user(void __user *dst, const void *src, - int usersize, int size) + int usersize, int size, int aligned_size) { usersize = usersize ? : size; if (copy_to_user(dst, src, usersize)) return -EFAULT; - if (usersize != size && clear_user(dst + usersize, size - usersize)) + if (usersize != aligned_size && + clear_user(dst + usersize, aligned_size - usersize)) return -EFAULT; return 0; @@ -298,7 +299,9 @@ EXPORT_SYMBOL_GPL(xt_data_to_user); #define XT_DATA_TO_USER(U, K, TYPE, C_SIZE) \ xt_data_to_user(U->data, K->data, \ K->u.kernel.TYPE->usersize, \ - C_SIZE ? : K->u.kernel.TYPE->TYPE##size) + C_SIZE ? : K->u.kernel.TYPE->TYPE##size, \ + C_SIZE ? COMPAT_XT_ALIGN(C_SIZE) : \ + XT_ALIGN(K->u.kernel.TYPE->TYPE##size)) int xt_match_to_user(const struct xt_entry_match *m, struct xt_entry_match __user *u) -- cgit v1.2.3 From c953d63548207a085abcb12a15fefc8a11ffdf0a Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Tue, 16 May 2017 09:30:18 +0800 Subject: ebtables: arpreply: Add the standard target sanity check The info->target comes from userspace and is used directly, so we need to add a sanity check to make sure it is a valid standard target, even though the ebtables tool has already checked it: the kernel needs to validate anything coming from userspace. If the target is set to a malicious value, it would break ebtables and cause a panic, because a non-standard target is treated as an offset. Add a helper function, ebt_invalid_target(), which will later replace the macro INVALID_TARGET. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge/ebtables.h | 5 +++++ net/bridge/netfilter/ebt_arpreply.c | 3 +++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index a30efb437e6d..e0cbf17af780 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -125,4 +125,9 @@ extern unsigned int ebt_do_table(struct sk_buff *skb, /* True if the target is not a standard target */ #define INVALID_TARGET (info->target < -NUM_STANDARD_TARGETS || info->target >= 0) +static inline bool ebt_invalid_target(int target) +{ + return (target < -NUM_STANDARD_TARGETS || target >= 0); +} + #endif diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c index 5929309beaa1..db85230e49c3 100644 --- a/net/bridge/netfilter/ebt_arpreply.c +++ b/net/bridge/netfilter/ebt_arpreply.c @@ -68,6 +68,9 @@ static int ebt_arpreply_tg_check(const struct xt_tgchk_param *par) if (e->ethproto != htons(ETH_P_ARP) || e->invflags & EBT_IPROTO) return -EINVAL; + if (ebt_invalid_target(info->target)) + return -EINVAL; + return 0; } -- cgit v1.2.3 From 30e7d894c1478c88d50ce94ddcdbd7f9763d9cdd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 17 May 2017 10:19:49 +0200 Subject: tracing/kprobes: Enforce kprobes teardown after testing Enabling the tracer selftest occasionally triggers the warning in text_poke(), which fires when the page to be modified is not marked reserved. The reason is that the tracer selftest installs kprobes on functions marked __init for testing.
These probes are removed after the tests, but that removal schedules the delayed kprobes_optimizer work, which will do the actual text poke. If the work is executed after the init text is freed, then the warning triggers. The bug can be reproduced reliably when the work delay is increased. Flush the optimizer work and wait for the optimizing/unoptimizing lists to become empty before returning from the kprobes tracer selftest. That ensures that all operations which were queued due to the probes removal have completed. Link: http://lkml.kernel.org/r/20170516094802.76a468bb@gandalf.local.home Signed-off-by: Thomas Gleixner Acked-by: Masami Hiramatsu Cc: stable@vger.kernel.org Fixes: 6274de498 ("kprobes: Support delayed unoptimizing") Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 3 +++ kernel/kprobes.c | 2 +- kernel/trace/trace_kprobe.c | 5 +++++ 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 30f90c1a0aaf..541df0b5b815 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -349,6 +349,9 @@ extern int proc_kprobes_optimization_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); #endif +extern void wait_for_kprobe_optimizer(void); +#else +static inline void wait_for_kprobe_optimizer(void) { } #endif /* CONFIG_OPTPROBES */ #ifdef CONFIG_KPROBES_ON_FTRACE extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 7367e0ec6f81..199243bba554 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -595,7 +595,7 @@ static void kprobe_optimizer(struct work_struct *work) } /* Wait for completing optimization and unoptimization */ -static void wait_for_kprobe_optimizer(void) +void wait_for_kprobe_optimizer(void) { mutex_lock(&kprobe_mutex); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8485f6738a87..c129fca6ec99 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1535,6 +1535,11 @@ static __init int kprobe_trace_self_tests_init(void) end: release_all_trace_kprobes(); + /* + * Wait for the optimizer work to finish. Otherwise it might fiddle + * with probes in already freed __init text. + */ + wait_for_kprobe_optimizer(); if (warn) pr_cont("NG: Some tests are failed. Please check them.\n"); else -- cgit v1.2.3 From 6a29beef9d1b16c762e469d77e28c3de3f5c3dbb Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Wed, 17 May 2017 18:32:02 +0300 Subject: usb: host: xhci-ring: don't need to clear interrupt pending for MSI enabled hcd According to the xHCI spec, Figure 30 (Interrupt Throttle Flow Diagram): if PCI Message Signaled Interrupts (MSI or MSI-X) are enabled, then the assertion of the Interrupt Pending (IP) flag generates a PCI Dword write, and the IP flag is automatically cleared by the completion of that write. So MSI-enabled HCs don't need to clear the interrupt pending bit; however, hcd->irq == 0 is not equivalent to an MSI-enabled HCD. Some dual-role controller software designs set hcd->irq to 0 to keep the HCD from requesting the interrupt, because they want to decide in software when to call usb_hcd_irq.
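For illustration, a minimal sketch of the dual-role pattern described above, assuming a hypothetical glue driver (all demo_* names are invented; usb_hcd_irq() is the real HCD core entry point) that registers its HCD with hcd->irq == 0 and forwards the shared interrupt itself:

#include <linux/interrupt.h>
#include <linux/usb/hcd.h>

struct demo_drd {			/* hypothetical glue-layer state */
	struct usb_hcd *hcd;
	bool host_mode;
};

static irqreturn_t demo_drd_irq(int irq, void *data)
{
	struct demo_drd *drd = data;

	/* software, not the HCD core, decides when to call usb_hcd_irq() */
	if (drd->host_mode)
		return usb_hcd_irq(irq, drd->hcd);

	return IRQ_NONE;		/* gadget-side handling elided */
}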
Signed-off-by: Peter Chen Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ring.c | 5 +---- drivers/usb/host/xhci.c | 5 +++-- include/linux/usb/hcd.h | 1 + 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 507ba7734b94..0830b25f9499 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -2707,12 +2707,9 @@ irqreturn_t xhci_irq(struct usb_hcd *hcd) */ status |= STS_EINT; writel(status, &xhci->op_regs->status); - /* FIXME when MSI-X is supported and there are multiple vectors */ - /* Clear the MSI-X event interrupt status */ - if (hcd->irq) { + if (!hcd->msi_enabled) { u32 irq_pending; - /* Acknowledge the PCI interrupt */ irq_pending = readl(&xhci->ir_set->irq_pending); irq_pending |= IMAN_IP; writel(irq_pending, &xhci->ir_set->irq_pending); diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 2d1310220832..71eb2991c698 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -359,9 +359,10 @@ static int xhci_try_enable_msi(struct usb_hcd *hcd) /* fall back to msi*/ ret = xhci_setup_msi(xhci); - if (!ret) - /* hcd->irq is 0, we have MSI */ + if (!ret) { + hcd->msi_enabled = 1; return 0; + } if (!pdev->irq) { xhci_err(xhci, "No msi-x/msi found and no IRQ in BIOS\n"); diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index a469999a106d..50398b69ca44 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -148,6 +148,7 @@ struct usb_hcd { unsigned rh_registered:1;/* is root hub registered? */ unsigned rh_pollable:1; /* may we poll the root hub? */ unsigned msix_enabled:1; /* driver has MSI-X enabled? */ + unsigned msi_enabled:1; /* driver has MSI enabled? */ unsigned remove_phy:1; /* auto-remove USB phy */ /* The next flag is a stopgap, to be removed when all the HCDs -- cgit v1.2.3 From 6bdc00d01e202ae11fa1cae0dacbef895434483d Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Fri, 28 Apr 2017 13:47:21 +0200 Subject: serdev: Restore serdev_device_write_buf for atomic context Starting with commit 6fe729c4bdae ("serdev: Add serdev_device_write subroutine") the function serdev_device_write_buf cannot be used in atomic context anymore (mutex_lock is sleeping). So restore the old behavior. 
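For illustration, a minimal sketch of a caller that depends on the restored behavior, assuming a hypothetical demo_* driver; serdev_device_write_buf() may be called where sleeping is not allowed, unlike serdev_device_write():

#include <linux/serdev.h>

/* e.g. from a timer callback or IRQ context */
static void demo_send_cmd(struct serdev_device *serdev)
{
	static const unsigned char cmd[] = { 0x01, 0xfe };
	int ret;

	/* returns the number of bytes queued, or a negative error */
	ret = serdev_device_write_buf(serdev, cmd, sizeof(cmd));
	if (ret < 0)
		dev_err(&serdev->dev, "demo: write_buf failed: %d\n", ret);
}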
Signed-off-by: Stefan Wahren Fixes: 6fe729c4bdae ("serdev: Add serdev_device_write subroutine") Acked-by: Rob Herring Reviewed-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serdev/core.c | 12 ++++++++++++ include/linux/serdev.h | 14 +++++++------- 2 files changed, 19 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serdev/core.c b/drivers/tty/serdev/core.c index 433de5ea9b02..f71b47334149 100644 --- a/drivers/tty/serdev/core.c +++ b/drivers/tty/serdev/core.c @@ -122,6 +122,18 @@ void serdev_device_write_wakeup(struct serdev_device *serdev) } EXPORT_SYMBOL_GPL(serdev_device_write_wakeup); +int serdev_device_write_buf(struct serdev_device *serdev, + const unsigned char *buf, size_t count) +{ + struct serdev_controller *ctrl = serdev->ctrl; + + if (!ctrl || !ctrl->ops->write_buf) + return -EINVAL; + + return ctrl->ops->write_buf(ctrl, buf, count); +} +EXPORT_SYMBOL_GPL(serdev_device_write_buf); + int serdev_device_write(struct serdev_device *serdev, const unsigned char *buf, size_t count, unsigned long timeout) diff --git a/include/linux/serdev.h b/include/linux/serdev.h index cda76c6506ca..e2a225bf716d 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -195,6 +195,7 @@ int serdev_device_open(struct serdev_device *); void serdev_device_close(struct serdev_device *); unsigned int serdev_device_set_baudrate(struct serdev_device *, unsigned int); void serdev_device_set_flow_control(struct serdev_device *, bool); +int serdev_device_write_buf(struct serdev_device *, const unsigned char *, size_t); void serdev_device_wait_until_sent(struct serdev_device *, long); int serdev_device_get_tiocm(struct serdev_device *); int serdev_device_set_tiocm(struct serdev_device *, int, int); @@ -236,6 +237,12 @@ static inline unsigned int serdev_device_set_baudrate(struct serdev_device *sdev return 0; } static inline void serdev_device_set_flow_control(struct serdev_device *sdev, bool enable) {} +static inline int serdev_device_write_buf(struct serdev_device *serdev, + const unsigned char *buf, + size_t count) +{ + return -ENODEV; +} static inline void serdev_device_wait_until_sent(struct serdev_device *sdev, long timeout) {} static inline int serdev_device_get_tiocm(struct serdev_device *serdev) { @@ -312,11 +319,4 @@ static inline struct device *serdev_tty_port_register(struct tty_port *port, static inline void serdev_tty_port_unregister(struct tty_port *port) {} #endif /* CONFIG_SERIAL_DEV_CTRL_TTYPORT */ -static inline int serdev_device_write_buf(struct serdev_device *serdev, - const unsigned char *data, - size_t count) -{ - return serdev_device_write(serdev, data, count, 0); -} - #endif /*_LINUX_SERDEV_H */ -- cgit v1.2.3 From 8cde11b2baa1d02eb2eb955dfd47d9f2a12f12cf Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 18 May 2017 17:33:00 +0200 Subject: tty/serdev: add serdev registration interface Add a new interface for registering a serdev controller and clients, and a helper function to deregister serdev devices (or a tty device) that were previously registered using the new interface. Once every driver currently using the tty_port_register_device() helpers has been vetted and converted to use the new serdev registration interface (at least for deregistration), we can move serdev registration to the current helpers and get rid of the serdev-specific functions.
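For illustration, a minimal sketch of how a tty driver might adopt the new interface, with hypothetical demo_* wrappers around the helpers added below:

#include <linux/err.h>
#include <linux/tty.h>

static int demo_attach_port(struct tty_port *port, struct tty_driver *drv,
			    unsigned int idx, struct device *parent)
{
	struct device *dev;

	/* registers a serdev controller if the parent has serdev clients,
	 * otherwise falls back to a regular tty character device
	 */
	dev = tty_port_register_device_serdev(port, drv, idx, parent);
	if (IS_ERR(dev))
		return PTR_ERR(dev);

	return 0;
}

static void demo_detach_port(struct tty_port *port, struct tty_driver *drv,
			     unsigned int idx)
{
	/* handles both the tty and the serdev case */
	tty_port_unregister_device(port, drv, idx);
}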
Reviewed-by: Rob Herring Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serdev/serdev-ttyport.c | 6 ++- drivers/tty/tty_port.c | 75 +++++++++++++++++++++++++++++++++++++ include/linux/serdev.h | 7 +++- include/linux/tty.h | 9 +++++ 4 files changed, 93 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serdev/serdev-ttyport.c b/drivers/tty/serdev/serdev-ttyport.c index 013efffd2e82..d0a021c93986 100644 --- a/drivers/tty/serdev/serdev-ttyport.c +++ b/drivers/tty/serdev/serdev-ttyport.c @@ -250,16 +250,18 @@ err_reset_data: return ERR_PTR(ret); } -void serdev_tty_port_unregister(struct tty_port *port) +int serdev_tty_port_unregister(struct tty_port *port) { struct serdev_controller *ctrl = port->client_data; struct serport *serport = serdev_controller_get_drvdata(ctrl); if (!serport) - return; + return -ENODEV; serdev_controller_remove(ctrl); port->client_ops = NULL; port->client_data = NULL; serdev_controller_put(ctrl); + + return 0; } diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c index 88dac3b79369..4fb3165384c4 100644 --- a/drivers/tty/tty_port.c +++ b/drivers/tty/tty_port.c @@ -16,6 +16,7 @@ #include #include #include +#include static int tty_port_default_receive_buf(struct tty_port *port, const unsigned char *p, @@ -136,6 +137,80 @@ struct device *tty_port_register_device_attr(struct tty_port *port, } EXPORT_SYMBOL_GPL(tty_port_register_device_attr); +/** + * tty_port_register_device_attr_serdev - register tty or serdev device + * @port: tty_port of the device + * @driver: tty_driver for this device + * @index: index of the tty + * @device: parent if exists, otherwise NULL + * @drvdata: driver data for the device + * @attr_grp: attribute group for the device + * + * Register a serdev or tty device depending on if the parent device has any + * defined serdev clients or not. + */ +struct device *tty_port_register_device_attr_serdev(struct tty_port *port, + struct tty_driver *driver, unsigned index, + struct device *device, void *drvdata, + const struct attribute_group **attr_grp) +{ + struct device *dev; + + tty_port_link_device(port, driver, index); + + dev = serdev_tty_port_register(port, device, driver, index); + if (PTR_ERR(dev) != -ENODEV) { + /* Skip creating cdev if we registered a serdev device */ + return dev; + } + + return tty_register_device_attr(driver, index, device, drvdata, + attr_grp); +} +EXPORT_SYMBOL_GPL(tty_port_register_device_attr_serdev); + +/** + * tty_port_register_device_serdev - register tty or serdev device + * @port: tty_port of the device + * @driver: tty_driver for this device + * @index: index of the tty + * @device: parent if exists, otherwise NULL + * + * Register a serdev or tty device depending on if the parent device has any + * defined serdev clients or not. + */ +struct device *tty_port_register_device_serdev(struct tty_port *port, + struct tty_driver *driver, unsigned index, + struct device *device) +{ + return tty_port_register_device_attr_serdev(port, driver, index, + device, NULL, NULL); +} +EXPORT_SYMBOL_GPL(tty_port_register_device_serdev); + +/** + * tty_port_unregister_device - deregister a tty or serdev device + * @port: tty_port of the device + * @driver: tty_driver for this device + * @index: index of the tty + * + * If a tty or serdev device is registered with a call to + * tty_port_register_device_serdev() then this function must be called when + * the device is gone. 
+ */ +void tty_port_unregister_device(struct tty_port *port, + struct tty_driver *driver, unsigned index) +{ + int ret; + + ret = serdev_tty_port_unregister(port); + if (ret == 0) + return; + + tty_unregister_device(driver, index); +} +EXPORT_SYMBOL_GPL(tty_port_unregister_device); + int tty_port_alloc_xmit_buf(struct tty_port *port) { /* We may sleep in get_zeroed_page() */ diff --git a/include/linux/serdev.h b/include/linux/serdev.h index e2a225bf716d..e69402d4a8ae 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -308,7 +308,7 @@ struct tty_driver; struct device *serdev_tty_port_register(struct tty_port *port, struct device *parent, struct tty_driver *drv, int idx); -void serdev_tty_port_unregister(struct tty_port *port); +int serdev_tty_port_unregister(struct tty_port *port); #else static inline struct device *serdev_tty_port_register(struct tty_port *port, struct device *parent, @@ -316,7 +316,10 @@ static inline struct device *serdev_tty_port_register(struct tty_port *port, { return ERR_PTR(-ENODEV); } -static inline void serdev_tty_port_unregister(struct tty_port *port) {} +static inline int serdev_tty_port_unregister(struct tty_port *port) +{ + return -ENODEV; +} #endif /* CONFIG_SERIAL_DEV_CTRL_TTYPORT */ #endif /*_LINUX_SERDEV_H */ diff --git a/include/linux/tty.h b/include/linux/tty.h index d07cd2105a6c..eccb4ec30a8a 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -558,6 +558,15 @@ extern struct device *tty_port_register_device_attr(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp); +extern struct device *tty_port_register_device_serdev(struct tty_port *port, + struct tty_driver *driver, unsigned index, + struct device *device); +extern struct device *tty_port_register_device_attr_serdev(struct tty_port *port, + struct tty_driver *driver, unsigned index, + struct device *device, void *drvdata, + const struct attribute_group **attr_grp); +extern void tty_port_unregister_device(struct tty_port *port, + struct tty_driver *driver, unsigned index); extern int tty_port_alloc_xmit_buf(struct tty_port *port); extern void tty_port_free_xmit_buf(struct tty_port *port); extern void tty_port_destroy(struct tty_port *port); -- cgit v1.2.3 From 4123109050a869a8871e58a50f28f383d41e49ad Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 5 May 2017 16:13:02 -0700 Subject: nvme-fc: correct port role bits FC Port roles is a bit mask, not individual values. Correct nvme definitions to unique bits. Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/nvme-fc-driver.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 0db37158a61d..12e344b5b77f 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -27,8 +27,8 @@ /* FC Port role bitmask - can merge with FC Port Roles in fc transport */ #define FC_PORT_ROLE_NVME_INITIATOR 0x10 -#define FC_PORT_ROLE_NVME_TARGET 0x11 -#define FC_PORT_ROLE_NVME_DISCOVERY 0x12 +#define FC_PORT_ROLE_NVME_TARGET 0x20 +#define FC_PORT_ROLE_NVME_DISCOVERY 0x40 /** -- cgit v1.2.3 From 4b8ba5fa525bc8bdaaed2a5c5433f0f2008d7bc5 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 25 Apr 2017 16:23:09 -0700 Subject: nvmet-fc: remove target cpu scheduling flag Remove NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED. It's unnecessary. 
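For illustration, a sketch of what an LLDD's target template advertises after this removal (hypothetical demo_ template; mandatory ops and sizing fields are omitted, so this is illustrative rather than a complete template):

#include <linux/nvme-fc-driver.h>

static struct nvmet_fc_target_template demo_tgttemplate = {
	/* ... ops and sizing fields elided ... */
	.target_features = NVMET_FCTGTFEAT_CMD_IN_ISR |
			   NVMET_FCTGTFEAT_OPDONE_IN_ISR,
};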
Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/fc.c | 4 +--- drivers/nvme/target/fcloop.c | 1 - drivers/scsi/lpfc/lpfc_nvmet.c | 1 - include/linux/nvme-fc-driver.h | 12 ++---------- 4 files changed, 3 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 62eba29c85fb..2006fae61980 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -517,9 +517,7 @@ nvmet_fc_queue_to_cpu(struct nvmet_fc_tgtport *tgtport, int qid) { int cpu, idx, cnt; - if (!(tgtport->ops->target_features & - NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED) || - tgtport->ops->max_hw_queues == 1) + if (tgtport->ops->max_hw_queues == 1) return WORK_CPU_UNBOUND; /* Simple cpu selection based on qid modulo active cpu count */ diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 15551ef79c8c..294a6611fb24 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -698,7 +698,6 @@ static struct nvmet_fc_target_template tgttemplate = { .dma_boundary = FCLOOP_DMABOUND_4G, /* optional features */ .target_features = NVMET_FCTGTFEAT_CMD_IN_ISR | - NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED | NVMET_FCTGTFEAT_OPDONE_IN_ISR, /* sizes of additional private data for data structures */ .target_priv_sz = sizeof(struct fcloop_tport), diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c index 94434e621c33..0488580eea12 100644 --- a/drivers/scsi/lpfc/lpfc_nvmet.c +++ b/drivers/scsi/lpfc/lpfc_nvmet.c @@ -764,7 +764,6 @@ lpfc_nvmet_create_targetport(struct lpfc_hba *phba) lpfc_tgttemplate.max_sgl_segments = phba->cfg_nvme_seg_cnt + 1; lpfc_tgttemplate.max_hw_queues = phba->cfg_nvme_io_channel; lpfc_tgttemplate.target_features = NVMET_FCTGTFEAT_READDATA_RSP | - NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED | NVMET_FCTGTFEAT_CMD_IN_ISR | NVMET_FCTGTFEAT_OPDONE_IN_ISR; diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 12e344b5b77f..6c8c5d8041b7 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -642,15 +642,7 @@ enum { * sequence in one LLDD operation. Errors during Data * sequence transmit must not allow RSP sequence to be sent. */ - NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED = (1 << 1), - /* Bit 1: When 0, the LLDD will deliver FCP CMD - * on the CPU it should be affinitized to. Thus work will - * be scheduled on the cpu received on. When 1, the LLDD - * may not deliver the CMD on the CPU it should be worked - * on. The transport should pick a cpu to schedule the work - * on. - */ - NVMET_FCTGTFEAT_CMD_IN_ISR = (1 << 2), + NVMET_FCTGTFEAT_CMD_IN_ISR = (1 << 1), /* Bit 2: When 0, the LLDD is calling the cmd rcv handler * in a non-isr context, allowing the transport to finish * op completion in the calling context. When 1, the LLDD @@ -658,7 +650,7 @@ enum { * requiring the transport to transition to a workqueue * for op completion. */ - NVMET_FCTGTFEAT_OPDONE_IN_ISR = (1 << 3), + NVMET_FCTGTFEAT_OPDONE_IN_ISR = (1 << 2), /* Bit 3: When 0, the LLDD is calling the op done handler * in a non-isr context, allowing the transport to finish * op completion in the calling context. 
When 1, the LLDD -- cgit v1.2.3 From c2372c20425bd75a5527b3e2204059762190f6ca Mon Sep 17 00:00:00 2001 From: Jan Glauber Date: Mon, 22 May 2017 13:09:20 +0200 Subject: of/platform: Make of_platform_device_destroy globally visible of_platform_device_destroy() is the counterpart to of_platform_device_create(), which is a non-static function. After creating a platform device it might be necessary to destroy it to deal with -EPROBE_DEFER, where a repeated of_platform_device_create call would fail otherwise. Therefore also make of_platform_device_destroy globally visible. Signed-off-by: Jan Glauber Acked-by: Rob Herring Signed-off-by: Ulf Hansson --- drivers/of/platform.c | 3 ++- include/linux/of_platform.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/of/platform.c b/drivers/of/platform.c index 71fecc2debfc..703a42118ffc 100644 --- a/drivers/of/platform.c +++ b/drivers/of/platform.c @@ -523,7 +523,7 @@ static int __init of_platform_default_populate_init(void) arch_initcall_sync(of_platform_default_populate_init); #endif -static int of_platform_device_destroy(struct device *dev, void *data) +int of_platform_device_destroy(struct device *dev, void *data) { /* Do not touch devices not populated from the device tree */ if (!dev->of_node || !of_node_check_flag(dev->of_node, OF_POPULATED)) @@ -544,6 +544,7 @@ static int of_platform_device_destroy(struct device *dev, void *data) of_node_clear_flag(dev->of_node, OF_POPULATED_BUS); return 0; } +EXPORT_SYMBOL_GPL(of_platform_device_destroy); /** * of_platform_depopulate() - Remove devices populated from device tree diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index dc8224ae28d5..e0d1946270f3 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -64,6 +64,7 @@ extern struct platform_device *of_platform_device_create(struct device_node *np, const char *bus_id, struct device *parent); +extern int of_platform_device_destroy(struct device *dev, void *data); extern int of_platform_bus_probe(struct device_node *root, const struct of_device_id *matches, struct device *parent); -- cgit v1.2.3 From 7254a50a5db40ca6739ddf37e0a45e6912532b2c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 22 May 2017 23:05:05 +0800 Subject: blk-mq: remove blk_mq_abort_requeue_list() No one uses it any more, so remove it.
Reviewed-by: Keith Busch Reviewed-by: Johannes Thumshirn Signed-off-by: Ming Lei Signed-off-by: Christoph Hellwig --- block/blk-mq.c | 19 ------------------- include/linux/blk-mq.h | 1 - 2 files changed, 20 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index a69ad122ed66..f2224ffd225d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -628,25 +628,6 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q, } EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); -void blk_mq_abort_requeue_list(struct request_queue *q) -{ - unsigned long flags; - LIST_HEAD(rq_list); - - spin_lock_irqsave(&q->requeue_lock, flags); - list_splice_init(&q->requeue_list, &rq_list); - spin_unlock_irqrestore(&q->requeue_lock, flags); - - while (!list_empty(&rq_list)) { - struct request *rq; - - rq = list_first_entry(&rq_list, struct request, queuelist); - list_del_init(&rq->queuelist); - blk_mq_end_request(rq, -EIO); - } -} -EXPORT_SYMBOL(blk_mq_abort_requeue_list); - struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { if (tag < tags->nr_tags) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c47aa248c640..fcd641032f8d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -238,7 +238,6 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); -void blk_mq_abort_requeue_list(struct request_queue *q); void blk_mq_complete_request(struct request *rq); bool blk_mq_queue_stopped(struct request_queue *q); -- cgit v1.2.3 From c70d9d809fdeecedb96972457ee45c49a232d97f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 22 May 2017 15:40:12 -0500 Subject: ptrace: Properly initialize ptracer_cred on fork When I introduced ptracer_cred I failed to consider the weirdness of fork where the task_struct copies the old value by default. This winds up leaving ptracer_cred set even when a process forks and the child process does not wind up being ptraced. Because ptracer_cred is not set on non-ptraced processes whose parents were ptraced this has broken the ability of the enlightenment window manager to start setuid children. Fix this by properly initializing ptracer_cred in ptrace_init_task This must be done with a little bit of care to preserve the current value of ptracer_cred when ptrace carries through fork. Re-reading the ptracer_cred from the ptracing process at this point is inconsistent with how PT_PTRACE_CAP has been maintained all of these years. Tested-by: Takashi Iwai Fixes: 64b875f7ac8a ("ptrace: Capture the ptracer's creds not PT_PTRACE_CAP") Signed-off-by: "Eric W. 
Biederman" --- include/linux/ptrace.h | 7 +++++-- kernel/ptrace.c | 20 +++++++++++++------- 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 422bc2e4cb6a..ef3eb8bbfee4 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -54,7 +54,8 @@ extern int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data); extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, - struct task_struct *new_parent); + struct task_struct *new_parent, + const struct cred *ptracer_cred); extern void __ptrace_unlink(struct task_struct *child); extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 @@ -206,7 +207,7 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) if (unlikely(ptrace) && current->ptrace) { child->ptrace = current->ptrace; - __ptrace_link(child, current->parent); + __ptrace_link(child, current->parent, current->ptracer_cred); if (child->ptrace & PT_SEIZED) task_set_jobctl_pending(child, JOBCTL_TRAP_STOP); @@ -215,6 +216,8 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) set_tsk_thread_flag(child, TIF_SIGPENDING); } + else + child->ptracer_cred = NULL; } /** diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 266ddcc1d8bb..60f356d91060 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -60,19 +60,25 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, } +void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, + const struct cred *ptracer_cred) +{ + BUG_ON(!list_empty(&child->ptrace_entry)); + list_add(&child->ptrace_entry, &new_parent->ptraced); + child->parent = new_parent; + child->ptracer_cred = get_cred(ptracer_cred); +} + /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. * * Must be called with the tasklist lock write-held. */ -void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) +static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) { - BUG_ON(!list_empty(&child->ptrace_entry)); - list_add(&child->ptrace_entry, &new_parent->ptraced); - child->parent = new_parent; rcu_read_lock(); - child->ptracer_cred = get_cred(__task_cred(new_parent)); + __ptrace_link(child, new_parent, __task_cred(new_parent)); rcu_read_unlock(); } @@ -386,7 +392,7 @@ static int ptrace_attach(struct task_struct *task, long request, flags |= PT_SEIZED; task->ptrace = flags; - __ptrace_link(task, current); + ptrace_link(task, current); /* SEIZE doesn't trap tracee on attach */ if (!seize) @@ -459,7 +465,7 @@ static int ptrace_traceme(void) */ if (!ret && !(current->real_parent->flags & PF_EXITING)) { current->ptrace = PT_PTRACED; - __ptrace_link(current, current->real_parent); + ptrace_link(current, current->real_parent); } } write_unlock_irq(&tasklist_lock); -- cgit v1.2.3 From 73dd3a4839c1d27c36d4dcc92e1ff44225ecbeb7 Mon Sep 17 00:00:00 2001 From: Mohamad Haj Yahia Date: Thu, 23 Feb 2017 11:19:36 +0200 Subject: net/mlx5: Avoid using pending command interface slots Currently, when a firmware command gets stuck or takes a long time to complete, the driver command times out and the command slot is freed and can be used for new commands; if the firmware then receives a new command on the old, still-busy slot, its behavior is unexpected and this could be harmful.
To fix this, when a driver command times out we return failure, but we don't free the command slot; instead, we wait for the firmware to explicitly respond to that command. Once all the entries are busy we will stop processing new firmware commands. Fixes: 9cba4ebcf374 ('net/mlx5: Fix potential deadlock in command mode change') Signed-off-by: Mohamad Haj Yahia Cc: kernel-team@fb.com Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 41 +++++++++++++++++++++--- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/health.c | 2 +- include/linux/mlx5/driver.h | 7 +++- 4 files changed, 44 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 5bdaf3d545b2..10d282841f5b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -774,7 +774,7 @@ static void cb_timeout_handler(struct work_struct *work) mlx5_core_warn(dev, "%s(0x%x) timeout. Will cause a leak of a command resource\n", mlx5_command_str(msg_to_opcode(ent->in)), msg_to_opcode(ent->in)); - mlx5_cmd_comp_handler(dev, 1UL << ent->idx); + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); } static void cmd_work_handler(struct work_struct *work) @@ -804,6 +804,7 @@ static void cmd_work_handler(struct work_struct *work) } cmd->ent_arr[ent->idx] = ent; + set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state); lay = get_inst(cmd, ent->idx); ent->lay = lay; memset(lay, 0, sizeof(*lay)); @@ -825,6 +826,20 @@ static void cmd_work_handler(struct work_struct *work) if (ent->callback) schedule_delayed_work(&ent->cb_timeout_work, cb_timeout); + /* Skip sending command to fw if internal error */ + if (pci_channel_offline(dev->pdev) || + dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + u8 status = 0; + u32 drv_synd; + + ent->ret = mlx5_internal_err_ret_value(dev, msg_to_opcode(ent->in), &drv_synd, &status); + MLX5_SET(mbox_out, ent->out, status, status); + MLX5_SET(mbox_out, ent->out, syndrome, drv_synd); + + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); + return; + } + /* ring doorbell after the descriptor is valid */ mlx5_core_dbg(dev, "writing 0x%x to command doorbell\n", 1 << ent->idx); wmb(); @@ -835,7 +850,7 @@ static void cmd_work_handler(struct work_struct *work) poll_timeout(ent); /* make sure we read the descriptor after ownership is SW */ rmb(); - mlx5_cmd_comp_handler(dev, 1UL << ent->idx); + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, (ent->ret == -ETIMEDOUT)); } } @@ -879,7 +894,7 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent) wait_for_completion(&ent->done); } else if (!wait_for_completion_timeout(&ent->done, timeout)) { ent->ret = -ETIMEDOUT; - mlx5_cmd_comp_handler(dev, 1UL << ent->idx); + mlx5_cmd_comp_handler(dev, 1UL << ent->idx, true); } err = ent->ret; @@ -1375,7 +1390,7 @@ static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg) } } -void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec) +void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced) { struct mlx5_cmd *cmd = &dev->cmd; struct mlx5_cmd_work_ent *ent; @@ -1395,6 +1410,19 @@ void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec) struct semaphore *sem; ent = cmd->ent_arr[i]; + + /* if we already completed the command, ignore it */ + if (!test_and_clear_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, + &ent->state)) { + /* only real completion can free the
cmd slot */ + if (!forced) { + mlx5_core_err(dev, "Command completion arrived after timeout (entry idx = %d).\n", + ent->idx); + free_ent(cmd, ent->idx); + } + continue; + } + if (ent->callback) cancel_delayed_work(&ent->cb_timeout_work); if (ent->page_queue) @@ -1417,7 +1445,10 @@ void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec) mlx5_core_dbg(dev, "command completed. ret 0x%x, delivery status %s(0x%x)\n", ent->ret, deliv_status_to_str(ent->status), ent->status); } - free_ent(cmd, ent->idx); + + /* only real completion will free the entry slot */ + if (!forced) + free_ent(cmd, ent->idx); if (ent->callback) { ds = ent->ts2 - ent->ts1; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index ea5d8d37a75c..33eae5ad2fb0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -422,7 +422,7 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr) break; case MLX5_EVENT_TYPE_CMD: - mlx5_cmd_comp_handler(dev, be32_to_cpu(eqe->data.cmd.vector)); + mlx5_cmd_comp_handler(dev, be32_to_cpu(eqe->data.cmd.vector), false); break; case MLX5_EVENT_TYPE_PORT_CHANGE: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index d0515391d33b..44f59b1d6f0f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -90,7 +90,7 @@ static void trigger_cmd_completions(struct mlx5_core_dev *dev) spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); mlx5_core_dbg(dev, "vector 0x%llx\n", vector); - mlx5_cmd_comp_handler(dev, vector); + mlx5_cmd_comp_handler(dev, vector, true); return; no_trig: diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index bcdf739ee41a..93273d9ea4d1 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -787,7 +787,12 @@ enum { typedef void (*mlx5_cmd_cbk_t)(int status, void *context); +enum { + MLX5_CMD_ENT_STATE_PENDING_COMP, +}; + struct mlx5_cmd_work_ent { + unsigned long state; struct mlx5_cmd_msg *in; struct mlx5_cmd_msg *out; void *uout; @@ -976,7 +981,7 @@ void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn); void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type); void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn); -void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec); +void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced); void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type); int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, int nent, u64 mask, const char *name, -- cgit v1.2.3 From 7f65b1f5adc5f8496ca8bec4947de66fefe36220 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Mon, 22 May 2017 14:50:30 +0200 Subject: cdc-ether: divorce initialisation with a filter reset and a generic method Some devices need their multicast filter reset, but that same reset crashes others, so the two methods need to be separated. Signed-off-by: Oliver Neukum Reported-by: "Ridgway, Keith" Signed-off-by: David S.
Miller --- drivers/net/usb/cdc_ether.c | 31 ++++++++++++++++++++++++------- include/linux/usb/usbnet.h | 1 + 2 files changed, 25 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index f3ae88fdf332..8ab281b478f2 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -310,6 +310,26 @@ skip: return -ENODEV; } + return 0; + +bad_desc: + dev_info(&dev->udev->dev, "bad CDC descriptors\n"); + return -ENODEV; +} +EXPORT_SYMBOL_GPL(usbnet_generic_cdc_bind); + + +/* like usbnet_generic_cdc_bind() but handles filter initialization + * correctly + */ +int usbnet_ether_cdc_bind(struct usbnet *dev, struct usb_interface *intf) +{ + int rv; + + rv = usbnet_generic_cdc_bind(dev, intf); + if (rv < 0) + goto bail_out; + /* Some devices don't initialise properly. In particular * the packet filter is not reset. There are devices that * don't do reset all the way. So the packet filter should @@ -317,13 +337,10 @@ skip: */ usbnet_cdc_update_filter(dev); - return 0; - -bad_desc: - dev_info(&dev->udev->dev, "bad CDC descriptors\n"); - return -ENODEV; +bail_out: + return rv; } -EXPORT_SYMBOL_GPL(usbnet_generic_cdc_bind); +EXPORT_SYMBOL_GPL(usbnet_ether_cdc_bind); void usbnet_cdc_unbind(struct usbnet *dev, struct usb_interface *intf) { @@ -417,7 +434,7 @@ int usbnet_cdc_bind(struct usbnet *dev, struct usb_interface *intf) BUILD_BUG_ON((sizeof(((struct usbnet *)0)->data) < sizeof(struct cdc_state))); - status = usbnet_generic_cdc_bind(dev, intf); + status = usbnet_ether_cdc_bind(dev, intf); if (status < 0) return status; diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 7dffa5624ea6..97116379db5f 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -206,6 +206,7 @@ struct cdc_state { }; extern int usbnet_generic_cdc_bind(struct usbnet *, struct usb_interface *); +extern int usbnet_ether_cdc_bind(struct usbnet *dev, struct usb_interface *intf); extern int usbnet_cdc_bind(struct usbnet *, struct usb_interface *); extern void usbnet_cdc_unbind(struct usbnet *, struct usb_interface *); extern void usbnet_cdc_status(struct usbnet *, struct urb *); -- cgit v1.2.3 From 12e8b570e732eaa5eae3a2895ba3fbcf91bde2b4 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 22 May 2017 20:13:07 +0200 Subject: mlx5: fix bug reading rss_hash_type from CQE Masks for extracting part of the Completion Queue Entry (CQE) field rss_hash_type were swapped, namely CQE_RSS_HTYPE_IP and CQE_RSS_HTYPE_L4. The bug resulted in setting skb->l4_hash even though the rss_hash_type indicated that the hash was NOT computed over the L4 (UDP or TCP) part of the packet. Added comments from the datasheet, to make it clearer what these masks are selecting. Signed-off-by: Jesper Dangaard Brouer Acked-by: Saeed Mahameed Signed-off-by: David S.
Miller --- include/linux/mlx5/device.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index dd9a263ed368..a940ec6a046c 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -787,8 +787,14 @@ enum { }; enum { - CQE_RSS_HTYPE_IP = 0x3 << 6, - CQE_RSS_HTYPE_L4 = 0x3 << 2, + CQE_RSS_HTYPE_IP = 0x3 << 2, + /* cqe->rss_hash_type[3:2] - IP destination selected for hash + * (00 = none, 01 = IPv4, 10 = IPv6, 11 = Reserved) + */ + CQE_RSS_HTYPE_L4 = 0x3 << 6, + /* cqe->rss_hash_type[7:6] - L4 destination selected for hash + * (00 = none, 01 = TCP. 10 = UDP, 11 = IPSEC.SPI + */ }; enum { -- cgit v1.2.3 From 6f4dbd149d2a151b89d1a5bbf7530ee5546c7908 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 19 May 2017 11:33:16 +0200 Subject: libceph: use kbasename() and kill ceph_file_part() Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- include/linux/ceph/ceph_debug.h | 6 +++--- net/ceph/ceph_common.c | 13 ------------- 2 files changed, 3 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h index aa2e19182d99..51c5bd64bd00 100644 --- a/include/linux/ceph/ceph_debug.h +++ b/include/linux/ceph/ceph_debug.h @@ -3,6 +3,8 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include + #ifdef CONFIG_CEPH_LIB_PRETTYDEBUG /* @@ -12,12 +14,10 @@ */ # if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) -extern const char *ceph_file_part(const char *s, int len); # define dout(fmt, ...) \ pr_debug("%.*s %12.12s:%-4d : " fmt, \ 8 - (int)sizeof(KBUILD_MODNAME), " ", \ - ceph_file_part(__FILE__, sizeof(__FILE__)), \ - __LINE__, ##__VA_ARGS__) + kbasename(__FILE__), __LINE__, ##__VA_ARGS__) # else /* faux printk call just to see any compiler warnings. */ # define dout(fmt, ...) do { \ diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4fd02831beed..47e94b560ba0 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -56,19 +56,6 @@ static const struct kernel_param_ops param_ops_supported_features = { module_param_cb(supported_features, ¶m_ops_supported_features, NULL, S_IRUGO); -/* - * find filename portion of a path (/foo/bar/baz -> baz) - */ -const char *ceph_file_part(const char *s, int len) -{ - const char *e = s + len; - - while (e != s && *(e-1) != '/') - e--; - return e; -} -EXPORT_SYMBOL(ceph_file_part); - const char *ceph_msg_type_name(int type) { switch (type) { -- cgit v1.2.3 From 4d071c3238987325b9e50e33051a40d1cce311cc Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Tue, 23 May 2017 14:18:17 -0500 Subject: PCI/PM: Add needs_resume flag to avoid suspend complete optimization Some drivers - like i915 - may not support the system suspend direct complete optimization due to differences in their runtime and system suspend sequence. Add a flag that when set resumes the device before calling the driver's system suspend handlers which effectively disables the optimization. Needed by a future patch fixing suspend/resume on i915. Suggested by Rafael. Signed-off-by: Imre Deak Signed-off-by: Bjorn Helgaas Acked-by: Rafael J. 
--- drivers/pci/pci.c | 3 ++- include/linux/pci.h | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index b01bd5bba8e6..563901cd9c06 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2144,7 +2144,8 @@ bool pci_dev_keep_suspended(struct pci_dev *pci_dev) if (!pm_runtime_suspended(dev) || pci_target_state(pci_dev) != pci_dev->current_state - || platform_pci_need_resume(pci_dev)) + || platform_pci_need_resume(pci_dev) + || (pci_dev->dev_flags & PCI_DEV_FLAGS_NEEDS_RESUME)) return false; /* diff --git a/include/linux/pci.h b/include/linux/pci.h index 33c2b0b77429..df7dd9021646 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -183,6 +183,11 @@ enum pci_dev_flags { PCI_DEV_FLAGS_BRIDGE_XLATE_ROOT = (__force pci_dev_flags_t) (1 << 9), /* Do not use FLR even if device advertises PCI_AF_CAP */ PCI_DEV_FLAGS_NO_FLR_RESET = (__force pci_dev_flags_t) (1 << 10), + /* + * Resume before calling the driver's system suspend hooks, disabling + * the direct_complete optimization. + */ + PCI_DEV_FLAGS_NEEDS_RESUME = (__force pci_dev_flags_t) (1 << 11), }; enum pci_irq_reroute_variant { -- cgit v1.2.3 From 35d2f80b07bbe03fb358afb0bdeff7437a7d67ff Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 23 May 2017 13:38:41 -0400 Subject: vlan: Fix tcp checksum offloads in Q-in-Q vlans It appears that TCP checksum offloading has been broken for Q-in-Q vlans. The behavior was exacerbated by the series commit afb0bc972b52 ("Merge branch 'stacked_vlan_tso'") that enabled acceleration features on stacked vlans. However, even without that series, it is possible to trigger this issue; it just requires a lot more specialized configuration. The root cause is the interaction between how netdev_intersect_features() works, the features actually set on the vlan devices, and HW having the ability to run checksums over longer headers. The issue starts when netdev_intersect_features() replaces NETIF_F_HW_CSUM with a combination of NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM if the HW advertises IP|IPV6-specific checksums. This happens for tagged and multi-tagged packets. However, HW that enables IP|IPV6 checksum offloading doesn't guarantee that packets with arbitrarily long headers can be checksummed. This patch disables IP|IPV6 checksums on the packet for multi-tagged packets. CC: Toshiaki Makita CC: Michal Kubecek Signed-off-by: Vladislav Yasevich Acked-by: Toshiaki Makita Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 8d5fcd6284ce..283dc2f5364d 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -614,14 +614,16 @@ static inline bool skb_vlan_tagged_multi(const struct sk_buff *skb) static inline netdev_features_t vlan_features_check(const struct sk_buff *skb, netdev_features_t features) { - if (skb_vlan_tagged_multi(skb)) - features = netdev_intersect_features(features, - NETIF_F_SG | - NETIF_F_HIGHDMA | - NETIF_F_FRAGLIST | - NETIF_F_HW_CSUM | - NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX); + if (skb_vlan_tagged_multi(skb)) { + /* In the case of multi-tagged packets, use a direct mask + * instead of using netdev_intersect_features(), to make + * sure that only devices supporting NETIF_F_HW_CSUM will + * have checksum offloading support. + */ + features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | + NETIF_F_FRAGLIST | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; + } return features; } -- cgit v1.2.3
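A sketch of where vlan_features_check() sits in the transmit path; the callback below is hypothetical, but it uses the standard ndo_features_check signature:

static netdev_features_t sketch_features_check(struct sk_buff *skb,
					       struct net_device *dev,
					       netdev_features_t features)
{
	/* Multi-tagged (Q-in-Q) frames pass through here; with the fix,
	 * a device advertising only NETIF_F_IP_CSUM/NETIF_F_IPV6_CSUM
	 * loses checksum offload for such frames instead of emitting
	 * bad checksums.
	 */
	return vlan_features_check(skb, features);
}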
From 614d0d77b49a9b131e58b77473698ab5b2c525b7 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 25 May 2017 01:05:09 +0200 Subject: bpf: add various verifier test cases This patch adds various verifier test cases: 1) A test case for the pruning issue when tracking alignment is used. 2) Various PTR_TO_MAP_VALUE_OR_NULL tests to make sure pointer arithmetic turns such a register into an UNKNOWN_VALUE type. 3) Test cases for the special treatment of LD_ABS/LD_IND to make sure the verifier doesn't break the calling convention here. The latter is needed since, e.g., the arm64 JIT uses r1-r5 for storing temporary data, so they really must be marked as NOT_INIT. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 10 ++ tools/include/linux/filter.h | 10 ++ tools/testing/selftests/bpf/test_verifier.c | 239 +++++++++++++++++++++++++++- 3 files changed, 255 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 56197f82af45..62d948f80730 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -272,6 +272,16 @@ struct bpf_prog_aux; .off = OFF, \ .imm = IMM }) +/* Unconditional jumps, goto pc + off16 */ + +#define BPF_JMP_A(OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_JA, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = 0 }) + /* Function call */ #define BPF_EMIT_CALL(FUNC) \ diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index 390d7c9685fd..4ce25d43e8e3 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -208,6 +208,16 @@ .off = OFF, \ .imm = IMM }) +/* Unconditional jumps, goto pc + off16 */ + +#define BPF_JMP_A(OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_JA, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = 0 }) + /* Function call */ #define BPF_EMIT_CALL(FUNC) \ diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 3773562056da..cabb19b1e371 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -49,6 +49,7 @@ #define MAX_NR_MAPS 4 #define F_NEEDS_EFFICIENT_UNALIGNED_ACCESS (1 << 0) +#define F_LOAD_WITH_STRICT_ALIGNMENT (1 << 1) struct bpf_test { const char *descr; @@ -2614,6 +2615,30 @@ static struct bpf_test tests[] = { .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, + { + "direct packet access: test17 (pruning, alignment)", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct __sk_buff, data)), + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct __sk_buff, data_end)), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 14), + BPF_JMP_IMM(BPF_JGT, BPF_REG_7, 1, 4), + BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), + BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, -4), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1), + BPF_JMP_A(-6), + }, + .errstr = "misaligned packet access off 2+15+-4 size 4", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .flags = F_LOAD_WITH_STRICT_ALIGNMENT, + }, { "helper access to packet: test1, valid packet_ptr range", .insns = { @@ -3340,6 +3365,70 @@ static
struct bpf_test tests[] = { .result = ACCEPT, .prog_type = BPF_PROG_TYPE_SCHED_CLS }, + { + "alu ops on ptr_to_map_value_or_null, 1", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -2), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 2), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 4 }, + .errstr = "R4 invalid mem access", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS + }, + { + "alu ops on ptr_to_map_value_or_null, 2", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), + BPF_ALU64_IMM(BPF_AND, BPF_REG_4, -1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 4 }, + .errstr = "R4 invalid mem access", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS + }, + { + "alu ops on ptr_to_map_value_or_null, 3", + .insns = { + BPF_MOV64_IMM(BPF_REG_1, 10), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), + BPF_ALU64_IMM(BPF_LSH, BPF_REG_4, 1), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 4 }, + .errstr = "R4 invalid mem access", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS + }, { "invalid memory access with multiple map_lookup_elem calls", .insns = { @@ -4937,7 +5026,149 @@ static struct bpf_test tests[] = { .fixup_map_in_map = { 3 }, .errstr = "R1 type=map_value_or_null expected=map_ptr", .result = REJECT, - } + }, + { + "ld_abs: check calling conv, r1", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, 0), + BPF_LD_ABS(BPF_W, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .errstr = "R1 !read_ok", + .result = REJECT, + }, + { + "ld_abs: check calling conv, r2", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_LD_ABS(BPF_W, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .errstr = "R2 !read_ok", + .result = REJECT, + }, + { + "ld_abs: check calling conv, r3", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_LD_ABS(BPF_W, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_3), + BPF_EXIT_INSN(), + }, + .errstr = "R3 !read_ok", + .result = REJECT, + }, + { + "ld_abs: check calling conv, r4", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_LD_ABS(BPF_W, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_4), + BPF_EXIT_INSN(), + }, + .errstr = "R4 !read_ok", + .result = REJECT, + }, + { + "ld_abs: check calling conv, r5", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_5, 0), + BPF_LD_ABS(BPF_W, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), + 
BPF_EXIT_INSN(), + }, + .errstr = "R5 !read_ok", + .result = REJECT, + }, + { + "ld_abs: check calling conv, r7", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_7, 0), + BPF_LD_ABS(BPF_W, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_7), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "ld_ind: check calling conv, r1", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_LD_IND(BPF_W, BPF_REG_1, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), + BPF_EXIT_INSN(), + }, + .errstr = "R1 !read_ok", + .result = REJECT, + }, + { + "ld_ind: check calling conv, r2", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_2, 1), + BPF_LD_IND(BPF_W, BPF_REG_2, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), + BPF_EXIT_INSN(), + }, + .errstr = "R2 !read_ok", + .result = REJECT, + }, + { + "ld_ind: check calling conv, r3", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_3, 1), + BPF_LD_IND(BPF_W, BPF_REG_3, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_3), + BPF_EXIT_INSN(), + }, + .errstr = "R3 !read_ok", + .result = REJECT, + }, + { + "ld_ind: check calling conv, r4", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_4, 1), + BPF_LD_IND(BPF_W, BPF_REG_4, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_4), + BPF_EXIT_INSN(), + }, + .errstr = "R4 !read_ok", + .result = REJECT, + }, + { + "ld_ind: check calling conv, r5", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_5, 1), + BPF_LD_IND(BPF_W, BPF_REG_5, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), + BPF_EXIT_INSN(), + }, + .errstr = "R5 !read_ok", + .result = REJECT, + }, + { + "ld_ind: check calling conv, r7", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_7, 1), + BPF_LD_IND(BPF_W, BPF_REG_7, -0x200000), + BPF_MOV64_REG(BPF_REG_0, BPF_REG_7), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) @@ -5059,9 +5290,9 @@ static void do_test_single(struct bpf_test *test, bool unpriv, do_test_fixup(test, prog, map_fds); - fd_prog = bpf_load_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER, - prog, prog_len, "GPL", 0, bpf_vlog, - sizeof(bpf_vlog)); + fd_prog = bpf_verify_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER, + prog, prog_len, test->flags & F_LOAD_WITH_STRICT_ALIGNMENT, + "GPL", 0, bpf_vlog, sizeof(bpf_vlog)); expected_ret = unpriv && test->result_unpriv != UNDEF ? test->result_unpriv : test->result; -- cgit v1.2.3 From 83b4605b0c16cde5b00c8cf192408d51eab75402 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 20 May 2017 18:59:54 +0200 Subject: PCI/msi: fix the pci_alloc_irq_vectors_affinity stub We need to return an error for any call that asks for MSI / MSI-X vectors only, so that non-trivial fallback logic can work properly. Also validate dev->irq and use the "correct" errno value based on feedback from Linus.
Signed-off-by: Christoph Hellwig Reported-by: Steven Rostedt Fixes: aff17164 ("PCI: Provide sensible IRQ vector alloc/free routines") Signed-off-by: Linus Torvalds --- include/linux/pci.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 33c2b0b77429..fc2e832d7b9c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1342,9 +1342,9 @@ pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs, unsigned int max_vecs, unsigned int flags, const struct irq_affinity *aff_desc) { - if (min_vecs > 1) - return -EINVAL; - return 1; + if ((flags & PCI_IRQ_LEGACY) && min_vecs == 1 && dev->irq) + return 1; + return -ENOSPC; } static inline void pci_free_irq_vectors(struct pci_dev *dev) -- cgit v1.2.3
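The "non-trivial fallback logic" mentioned above usually follows the pattern sketched here (hypothetical driver function, not from the patch): request MSI-X/MSI only, then fall back to a legacy IRQ. With the old stub, the first call returned 1 on kernels built without CONFIG_PCI_MSI even though no MSI vector had been set up:

static int sketch_setup_irqs(struct pci_dev *pdev)
{
	int ret;

	/* Prefer MSI-X or MSI; the fixed stub now fails this request
	 * with -ENOSPC when MSI support is compiled out ...
	 */
	ret = pci_alloc_irq_vectors(pdev, 1, 4, PCI_IRQ_MSIX | PCI_IRQ_MSI);
	if (ret < 0)
		/* ... so the legacy-IRQ fallback is actually reached. */
		ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_LEGACY);
	return ret < 0 ? ret : 0;
}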